In [1]:
import numpy as np
import pandas as pd
import h5py
import time
import os
from skimage.feature import hog
from skimage.transform import resize
from skimage import io, data, color, exposure
import matplotlib.pyplot as plt
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input
from keras import backend as K

os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [2]:
data_root = "/home/centos/suhyeok/Yelp/"

In [3]:
# hog algorithm을 이용한 feature 추출 함수
def extract_hog_features(image_path):
    image = io.imread(image_path)
    image = color.rgb2gray(image)
    image_resized = resize(image, (256, 256))
    return hog(image_resized, orientations=8,
        pixels_per_cell=(16, 16), cells_per_block=(1, 1))

In [4]:
f = h5py.File(data_root+'train_image_VGGfeatures.h5','w')
filenames = f.create_dataset('photo_id',(0,), maxshape=(None,),dtype='|S54')
feature = f.create_dataset('feature',(0,2048), maxshape = (None,2048))
f.close()

In [5]:
# train image들의 이름을 배열로
train_photos = pd.read_csv(data_root+'train_photo_to_biz_ids.csv')
train_folder = data_root+'train_photos/'
train_images = [os.path.join(train_folder, str(x)+'.jpg') for x in train_photos['photo_id']]  # get full filename

In [6]:
train_folder

'/home/centos/suhyeok/Yelp/train_photos/'

In [7]:
train_images

['/home/centos/suhyeok/Yelp/train_photos/204149.jpg',
 '/home/centos/suhyeok/Yelp/train_photos/52779.jpg',
 '/home/centos/suhyeok/Yelp/train_photos/278973.jpg',
 '/home/centos/suhyeok/Yelp/train_photos/195284.jpg',
 '/home/centos/suhyeok/Yelp/train_photos/19992.jpg',
 '/home/centos/suhyeok/Yelp/train_photos/80748.jpg',
 '/home/centos/suhyeok/Yelp/train_photos/444996.jpg',
 '/home/centos/suhyeok/Yelp/train_photos/200285.jpg',
 '/home/centos/suhyeok/Yelp/train_photos/90572.jpg',
 '/home/centos/suhyeok/Yelp/train_photos/27565.jpg',
 '/home/centos/suhyeok/Yelp/train_photos/228371.jpg',
 '/home/centos/suhyeok/Yelp/train_photos/166925.jpg',
 '/home/centos/suhyeok/Yelp/train_photos/310868.jpg',
 '/home/centos/suhyeok/Yelp/train_photos/13318.jpg',
 '/home/centos/suhyeok/Yelp/train_photos/290904.jpg',
 '/home/centos/suhyeok/Yelp/train_photos/71940.jpg',
 '/home/centos/suhyeok/Yelp/train_photos/211449.jpg',
 '/home/centos/suhyeok/Yelp/train_photos/368909.jpg',
 '/home/centos/suhyeok/Yelp/train_p

In [8]:
num_train = len(train_images)
num_train

234842

In [9]:
tic = time.time()

# Image Training
# extract_hog_features로부터 값들을 가져와 파일에 저장
for i in range(0, num_train):
    feature = extract_hog_features(train_images[i])
    num_done = i+1
    f= h5py.File(data_root+'train_image_HOGfeatures.h5','r+')
    f['photo_id'].resize((num_done,))
    f['photo_id'][i] = train_images[i]
    f['feature'].resize((num_done,feature.shape[0]))
    f['feature'][i, :] = feature
    f.close()
    if num_done%10000==0 or num_done==num_train:
        print("Train images processed: ", num_done)

toc = time.time()
print('\nFeatures extracted in %fs' % (toc - tic))

Train images processed:  10000
Train images processed:  20000
Train images processed:  30000
Train images processed:  40000
Train images processed:  50000
Train images processed:  60000
Train images processed:  70000
Train images processed:  80000
Train images processed:  90000
Train images processed:  100000
Train images processed:  110000
Train images processed:  120000
Train images processed:  130000
Train images processed:  140000
Train images processed:  150000
Train images processed:  160000
Train images processed:  170000
Train images processed:  180000
Train images processed:  190000
Train images processed:  200000
Train images processed:  210000
Train images processed:  220000
Train images processed:  230000
Train images processed:  234842

Features extracted in 6493.085762s


In [10]:
# 파일 확인
f = h5py.File(data_root+'train_image_HOGfeatures.h5','r')
print('train_image_features.h5:')
for key in f.keys():
    print(key, f[key].shape)
    
print("\nA photo:", f['photo_id'][0])
print("Its feature vector (first 10-dim): ", f['feature'][0][0:10], " ...")
f.close()

train_image_features.h5:
feature (234842, 2048)
photo_id (234842,)

A photo: b'/home/centos/suhyeok/Yelp/train_photos/204149.jpg'
Its feature vector (first 10-dim):  [0.46312022 0.33606204 0.0680592  0.07112979 0.46312022 0.42921573
 0.22294304 0.46312022 0.22793055 0.15986058]  ...


In [11]:
f = h5py.File(data_root+'test_image_HOGfeatures.h5','w')
filenames = f.create_dataset('photo_id',(0,), maxshape=(None,),dtype='|S54')
feature = f.create_dataset('feature',(0,2048), maxshape = (None,2048))
f.close()

test_photos = pd.read_csv(data_root+'test_photo_to_biz.csv')
test_folder = data_root+'test_photos/'
test_images = [os.path.join(test_folder, str(x)+'.jpg') for x in test_photos['photo_id'].unique()]

num_test = len(test_images)
print("Number of test images: ", num_test)

tic = time.time()

# Test Images feature 추출 후 저장
for i in range(0, num_test):
    feature = extract_hog_features(test_images[i])
    num_done = i+1
    f= h5py.File(data_root+'test_image_HOGfeatures.h5','r+')
    f['photo_id'].resize((num_done,))
    f['photo_id'][i] = test_images[i]
    f['feature'].resize((num_done,feature.shape[0]))
    f['feature'][i, :] = feature
    f.close()
    if num_done%20000==0 or num_done==num_test:
        print("Test images processed: ", num_done)

toc = time.time()
print('\nFeatures extracted in %fs' % (toc - tic))  

Number of test images:  237152
Test images processed:  20000
Test images processed:  40000
Test images processed:  60000
Test images processed:  80000
Test images processed:  100000
Test images processed:  120000
Test images processed:  140000
Test images processed:  160000
Test images processed:  180000
Test images processed:  200000
Test images processed:  220000
Test images processed:  237152

Features extracted in 6681.682597s


In [12]:
# feature저장한 파일 확인
f = h5py.File(data_root+'test_image_HOGfeatures.h5','r')
print('test_image_features.h5:')
for key in f.keys():
    print(key, f[key].shape)
    
print("\nA photo:", f['photo_id'][0])
print("Its feature vector (first 10-dim): ", f['feature'][0][0:10], " ...")
f.close()

test_image_features.h5:
feature (237152, 2048)
photo_id (237152,)

A photo: b'/home/centos/suhyeok/Yelp/test_photos/317818.jpg'
Its feature vector (first 10-dim):  [0.46157387 0.12918824 0.2204526  0.46157387 0.46157387 0.1404648
 0.2505573  0.46157387 0.5138341  0.17439725]  ...


In [13]:
# business와 label 분리
train_photo_to_biz = pd.read_csv(data_root+'train_photo_to_biz_ids.csv')
train_labels = pd.read_csv(data_root+'train.csv').dropna()
train_labels['labels'] = train_labels['labels'].apply(lambda x: tuple(sorted(int(t) for t in x.split())))
train_labels.set_index('business_id', inplace=True)
biz_ids = train_labels.index.unique()
print("Number of business: ", len(biz_ids) ,   "(4 business with missing labels are dropped)")

Number of business:  1996 (4 business with missing labels are dropped)


In [14]:
f = h5py.File(data_root+'train_image_HOGfeatures.h5','r')
train_image_features = np.copy(f['feature'])
f.close()

In [15]:
t= time.time()
## business 별로 feature vector와 Compute
df = pd.DataFrame(columns=['business','label','feature vector'])
index = 0
for biz in biz_ids:  
    
    label = train_labels.loc[biz]['labels']
    image_index = train_photo_to_biz[train_photo_to_biz['business_id']==biz].index.tolist()
    folder = data_root+'train_photo_folders/'  
    
    features = train_image_features[image_index]
    mean_feature =list(np.mean(features,axis=0))

    df.loc[index] = [biz, label, mean_feature]
    index+=1
    if index%1000==0:
        print("Buisness processed: ", index, "Time passed: ", "{0:.1f}".format(time.time()-t), "sec")

Buisness processed:  1000 Time passed:  3.7 sec


In [16]:
with open(data_root+"train_biz_HOGfeatures.csv",'w') as f:  
    df.to_csv(f, index=False)

In [17]:
# 파일 내용 확인
train_business = pd.read_csv(data_root+'train_biz_HOGfeatures.csv')
print(train_business.shape)
train_business[0:5]

(1996, 3)


Unnamed: 0,business,label,feature vector
0,1000,"(1, 2, 3, 4, 5, 6, 7)","[0.37202442, 0.27358377, 0.28332734, 0.3404962..."
1,1001,"(0, 1, 6, 8)","[0.35575873, 0.22162527, 0.2609377, 0.3123159,..."
2,100,"(1, 2, 4, 5, 6, 7)","[0.35883895, 0.26916155, 0.29552996, 0.3712243..."
3,1006,"(1, 2, 4, 5, 6)","[0.3719206, 0.27774164, 0.31123084, 0.3418225,..."
4,1010,"(0, 6, 8)","[0.35154918, 0.24722019, 0.29202586, 0.3061635..."


In [18]:
test_photo_to_biz = pd.read_csv(data_root+'test_photo_to_biz.csv')
biz_ids = test_photo_to_biz['business_id'].unique()

In [20]:
## Load image features
f = h5py.File(data_root+'test_image_HOGfeatures.h5','r')
image_filenames = list(np.copy(f['photo_id']))
image_filenames = [name.decode().split('/')[-1][:-4] for name in image_filenames]  #remove the full path and the str ".jpg"
image_features = np.copy(f['feature'])
f.close()
print("Number of business: ", len(biz_ids))

Number of business:  10000


In [21]:
df = pd.DataFrame(columns=['business','feature vector'])
index = 0

In [22]:
t = time.time()

# business id별로 test 파일에서 extract한 feature vector compute
for biz in biz_ids:     
    
    image_ids = test_photo_to_biz[test_photo_to_biz['business_id']==biz]['photo_id'].tolist()  
    image_index = [image_filenames.index(str(x)) for x in image_ids]
     
    folder = data_root+'test_photo_folders/'            
    features = image_features[image_index]
    mean_feature =list(np.mean(features,axis=0))

    df.loc[index] = [biz, mean_feature]
    index+=1
    if index%1000==0:
        print("Buisness processed: ", index, "Time passed: ", "{0:.1f}".format(time.time()-t), "sec")

Buisness processed:  1000 Time passed:  210.4 sec
Buisness processed:  2000 Time passed:  571.9 sec
Buisness processed:  3000 Time passed:  951.6 sec
Buisness processed:  4000 Time passed:  1400.8 sec
Buisness processed:  5000 Time passed:  1843.1 sec
Buisness processed:  6000 Time passed:  2266.0 sec
Buisness processed:  7000 Time passed:  2701.2 sec
Buisness processed:  8000 Time passed:  3126.4 sec
Buisness processed:  9000 Time passed:  3559.8 sec
Buisness processed:  10000 Time passed:  4018.2 sec


In [23]:
with open(data_root+"test_biz_HOGfeatures.csv",'w') as f:  
    df.to_csv(f, index=False)

In [24]:
# 파일 내용 확인
test_business = pd.read_csv(data_root+'test_biz_HOGfeatures.csv')
print(test_business.shape)
test_business[0:5]

(10000, 2)


Unnamed: 0,business,feature vector
0,003sg,"[0.3535326, 0.29635066, 0.30970886, 0.36620972..."
1,00er5,"[0.3510624, 0.2904297, 0.32097432, 0.36438936,..."
2,00kad,"[0.34746814, 0.3214451, 0.32141566, 0.34214893..."
3,00mc6,"[0.3389741, 0.3378496, 0.36262426, 0.37983507,..."
4,00q7x,"[0.3292041, 0.26734918, 0.35585466, 0.37005726..."


In [25]:
train_photos = pd.read_csv(data_root+'train_photo_to_biz_ids.csv')
train_photo_to_biz = pd.read_csv(data_root+'train_photo_to_biz_ids.csv', index_col='photo_id')

In [27]:
train_df = pd.read_csv(data_root+"train_biz_HOGfeatures.csv")
test_df  = pd.read_csv(data_root+"test_biz_HOGfeatures.csv")

In [28]:
y_train = train_df['label'].values
X_train = train_df['feature vector'].values
X_test = test_df['feature vector'].values

In [29]:
# 라벨 배열로 변환
def convert_label_to_array(str_label):
    str_label = str_label[1:-1]
    str_label = str_label.split(',')
    return [int(x) for x in str_label if len(x)>0]

In [30]:
# feature 배열 vector로 변환
def convert_feature_to_vector(str_feature):
    str_feature = str_feature[1:-1]
    str_feature = str_feature.split(',')
    return [float(x) for x in str_feature]

In [31]:
# 함수 호출, y_train, X_train, X_test 정의
y_train = np.array([convert_label_to_array(y) for y in train_df['label']])
X_train = np.array([convert_feature_to_vector(x) for x in train_df['feature vector']])
X_test = np.array([convert_feature_to_vector(x) for x in test_df['feature vector']])

  """Entry point for launching an IPython kernel.


In [32]:
print("X_train: ",X_train.shape)
print("y_train: ",y_train.shape)
print("X_test: ",X_test.shape)
print("train_df:")
train_df[:5]

X_train:  (1996, 2048)
y_train:  (1996,)
X_test:  (10000, 2048)
train_df:


Unnamed: 0,business,label,feature vector
0,1000,"(1, 2, 3, 4, 5, 6, 7)","[0.37202442, 0.27358377, 0.28332734, 0.3404962..."
1,1001,"(0, 1, 6, 8)","[0.35575873, 0.22162527, 0.2609377, 0.3123159,..."
2,100,"(1, 2, 4, 5, 6, 7)","[0.35883895, 0.26916155, 0.29552996, 0.3712243..."
3,1006,"(1, 2, 4, 5, 6)","[0.3719206, 0.27774164, 0.31123084, 0.3418225,..."
4,1010,"(0, 6, 8)","[0.35154918, 0.24722019, 0.29202586, 0.3061635..."


In [33]:
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

In [34]:
t=time.time()
# split하고 SVM을 이용하여 모델 구성
mlb = MultiLabelBinarizer()
y_ptrain= mlb.fit_transform(y_train)  #Convert list of labels to binary matrix

random_state = np.random.RandomState(0)
X_ptrain, X_ptest, y_ptrain, y_ptest = train_test_split(X_train, y_ptrain, test_size=.2,random_state=random_state)
classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True, gamma=5))
classifier.fit(X_ptrain, y_ptrain)

y_ppredict = classifier.predict(X_ptest)

print("Time passed: ", "{0:.1f}".format(time.time()-t), "sec")

Time passed:  134.9 sec


In [35]:
print("Samples of predicted labels (in binary matrix):\n", y_ppredict[0:3])
print("\nSamples of predicted labels:\n", mlb.inverse_transform(y_ppredict[0:3]))

Samples of predicted labels (in binary matrix):
 [[0 1 1 1 0 1 0 0 1]
 [0 1 1 1 0 1 1 0 1]
 [0 1 1 0 0 1 1 0 1]]

Samples of predicted labels:
 [(1, 2, 3, 5, 8), (1, 2, 3, 5, 6, 8), (1, 2, 5, 6, 8)]


In [36]:
# attribute별 label들의 biz 구성 비율
statistics = pd.DataFrame(columns=[ "attribuite "+str(i) for i in range(9)]+['num_biz'], index = ["biz count", "biz ratio"])
statistics.loc["biz count"] = np.append(np.sum(y_ppredict, axis=0), len(y_ppredict))
pd.options.display.float_format = '{:.0f}%'.format
statistics.loc["biz ratio"] = statistics.loc["biz count"]*100/len(y_ppredict) 
statistics

Unnamed: 0,attribuite 0,attribuite 1,attribuite 2,attribuite 3,attribuite 4,attribuite 5,attribuite 6,attribuite 7,attribuite 8,num_biz
biz count,68,182,196,186,86,282,300,82,279,400
biz ratio,17%,46%,49%,46%,22%,70%,75%,20%,70%,100%


In [37]:
from sklearn.metrics import f1_score

print("F1 score: ", f1_score(y_ptest, y_ppredict, average='micro')) 
print("Individual Class F1 score: ", f1_score(y_ptest, y_ppredict, average=None))

F1 score:  0.708445240226798
Individual Class F1 score:  [0.43478261 0.69680851 0.7628866  0.51733333 0.58947368 0.77443609
 0.85361552 0.51546392 0.81226054]


In [38]:
t = time.time()

mlb = MultiLabelBinarizer()
y_train= mlb.fit_transform(y_train)  #Convert list of labels to binary matrix

random_state = np.random.RandomState(0)
classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True, gamma=5))
classifier.fit(X_train, y_train)

y_predict = classifier.predict(X_test)

#print list(mlb.classes_)
y_predict_label = mlb.inverse_transform(y_predict) #Convert binary matrix back to labels

print("Time passed: ", "{0:.1f}".format(time.time()-t), "sec")

Time passed:  414.2 sec


In [39]:
X_test.shape

(10000, 2048)

In [40]:
test_data_frame  = pd.read_csv(data_root+"test_biz_HOGfeatures.csv")
df = pd.DataFrame(columns=['business_id','labels'])

In [41]:
for i in range(len(test_data_frame)):
    biz = test_data_frame.loc[i]['business']
    label = y_predict_label[i]
    label = str(label)[1:-1].replace(",", " ")
    df.loc[i] = [str(biz), label]

In [42]:
with open(data_root+"submission_HOG.csv",'w') as f:
    df.to_csv(f, index=False)

In [43]:
statistics = pd.DataFrame(columns=[ "attribuite "+str(i) for i in range(9)]+['num_biz'], index = ["biz count", "biz ratio"])
statistics.loc["biz count"] = np.append(np.sum(y_predict, axis=0), len(y_predict))
pd.options.display.float_format = '{:.0f}%'.format
statistics.loc["biz ratio"] = statistics.loc["biz count"]*100/len(y_predict) 
statistics

Unnamed: 0,attribuite 0,attribuite 1,attribuite 2,attribuite 3,attribuite 4,attribuite 5,attribuite 6,attribuite 7,attribuite 8,num_biz
biz count,831,6547,7056,3919,1478,8624,9087,1417,6995,10000
biz ratio,8%,65%,71%,39%,15%,86%,91%,14%,70%,100%
