In [1]:
from PIL import Image
import numpy as np
import pandas as pd
import pickle

# Load Data

In [2]:
# All arrays are stored under data/ and share one file suffix.
subfilename = ".npy"

# Images / labels for training, images for testing.
X_train = np.load(f"data/X_train{subfilename}")
y_train = np.load(f"data/y_train{subfilename}")
X_test = np.load(f"data/X_test{subfilename}")

# Per-sample identifiers, loaded alongside the image arrays.
ids_train = np.load(f"data/ids_train{subfilename}")
ids_test = np.load(f"data/ids_test{subfilename}")

# Sanity check: print the shape of everything that was loaded.
print(*(arr.shape for arr in (X_train, y_train, X_test)))
print(*(arr.shape for arr in (ids_train, ids_test)))

(529, 224, 224, 3) (529,) (438, 224, 224, 3)
(529,) (438,)


In [3]:
# Load the per-sample FD lists (one entry per sample; later cells treat a
# None entry as "no FD available" for that sample).
# NOTE(security): pickle can execute arbitrary code while loading; only load
# these files if they come from a trusted source.
# Context managers make sure the file handles are closed after loading.
with open("data/FD_train_list.pkl", "rb") as fh:
    FD_train_list = pickle.load(fh)
with open("data/FD_test_list.pkl", "rb") as fh:
    FD_test_list = pickle.load(fh)
print(len(FD_train_list), len(FD_test_list))

529 438


# 1-NN (handy)

# Split samples with FD / without FD

In [4]:
def split_with_n_without_FD(ids, X, FD_list):
    """Partition samples by whether an FD entry is available.

    ``ids``, ``X`` and ``FD_list`` are walked in lockstep. A sample counts as
    "with FD" when its entry in ``FD_list`` is not ``None`` and as
    "without FD" otherwise.

    Parameters
    ----------
    ids : iterable
        Per-sample identifiers.
    X : iterable
        Per-sample data; kept only for the samples without FD.
    FD_list : iterable
        Per-sample FD entry, or ``None`` when the sample has none.

    Returns
    -------
    tuple of six numpy.ndarray
        ``(index_w_FD, index_wo_FD, id_w_FD, FD_list_w_FD, id_wo_FD, X_wo_FD)``
        where the two ``index_*`` arrays hold the positions of the samples in
        the inputs, ``id_*`` the matching identifiers, ``FD_list_w_FD`` the
        non-``None`` FD entries and ``X_wo_FD`` the data of the samples that
        have no FD.
    """
    index_w_FD, id_w_FD, FD_list_w_FD = [], [], []
    index_wo_FD, id_wo_FD, X_wo_FD = [], [], []

    for i, (id_, X_, FD) in enumerate(zip(ids, X, FD_list)):
        if FD is not None:
            index_w_FD.append(i)
            id_w_FD.append(id_)
            FD_list_w_FD.append(FD)
        else:
            index_wo_FD.append(i)
            id_wo_FD.append(id_)
            X_wo_FD.append(X_)

    # The index arrays are meant to be used for fancy indexing (e.g.
    # ``y[index_w_FD]``). An empty list would otherwise become a float64
    # array, which NumPy rejects as an index, so force an integer dtype.
    return (
        np.array(index_w_FD, dtype=np.intp),
        np.array(index_wo_FD, dtype=np.intp),
        np.array(id_w_FD),
        np.array(FD_list_w_FD),
        np.array(id_wo_FD),
        np.array(X_wo_FD),
    )

In [5]:
%%time 
# Training set: positions, ids and FD entries of the samples that have an FD,
# plus ids and images of the samples that do not.
(
    index_train_w_FD,
    index_train_wo_FD,
    id_train_w_FD,
    FD_train_list_w_FD,
    id_train_wo_FD,
    X_train_wo_FD,
) = split_with_n_without_FD(ids_train, X_train, FD_train_list)

# Test set: same partition, same ordering of the returned arrays.
(
    index_test_w_FD,
    index_test_wo_FD,
    id_test_w_FD,
    FD_test_list_w_FD,
    id_test_wo_FD,
    X_test_wo_FD,
) = split_with_n_without_FD(ids_test, X_test, FD_test_list)

CPU times: user 38 ms, sys: 123 ms, total: 161 ms
Wall time: 220 ms


In [6]:
# Show the shape of each split array as a single tuple
# (FD_train_list_w_FD is listed twice, as in the original display).
tuple(
    arr.shape
    for arr in (
        index_train_w_FD,
        index_train_wo_FD,
        id_train_w_FD,
        FD_train_list_w_FD,
        id_train_wo_FD,
        X_train_wo_FD,
        FD_train_list_w_FD,
        FD_test_list_w_FD,
    )
)

((413,),
 (116,),
 (413,),
 (413, 128),
 (116,),
 (116, 224, 224, 3),
 (413, 128),
 (361, 128))

In [7]:
# Pick the training labels that belong to each partition, using the position
# arrays returned by split_with_n_without_FD.
y_train_w_FD, y_train_wo_FD = (
    y_train[positions] for positions in (index_train_w_FD, index_train_wo_FD)
)
(y_train_w_FD.shape, y_train_wo_FD.shape)

((413,), (116,))

In [21]:
# Display the labels of the training samples that have no FD entry.
y_train_wo_FD

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 3., 3., 3., 3., 3., 3., 3.,
       4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.,
       4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.])

# Use kNN to classify FD

In [42]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours with default hyper-parameters, fitted on the FD
# vectors of the training samples (fit returns the estimator itself).
cls_knn = KNeighborsClassifier().fit(FD_train_list_w_FD, y_train_w_FD)

# Scored on the very data it was fitted on, so this is training accuracy only.
cls_knn.score(FD_train_list_w_FD, y_train_w_FD)

0.9612590799031477

In [43]:
# Predict classes for the test samples that have an FD, using the kNN model.
# Note: this result is not referenced again in the cells shown; the submission
# frame is built from the linear model's predictions instead.
y_pred_w_FD_knn = cls_knn.predict(FD_test_list_w_FD)

# Use Linear Model to classify FD

In [46]:
from sklearn.linear_model import RidgeClassifier
# Ridge classifier with default hyper-parameters, fitted on the FD vectors of
# the training samples (fit returns the estimator itself).
cls = RidgeClassifier().fit(FD_train_list_w_FD, y_train_w_FD)

# Scored on the very data it was fitted on, so this is training accuracy only.
cls.score(FD_train_list_w_FD, y_train_w_FD)

0.9709443099273608

In [47]:
# Predict classes for the test samples that have an FD with the Ridge model.
# `cls` is rebound to a different model in a later cell, so this line must run
# while `cls` still refers to the classifier fitted on FD vectors.
y_pred_w_FD_linear = cls.predict(FD_test_list_w_FD)

In [57]:
# One row per test sample that has an FD: its id and predicted class (as int).
df_op_w_FD_linear = pd.DataFrame(
    {
        'id': id_test_w_FD,
        'class': y_pred_w_FD_linear.astype(int),
    }
)
# Report how many ids were assigned to each predicted class.
print("預測結果: ", df_op_w_FD_linear.groupby('class')['id'].count().to_dict())

預測結果:  {0: 52, 1: 61, 2: 59, 3: 80, 4: 109}


# Use VGGFace for feature extraction

In [24]:
from keras.engine import  Model
from keras.layers import Input
from keras_vggface.vggface import VGGFace

# Convolution Features
# Build VGGFace for 224x224 RGB inputs with include_top=False and pooling='avg',
# i.e. used here as a feature extractor rather than a classifier.
# The shapes printed later in the notebook show 512 values per image.
model = VGGFace(include_top=False, input_shape=(224, 224, 3), pooling='avg') 

In [37]:
# Run the images that have no FD through the VGGFace model to get one feature
# vector per image, first for the training split and then for the test split.
# NOTE(review): the arrays are passed to the network exactly as loaded --
# confirm that any input preprocessing VGGFace expects was applied upstream.
vgg_features_train_wo_FD, vgg_features_test_wo_FD = (
    model.predict(images) for images in (X_train_wo_FD, X_test_wo_FD)
)

In [39]:
# Display the shapes of the extracted feature matrices (train, test).
vgg_features_train_wo_FD.shape, vgg_features_test_wo_FD.shape

((116, 512), (77, 512))

# Use kNN to classify wo_FD

In [48]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours with default hyper-parameters, fitted on the VGGFace
# features of the training samples without FD. This rebinds `cls_knn`, which
# previously held the model fitted on FD vectors.
cls_knn = KNeighborsClassifier().fit(vgg_features_train_wo_FD, y_train_wo_FD)

# Scored on the very data it was fitted on, so this is training accuracy only.
cls_knn.score(vgg_features_train_wo_FD, y_train_wo_FD)

0.6637931034482759

# Use Linear Model to classify wo_FD

In [50]:
from sklearn.linear_model import RidgeClassifier
# Ridge classifier with default hyper-parameters, fitted on the VGGFace
# features of the training samples without FD. This rebinds `cls`, which
# previously held the model fitted on FD vectors.
cls = RidgeClassifier().fit(vgg_features_train_wo_FD, y_train_wo_FD)

# Scored on the very data it was fitted on, so this is training accuracy only.
cls.score(vgg_features_train_wo_FD, y_train_wo_FD)

0.8793103448275862

In [51]:
# Predict classes for the test samples without FD from their VGGFace features.
# `cls` must be the Ridge model fitted on VGGFace features at this point; the
# same name is also used for the FD-based model earlier in the notebook.
y_pred_wo_FD_linear = cls.predict(vgg_features_test_wo_FD)

In [55]:
# One row per test sample without FD: its id and predicted class (as int).
df_op_wo_FD_linear = pd.DataFrame(
    {
        'id': id_test_wo_FD,
        'class': y_pred_wo_FD_linear.astype(int),
    }
)
# Report how many ids were assigned to each predicted class.
print("預測結果: ", df_op_wo_FD_linear.groupby('class')['id'].count().to_dict())

預測結果:  {0: 27, 1: 5, 2: 27, 3: 1, 4: 17}


# Save out

In [58]:
# Stack the predictions for samples with FD and without FD row-wise into one
# submission table. `axis` is passed by keyword: passing it positionally is
# deprecated in pandas and rejected by newer releases.
df_op = pd.concat([df_op_w_FD_linear, df_op_wo_FD_linear], axis=0)
df_op.shape

(438, 2)

In [59]:
from datetime import datetime
from pathlib import Path

# Timestamp (to the second) in the file name keeps successive submissions apart.
t_str = datetime.now().strftime("%Y%m%d%H%M%S")
pth = f'submit/submission_{t_str}.csv'

# to_csv does not create missing directories, so make sure submit/ exists
# before writing (no-op when it is already there).
Path(pth).parent.mkdir(parents=True, exist_ok=True)

# Write only the id/class columns; the DataFrame index is not part of the file.
df_op.to_csv(pth, index=False)
print(f'save scv: {pth}') # acc: 0.83

save scv: submit/submission_20191007035443.csv
