## Submit 2

In [1]:
import os
from PIL import Image
import pandas as pd
import seaborn as sns
import numpy as np
from skimage.feature import hog

%matplotlib inline

## データ読み込み

In [2]:
def get_jpg_name(path):
    """Return the names of the ``.jpg`` files located directly in ``path``.

    Parameters
    ----------
    path : str
        Directory to scan (sub-directories are not visited).

    Returns
    -------
    list of str
        File names (without the directory part) ending in ``.jpg``, sorted
        so the result does not depend on the arbitrary order in which
        ``os.listdir`` reports entries.
    """
    # Test the extension, not a substring: `".jpg" in name` would also accept
    # entries such as "photo.jpg.bak".
    return sorted(name for name in os.listdir(path) if name.endswith(".jpg"))

# Root directory of the data set, relative to the notebook's working directory.
basepath = "../first_retailing/"
X_train_files = get_jpg_name(basepath+"train/") # training images
test_files  = get_jpg_name(basepath+"test/")    # images to predict

### 評価データ

In [3]:
# Training labels: tab-separated table read from train_master.tsv.
y_train = pd.read_csv(basepath+"train_master.tsv",sep="\t")
# Disabled: loading of a separate y_test.csv label file.
#y_test = pd.read_csv(basepath+"y_test.csv")

In [4]:
# Preview the first rows of the label table (file_name / category_id).
y_train.head()

Unnamed: 0,file_name,category_id
0,train_0.jpg,21
1,train_1.jpg,22
2,train_2.jpg,11
3,train_3.jpg,18
4,train_4.jpg,21


In [5]:
# Utilities
def get_img_array(path,resize):
    """Load an image, resize it to a square and return its pixel data.

    Parameters
    ----------
    path : str
        Path of the image file to open.
    resize : int
        Side length, in pixels, of the square the image is resized to.

    Returns
    -------
    The object returned by ``Image.getdata()`` for the resized image,
    i.e. a flattened sequence of pixel values.
    """
    # Use a context manager so the file handle is released as soon as the
    # pixels have been read; the notebook calls this once per image, and
    # unclosed handles would otherwise pile up. ``resize`` returns a new,
    # fully loaded image, so its data stays valid after the file is closed.
    with Image.open(path) as img:
        return img.resize([resize,resize]).getdata()

def get_HOG(img_array):
    """Return the HOG descriptor of ``img_array`` as a NumPy array.

    The descriptor is computed by ``skimage.feature.hog`` with 6 orientation
    bins, 3x3-pixel cells and 1x1-cell blocks.

    NOTE(review): the callers in this notebook pass the flat pixel sequence
    produced by ``get_img_array`` rather than a (height, width) image --
    confirm that this is the intended input for ``hog``.
    """
    hog_params = dict(orientations = 6, pixels_per_cell = (3, 3), cells_per_block = (1, 1))
    descriptor = hog(img_array, **hog_params)
    return np.array(descriptor)

# Evaluation metric
def M_accuracy(true, pred, n_classes=24):
    """Mean of the per-class accuracies (balanced accuracy).

    For every class id in ``range(n_classes)`` that occurs in the true
    labels, compute the fraction of that class's samples that were predicted
    correctly, then average those fractions.

    Parameters
    ----------
    true : mapping with a ``"category_id"`` entry (e.g. a DataFrame)
        True labels.
    pred : array-like
        Predicted labels, aligned position-by-position with ``true``.
    n_classes : int, default 24
        Class ids are taken to be ``0 .. n_classes - 1``.

    Returns
    -------
    float
        The averaged per-class accuracy, or NaN when none of the class ids
        occurs in ``true``.
    """
    labels = np.asarray(true["category_id"])
    # Accept lists / Series as well as ndarrays for boolean-mask indexing.
    predictions = np.asarray(pred)

    per_class = []
    for class_id in range(n_classes):
        mask = labels == class_id
        # A class with no samples has no defined accuracy; averaging its
        # NaN would turn the whole score into NaN, so skip it instead.
        if not mask.any():
            continue
        per_class.append(np.mean(predictions[mask] == class_id))

    if not per_class:
        return float("nan")
    return np.mean(per_class)

def classifaction_report_csv(report):
    """Convert the text of a classification report into a DataFrame.

    Parameters
    ----------
    report : str
        Text laid out like ``sklearn.metrics.classification_report`` output:
        a header line, then one line per class holding the class label
        followed by precision, recall, f1-score and support, then a blank
        line and summary lines.

    Returns
    -------
    pandas.DataFrame
        One row per class with the columns ``class`` (str), ``precision``,
        ``recall``, ``f1_score`` and ``support`` (floats). Summary lines
        after the per-class section are not included.
    """
    columns = ['class', 'precision', 'recall', 'f1_score', 'support']
    report_data = []
    header_seen = False
    for line in report.split('\n'):
        if not line.strip():
            # The blank line after the per-class rows starts the summary
            # section (averages); stop there. Earlier blank lines are skipped.
            if report_data:
                break
            continue
        if not header_seen:
            header_seen = True
            continue
        # Split from the right on any whitespace: the four numeric fields are
        # always last, while the label's width (and inner spaces) varies, so
        # splitting on a fixed run of spaces mis-assigns the fields.
        parts = line.strip().rsplit(None, 4)
        if len(parts) != 5:
            continue
        try:
            precision, recall, f1_score, support = [float(v) for v in parts[1:]]
        except ValueError:
            continue
        report_data.append({
            'class': parts[0].strip(),
            'precision': precision,
            'recall': recall,
            'f1_score': f1_score,
            'support': support,
        })
    return pd.DataFrame(report_data, columns=columns)

In [6]:
import numpy as np

# Turn every training image (resized to 10x10) into one flat feature row,
# keeping the row order of y_train so features and labels stay aligned.
# The held-out X_test is produced later by train_test_split.
X_train = [
    np.hstack(get_img_array(basepath + "train/%s"%(path), 10))
    for path in y_train["file_name"]
]

In [7]:
# Same flattening for the images to predict, in the order of test_files.
test = [
    np.hstack(get_img_array(basepath + "test/%s"%(path), 10))
    for path in test_files
]

In [8]:
# Convert the list of per-image feature rows into one NumPy array.
X_train = np.asarray(X_train)
# Disabled: the same conversion for a separately loaded X_test.
#X_test = np.asarray(X_test)

In [9]:
# Number of distinct category ids in the training labels.
len(np.unique(y_train["category_id"]))

24

In [10]:
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# sklearn.cross_validation was removed in 0.20, so only fall back to it on
# old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out half of the labelled images for evaluation.
# NOTE: this rebinds X_train / y_train to the training half, so running the
# cell a second time would split the already-halved data again.
X_train,X_test,y_train,y_test= train_test_split(X_train,y_train,test_size=0.5, random_state=0)



In [11]:
# Try various models from scikit-learn (plus XGBoost) with default settings.
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
# cross_val_score moved to sklearn.model_selection in scikit-learn 0.18 and
# sklearn.cross_validation was removed in 0.20.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score
import xgboost as xgb

# Candidate models; random_state is fixed where the estimator accepts one in
# this list so repeated runs are comparable.
classifiers = [
    SVC(),
    LinearSVC(),
    DecisionTreeClassifier( random_state=0 ),
    RandomForestClassifier( random_state=0 ),
    AdaBoostClassifier( random_state=0 ),
    GradientBoostingClassifier( random_state=0 ),
    KNeighborsClassifier(),
    xgb.XGBClassifier()
]

def multi_clf(classifiers,X_train,y_train,X_test, y_test):
    """Fit every classifier on the training split and score it on both splits.

    Parameters
    ----------
    classifiers : iterable
        Estimators exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, X_test : array-like
        Feature matrices of the training and hold-out splits.
    y_train, y_test : DataFrame-like
        Label tables; the ``"category_id"`` column is used as the target.

    Returns
    -------
    pandas.DataFrame
        One row per successfully evaluated classifier with the columns
        ``Classifier`` (class name) and ``CV_Accuracy``. Despite the column
        name, the value is the M_accuracy score on the single hold-out
        split; no cross-validation is performed here.

    Notes
    -----
    Train and hold-out scores are printed as each classifier finishes. A
    classifier that raises is reported together with the error and skipped.
    """
    log_cols=["Classifier", "CV_Accuracy"]
    # Collect plain rows and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = []

    for clf in classifiers:
        name = clf.__class__.__name__
        print(name)
        try:
            clf.fit(X_train,y_train["category_id"])
            train_accuracy = M_accuracy(true=y_train, pred=clf.predict(X_train))
            test_accuracy  = M_accuracy(true=y_test, pred=clf.predict(X_test))
        except Exception as exc:
            # Keep going with the remaining models, but show what went wrong
            # (a bare `except:` also swallowed KeyboardInterrupt).
            print("Could't Eval: %r" % (exc,))
            continue
        print("Train Accuracy: %s\n Test Accuracy: %s"%(train_accuracy, test_accuracy))
        print("")
        rows.append([name, test_accuracy])

    return pd.DataFrame(rows, columns=log_cols)

# Evaluate every candidate model on the raw-pixel features.
log_all = multi_clf(classifiers,X_train,y_train,X_test,y_test)

SVC
Train Accuracy: 1.0
 Test Accuracy: 0.042743625718

LinearSVC
Train Accuracy: 0.311785368431
 Test Accuracy: 0.181863355781

DecisionTreeClassifier
Train Accuracy: 1.0
 Test Accuracy: 0.363267232824

RandomForestClassifier
Train Accuracy: 0.979500326594
 Test Accuracy: 0.385260444102

AdaBoostClassifier
Train Accuracy: 0.108193824414
 Test Accuracy: 0.103353357486

GradientBoostingClassifier
Train Accuracy: 0.961659554494
 Test Accuracy: 0.435015423001

KNeighborsClassifier
Train Accuracy: 0.435939661146
 Test Accuracy: 0.28263432596

XGBClassifier
Train Accuracy: 0.844827475906
 Test Accuracy: 0.443575926132



In [12]:
# Use XGBoost: fit on the training half, then predict the competition test images.
xgbc = xgb.XGBClassifier(seed=0)
xgbc.fit(X_train,y_train["category_id"])
test_pred = xgbc.predict(test)

In [13]:
# test_files is not in the same order as sample_submission, so reorder the predictions.
# Step 1: map each test file name to its predicted category id.
submit_dic = {}
for i,name in enumerate(test_files):
    submit_dic[name] = test_pred[i]

# Step 2: emit [file name, prediction] rows in test_0.jpg, test_1.jpg, ... order.
# Assumes the test files are named exactly test_<k>.jpg for k = 0..n-1;
# any other name raises KeyError here -- TODO confirm.
submit_list = [] 
for i in range(len(submit_dic)):
    submit_list.append(["test_%s.jpg"%(i),submit_dic["test_%s.jpg"%(i)]])

In [14]:
# Write the submission file without the row index.
# NOTE(review): to_csv still writes a header row by default (here the column
# labels 0 and 1); columns=None only means "all columns". If the submission
# format has no header, header=False is needed -- confirm against sample_submission.
pd.DataFrame(submit_list).to_csv("submit2.csv",index=None,columns=None)

In [17]:
# Randomized search over XGBoost hyper-parameters.
# RandomizedSearchCV lives in sklearn.model_selection since scikit-learn 0.18;
# sklearn.grid_search was removed in 0.20, so only fall back on old installs.
try:
    from sklearn.model_selection import RandomizedSearchCV
except ImportError:
    from sklearn.grid_search import RandomizedSearchCV
import scipy as sp
# Import the subpackage explicitly: `import scipy` alone is not guaranteed to
# make `scipy.stats` available.
from scipy import stats

param_distributions={'max_depth': stats.randint(1,11),         # integers 1..10
                     'n_estimators':stats.randint(50,400),     # integers 50..399
                     'subsample': stats.uniform(0.5,0.5)       # uniform on [0.5, 1.0]
}

xgb_model = xgb.XGBClassifier(seed=0)
xgb_rs = RandomizedSearchCV(xgb_model,
                            param_distributions,
                            cv=3,               # number of CV folds
                            n_iter=10,          # number of parameter settings sampled
                            scoring="accuracy", # plain accuracy, not the per-class mean of M_accuracy
                            n_jobs=1,           # number of parallel jobs
                            verbose=0,          # verbosity of progress messages
                            random_state=1)

xgb_rs.fit(X_train,y_train["category_id"])
print("Best Model Parameter: ",xgb_rs.best_params_)

Best Model Parameter:  {'max_depth': 7, 'n_estimators': 259, 'subsample': 0.65302468762759736}


In [18]:
# Predict both splits with the fitted search object.
y_train_pred =  xgb_rs.predict(X_train)
y_test_pred =  xgb_rs.predict(X_test)

# Score with the per-class mean accuracy and report train vs. hold-out.
train_accuracy = M_accuracy(true=y_train, pred=y_train_pred)
test_accuracy  = M_accuracy(true=y_test, pred=y_test_pred)
print("Train Accuracy: %s\n Test Accuracy: %s"%(train_accuracy, test_accuracy))

Train Accuracy: 1.0
 Test Accuracy: 0.491152936508


In [19]:
# Predict the competition test images with the tuned model.
test_pred = xgb_rs.predict(test)

# test_files is not in the same order as sample_submission, so reorder the predictions.
submit_dic = {}
for i,name in enumerate(test_files):
    submit_dic[name] = test_pred[i]

# Rows in test_0.jpg, test_1.jpg, ... order; assumes the files are named
# exactly that way (any other name raises KeyError) -- TODO confirm.
submit_list = [] 
for i in range(len(submit_dic)):
    submit_list.append(["test_%s.jpg"%(i),submit_dic["test_%s.jpg"%(i)]])

# Writes to "submit2.csv", the same path the earlier XGBoost submission used,
# so that file is overwritten. to_csv emits a header row by default.
pd.DataFrame(submit_list).to_csv("submit2.csv",index=None,columns=None)

## SBSアルゴリズム

In [49]:
# SBS (sequential backward selection) algorithm.
# Starting from all features, keep removing the single feature whose removal
# hurts the score the least (i.e. accuracy drops the least).

from sklearn.base import clone
from itertools import combinations
import numpy as np
from sklearn.metrics import accuracy_score
# train_test_split / cross_val_score live in sklearn.model_selection since
# scikit-learn 0.18; sklearn.cross_validation was removed in 0.20.
try:
    from sklearn.model_selection import train_test_split, cross_val_score
except ImportError:
    from sklearn.cross_validation import train_test_split, cross_val_score
from progressbar import ProgressBar


class SBS():
    """Sequential backward selection (SBS) of feature columns.

    Starting from all columns, each round scores every subset obtained by
    dropping one remaining column and keeps the best-scoring subset, until
    only ``k_features`` columns are left.

    Parameters
    ----------
    estimator : estimator object
        Cloned on construction; the clone is what gets cross-validated.
    k_features : int
        Number of columns to keep.
    scoring : str or callable, default ``accuracy_score``
        When a string, it is passed to ``cross_val_score`` as ``scoring``.
        Any other value (including the default metric function, which is not
        a scorer) falls back to ``"accuracy"``.
    random_state : int, default 1
        Stored on the instance; not used by the current implementation.

    Attributes (set by ``fit``)
    ---------------------------
    indices_ : tuple of int
        Column positions of the selected features.
    subsets_ : list of tuple
        Selected subset after each round, starting with all columns.
    scores_ : list of float
        Mean CV score of each entry of ``subsets_``.
    k_score_ : float
        Score of the final subset.
    """

    def __init__(self, estimator, k_features, scoring=accuracy_score,random_state=1):
        self.scoring = scoring             # see class docstring for how this is used
        self.estimator = clone(estimator)  # private copy of the estimator
        self.k_features = k_features       # number of features to select
        self.random_state = random_state   # kept for API compatibility

    def fit(self, X_train, y_train):
        """Run the backward elimination on ``X_train`` / ``y_train``.

        ``X_train`` may be a DataFrame or a 2-D array. Every round
        cross-validates one candidate per remaining column, so the cost
        grows quadratically with the number of columns.
        """
        dim = X_train.shape[1]  # number of columns
        self.indices_ = tuple(range(dim))
        self.subsets_ = [self.indices_]
        self.scores_ = [self._calc_score(X_train, y_train, self.indices_)]

        # One progress step per eliminated column. The bar must be started
        # before update() is called, and update() needs an explicit counter
        # (the previous code referenced an undefined name `i`).
        n_steps = max(dim - self.k_features, 0)
        pg = ProgressBar( maxval=max(n_steps, 1) )
        pg.start()
        removed = 0

        while dim > self.k_features:
            # All subsets of the current selection with exactly one column removed.
            subsets = list(combinations(self.indices_, r=dim - 1))
            scores = [self._calc_score(X_train, y_train, p) for p in subsets]

            best = int(np.argmax(scores))
            self.indices_ = subsets[best]
            self.subsets_.append(self.indices_)
            self.scores_.append(scores[best])
            dim -= 1

            removed += 1
            pg.update(removed)
        pg.finish()
        self.k_score_ = self.scores_[-1]

        return self

    def transform(self, X):
        """Return only the selected columns of ``X`` (DataFrame or 2-D array)."""
        return self._select(X, self.indices_)

    @staticmethod
    def _select(X, indices):
        """Pick columns by position from a DataFrame or an array-like."""
        cols = list(indices)
        if hasattr(X, "iloc"):
            return X.iloc[:, cols]
        return np.asarray(X)[:, cols]

    def _calc_score(self, X_train, y_train, indices):
        """Mean 3-fold cross-validation score of the estimator on ``indices``."""
        scoring = self.scoring if isinstance(self.scoring, str) else "accuracy"
        acc = cross_val_score(self.estimator, self._select(X_train, indices), y_train,
                              scoring=scoring, cv = 3)
        return np.mean(acc)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
# sklearn.cross_validation was removed in scikit-learn 0.20; prefer
# sklearn.model_selection and only fall back on old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
import xgboost as xgb

print("starting evaluation of SBS")
xgbC = xgb.XGBClassifier()
# k_features=1 runs the elimination all the way down to a single feature,
# which cross-validates one candidate per remaining column in every round.
sbs = SBS(xgbC,k_features=1)

print("Fit SBS")
sbs.fit(pd.DataFrame(X_train),y_train["category_id"])

print("Plot")
# Score as a function of the number of remaining features.
k_feat = [len(k) for k in sbs.subsets_]
plt.plot(k_feat, sbs.scores_, marker='o')
# Accuracy lies in [0, 1]; the models in this notebook score below 0.5 on
# held-out data, so a lower limit of 0.7 would hide the whole curve.
plt.ylim([0.0, 1.0])
plt.ylabel('Accuracy')
plt.xlabel('Number of features')
plt.grid()
plt.tight_layout()
plt.show()

## HOG特徴量を用いたモデリング

In [6]:
# HOG features for the training images, one row per entry of y_train["file_name"].
# NOTE(review): y_train must still be the full label table here; once a
# train_test_split cell has rebound it, only that half is processed.
resize = 10
X_train_hog = []
for path in y_train["file_name"]:
    hog_feature = get_HOG(get_img_array(basepath+"train/"+path,resize))
    # The previous check was `if hog:`, which tests the imported function
    # (always truthy), so the zero-vector fallback could never run. Test the
    # descriptor itself, and only fall back once a reference length exists.
    if hog_feature.size == 0 and X_train_hog:
        hog_feature = np.zeros(len(X_train_hog[0]))
    X_train_hog.append(hog_feature)

In [9]:
# HOG features for the images to predict, in the order of test_files.
resize = 10
test_hog = []
for path in test_files:
    hog_feature = get_HOG(get_img_array(basepath+"test/"+path,resize))
    # The previous check was `if hog:`, which tests the imported function
    # (always truthy), so the zero-vector fallback could never run. Use the
    # length of the first test descriptor as reference, so this cell does not
    # depend on X_train_hog still being a plain list.
    if hog_feature.size == 0 and test_hog:
        hog_feature = np.zeros(len(test_hog[0]))
    test_hog.append(hog_feature)

In [10]:
# One row per image, one column per HOG component.
X_train_hog = pd.DataFrame(X_train_hog)
test = pd.DataFrame(test_hog)

# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# sklearn.cross_validation was removed in 0.20.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out half of the labelled images for evaluation.
# NOTE: this rebinds X_train / y_train / test, replacing the raw-pixel
# versions, and running the cell twice would split the halved labels again.
X_train,X_test,y_train,y_test= train_test_split(X_train_hog, y_train,test_size=0.5, random_state=0)



In [13]:
# (number of training rows, number of HOG features per image)
X_train.shape

(6199, 198)

In [23]:
# Try various scikit-learn models with default settings on the HOG features.
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
# cross_val_score moved to sklearn.model_selection in scikit-learn 0.18 and
# sklearn.cross_validation was removed in 0.20.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# Candidate models; random_state is fixed where the estimator accepts one in
# this list so repeated runs are comparable.
classifiers = [
    SVC(),
    LinearSVC(),
    DecisionTreeClassifier( random_state=0 ),
    RandomForestClassifier( random_state=0 ),
    AdaBoostClassifier( random_state=0 ),
    GradientBoostingClassifier( random_state=0 ),
    KNeighborsClassifier()
]

def multi_clf(classifiers,X_train,y_train,X_test, y_test):
    """Fit every classifier on the training split and score it on both splits.

    Parameters
    ----------
    classifiers : iterable
        Estimators exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, X_test : array-like
        Feature matrices of the training and hold-out splits.
    y_train, y_test : DataFrame-like
        Label tables; the ``"category_id"`` column is used as the target.

    Returns
    -------
    pandas.DataFrame
        One row per successfully evaluated classifier with the columns
        ``Classifier`` (class name) and ``CV_Accuracy``. Despite the column
        name, the value is the M_accuracy score on the single hold-out
        split; no cross-validation is performed here.

    Notes
    -----
    Train and hold-out scores are printed as each classifier finishes. A
    classifier that raises is reported together with the error and skipped.
    """
    log_cols=["Classifier", "CV_Accuracy"]
    # Collect plain rows and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = []

    for clf in classifiers:
        name = clf.__class__.__name__
        print(name)
        try:
            clf.fit(X_train,y_train["category_id"])
            train_accuracy = M_accuracy(true=y_train, pred=clf.predict(X_train))
            test_accuracy  = M_accuracy(true=y_test, pred=clf.predict(X_test))
        except Exception as exc:
            # Keep going with the remaining models, but show what went wrong
            # (a bare `except:` also swallowed KeyboardInterrupt).
            print("Could't Eval: %r" % (exc,))
            continue
        print("Train Accuracy: %s\n Test Accuracy: %s"%(train_accuracy, test_accuracy))
        print("")
        rows.append([name, test_accuracy])

    return pd.DataFrame(rows, columns=log_cols)

# Evaluate every candidate model on the HOG features.
log_all = multi_clf(classifiers,X_train,y_train,X_test,y_test)

SVC
Train Accuracy: 0.120342608212
 Test Accuracy: 0.116508049307

LinearSVC
Train Accuracy: 0.372871921929
 Test Accuracy: 0.194406152199

DecisionTreeClassifier
Train Accuracy: 1.0
 Test Accuracy: 0.201611882305

RandomForestClassifier
Train Accuracy: 0.985723624197
 Test Accuracy: 0.217229293693

AdaBoostClassifier
Train Accuracy: 0.089810200327
 Test Accuracy: 0.0785694568035

GradientBoostingClassifier
Train Accuracy: 0.894200142111
 Test Accuracy: 0.268846121836

KNeighborsClassifier
Train Accuracy: 0.367173779823
 Test Accuracy: 0.183691476809

