# DAY3で使用したデータセットの準備

In [1]:
# 必要ライブラリのインポート
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
# 主成分分析
from sklearn.decomposition import PCA
# k-最近傍法
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# 汎化性能算出用の共通関数
def calc_general_perf(y_est, y_test):
    accuracy = accuracy_score(y_test, y_est)
    precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_est)

    print('正解率 = {:.3f}'.format(accuracy * 100))
    print('適合率(precision) = {:.3f}'.format(precision[1] * 100))
    print('再現率(recall) = {:.3f}'.format(recall[1] * 100))
    print('F1値(F1-score) = {:.3f}'.format(f1_score[1] * 100))

In [3]:
# データの読み込み
df_kick_org = pd.read_csv('./data/ks-projects-201801.csv')

In [4]:
# stateがsuccessfulまたはfailedとなるレコードの抽出
df_kick = df_kick_org.loc[(df_kick_org['state'] == 'successful') | (df_kick_org['state'] == 'failed'), :]

df_kick.loc[df_kick['state'] == 'failed', 'state'] = 0
df_kick.loc[df_kick['state'] == 'successful', 'state'] = 1

display(df_kick)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,0,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,0,15,US,100.0,2421.0,30000.00
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,0,3,US,220.0,220.0,45000.00
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,0,1,US,1.0,1.0,5000.00
5,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,50000.0,2016-02-26 13:38:27,52375.0,1,224,US,52375.0,52375.0,50000.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378654,999975836,"Homemade fresh dog food, Cleveland OH",Small Batch,Food,USD,2017-04-19,6500.0,2017-03-20 22:08:22,154.0,0,4,US,0.0,154.0,6500.00
378657,999977640,The Tribe,Narrative Film,Film & Video,USD,2011-07-19,1500.0,2011-06-22 03:35:14,155.0,0,5,US,155.0,155.0,1500.00
378658,999986353,Walls of Remedy- New lesbian Romantic Comedy f...,Narrative Film,Film & Video,USD,2010-08-16,15000.0,2010-07-01 19:40:30,20.0,0,1,US,20.0,20.0,15000.00
378659,999987933,BioDefense Education Kit,Technology,Technology,USD,2016-02-13,15000.0,2016-01-13 18:13:53,200.0,0,6,US,200.0,200.0,15000.00


In [5]:
# nameの欠損値を埋める
df_kick.loc[:, 'name'] = df_kick['name'].fillna('')
df_kick.isnull().sum()

ID                    0
name                  0
category              0
main_category         0
currency              0
deadline              0
goal                  0
launched              0
pledged               0
state                 0
backers               0
country               0
usd pledged         210
usd_pledged_real      0
usd_goal_real         0
dtype: int64

# nameに含まれる単語から成功率を算出する

In [6]:
df_kick['split_name'] = df_kick['name'].str.split()

# nameに特定の単語が含まれていることを示す説明変数を追加する
def contains_words(df, column, words):
    words_array = []

    def extend_array(array):
        # wordsの単語リストが含まれていれば1, そうでなければ0
        words_array.append(0 if set(words).isdisjoint(set(array)) else 1)
        
    df['split_name'].apply(extend_array)
    df.loc[:, column] = words_array

# Art/artを含むかどうかを示す説明変数
contains_words(df_kick, 'name_art', ['Art', 'art'])

# Project/projectを含むかどうかを示す説明変数
contains_words(df_kick, 'name_project', ['Project', 'project'])

# New/new/Debut/debut/First/firstを含むかどうかを示す説明変数
contains_words(df_kick, 'name_new', ['New', 'new', 'Debut', 'debut', 'First', 'first'])

# Album/albumを含むかどうかを示す説明変数
contains_words(df_kick, 'name_album', ['Album', 'album'])

# Film/firmを含むかどうかを示す説明変数
contains_words(df_kick, 'name_film', ['Film', 'film'])

# Book/bookを含むかどうかを示す説明変数
contains_words(df_kick, 'name_book', ['Book', 'book'])

# Game/gameを含むかどうかを示す説明変数
contains_words(df_kick, 'name_game', ['Game', 'game'])

# Music/musicを含むかどうかを示す説明変数
contains_words(df_kick, 'name_music', ['Music', 'music'])

# Help/helpを含むかどうかを示す説明変数
contains_words(df_kick, 'name_help', ['Help', 'help'])

# Short/shortを含むかどうかを示す説明変数
contains_words(df_kick, 'name_short', ['Short', 'short'])

# Series/seriesを含むかどうかを示す説明変数
contains_words(df_kick, 'name_series', ['Series', 'series'])

# Documentary/documentaryを含むかどうかを示す説明変数
contains_words(df_kick, 'name_documentary', ['Documentary', 'documentary'])

# Life/lifeを含むかどうかを示す説明変数
contains_words(df_kick, 'name_life', ['Life', 'life'])

# World/worldを含むかどうかを示す説明変数
contains_words(df_kick, 'name_world', ['World', 'world'])

# Video/videoを含むかどうかを示す説明変数
contains_words(df_kick, 'name_video', ['Video', 'video'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)


In [7]:
df_result = pd.DataFrame({'成功率': [], '失敗率': [], '成功と失敗の差': []})

def disp_succeed_ratio(df, columns):
    for column in columns:
#        print('=== 失敗／成功に対する{:s}を含む／含まないそれぞれの割合 ==='.format(column))
#        print(df.groupby('state')[column].value_counts(normalize = True))
        
        df_contains = df.loc[df[column] == 1]
        
        total = df_contains.shape[0]
        success = df_contains.loc[df_contains['state'] == 1].shape[0]
        fail = df_contains.loc[df_contains['state'] == 0].shape[0]
        
        succeed_ratio = success / total
        fail_ratio = fail / total
#        print('{:s}を含む場合の成功率={:.5f}, 失敗率={:.5f}'.format(column, succeed_ratio, fail_ratio))
        df_result.loc[column] = {'成功率': succeed_ratio, '失敗率': fail_ratio, 
                                 '成功と失敗の差': np.abs(succeed_ratio - fail_ratio)}

# nameに含まれる単語から作成した説明変数から判断できる成功率
disp_succeed_ratio(df_kick, ['name_art', 'name_project', 'name_new', 'name_album', 
                             'name_film', 'name_book', 'name_music', 'name_help', 
                             'name_short', 'name_series', 'name_documentary', 'name_life', 
                             'name_world', 'name_video'])

display(df_result)

Unnamed: 0,成功率,失敗率,成功と失敗の差
name_art,0.434661,0.565339,0.130677
name_project,0.403833,0.596167,0.192334
name_new,0.554953,0.445047,0.109907
name_album,0.592368,0.407632,0.184736
name_film,0.559831,0.440169,0.119662
name_book,0.445056,0.554944,0.109887
name_music,0.416593,0.583407,0.166815
name_help,0.444721,0.555279,0.110558
name_short,0.62289,0.37711,0.245779
name_series,0.369559,0.630441,0.260883


成功と失敗の差が大きい説明変数が、成功または失敗を特徴づける変数として有効と考えられる。  
今回は、成功と失敗の差が0.15以上のものを選択する。
* name_project
* name_album
* name_music
* name_short
* name_series
* name_life
* name_world
* name_video

# その他の説明変数の準備

In [8]:
# 募集期間periodの作成
df_kick.loc[:, 'deadline_dt'] = pd.to_datetime(df_kick['deadline'])
df_kick.loc[:, 'launched_dt'] = pd.to_datetime(df_kick['launched'])

df_kick.loc[:, 'period_dt'] = df_kick['deadline_dt'] - df_kick['launched_dt']
df_kick.loc[:, 'period'] = df_kick['period_dt'].dt.days

# nameの文字数をname_lengthという説明変数にする
df_kick.loc[:, 'name_length'] = df_kick['name'].str.len()

# name_lengthの欠損値に0を入れる
df_kick.loc[:, 'name_length'] = df_kick['name_length'].fillna(0)

# usd_goal_realを対数変換する
df_kick.loc[:, 'usd_goal_ln'] = np.log10(df_kick.loc[:, 'usd_goal_real'])

# main_category, currencyをone-hot-vector化する
df_kick = pd.get_dummies(df_kick[['state', 'main_category', 'currency', 'usd_goal_ln', 'period', 
                                  'name_length', 'name_project', 'name_album', 'name_music',
                                  'name_short', 'name_series', 'name_life', 'name_world', 'name_video']])

display(df_kick.head())

Unnamed: 0,state,usd_goal_ln,period,name_length,name_project,name_album,name_music,name_short,name_series,name_life,...,currency_EUR,currency_GBP,currency_HKD,currency_JPY,currency_MXN,currency_NOK,currency_NZD,currency_SEK,currency_SGD,currency_USD
0,0,3.185811,58,31,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,4.477121,59,45,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,4.653213,44,14,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,3.69897,29,49,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5,1,4.69897,34,20,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [9]:
# ホールドアウト法で訓練データと検証データを4:1に分割
from sklearn.model_selection import train_test_split

X_df = df_kick[df_kick.columns[1:]]
y_df = pd.Series(df_kick['state'])

test_size = 0.2

X_df_train, X_df_test, y_df_train, y_df_test = train_test_split(
    X_df, y_df, test_size = test_size, random_state = 1234)

In [10]:
# 説明変数の標準化
import sys
from sklearn.preprocessing import StandardScaler

stdsc = StandardScaler()

# 訓練データの標準化
X_df_train.loc[:, ['usd_goal_ln', 'period', 'name_length']] = stdsc.fit_transform(
    X_df_train[['usd_goal_ln', 'period', 'name_length']].values)

# 検証データの標準化
# 訓練データの平均と標準偏差を利用して標準化を行う→fit_transformではなく、transformを使う
X_df_test.loc[:, ['usd_goal_ln', 'period', 'name_length']] = stdsc.transform(
    X_df_test[['usd_goal_ln', 'period', 'name_length']].values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [11]:
from sklearn.feature_selection import RFECV

# L2正則化項のハイパーパラメータはDay2で探索した最適値を利用する
# モデルによって特徴選択結果は変わる、線形分類が適切かどうかはわからない
estimator = SGDClassifier(loss = 'log', fit_intercept = True, penalty = 'l2', alpha = 1e-3,
                          random_state = 1234, max_iter = 10000, tol = 1e-3)

# ステップワイズ法＋交差検証法で特徴選択を行う
rfecv = RFECV(estimator, cv = 5, scoring = 'accuracy', n_jobs = 3)

X_train = X_df_train.values
y_train = y_df_train.values

rfecv.fit(X_train, y_train)

RFECV(cv=5,
      estimator=SGDClassifier(alpha=0.001, average=False, class_weight=None,
                              early_stopping=False, epsilon=0.1, eta0=0.0,
                              fit_intercept=True, l1_ratio=0.15,
                              learning_rate='optimal', loss='log',
                              max_iter=10000, n_iter_no_change=5, n_jobs=None,
                              penalty='l2', power_t=0.5, random_state=1234,
                              shuffle=True, tol=0.001, validation_fraction=0.1,
                              verbose=0, warm_start=False),
      min_features_to_select=1, n_jobs=3, scoring='accuracy', step=1,
      verbose=0)

In [12]:
# 削除してもよい特徴量を削除
remove_idx = ~rfecv.support_

remove_feature = X_df_train.columns[remove_idx]
print("削除する特徴量")
print(remove_feature)

X_df_train = X_df_train.drop(remove_feature, axis = 1)
X_df_test = X_df_test.drop(remove_feature, axis = 1)
print("残った特徴量")
print(X_df_train.columns)

削除する特徴量
Index(['main_category_Art', 'currency_CAD', 'currency_CHF', 'currency_DKK',
       'currency_JPY', 'currency_NOK', 'currency_NZD', 'currency_SEK',
       'currency_SGD'],
      dtype='object')
残った特徴量
Index(['usd_goal_ln', 'period', 'name_length', 'name_project', 'name_album',
       'name_music', 'name_short', 'name_series', 'name_life', 'name_world',
       'name_video', 'main_category_Comics', 'main_category_Crafts',
       'main_category_Dance', 'main_category_Design', 'main_category_Fashion',
       'main_category_Film & Video', 'main_category_Food',
       'main_category_Games', 'main_category_Journalism',
       'main_category_Music', 'main_category_Photography',
       'main_category_Publishing', 'main_category_Technology',
       'main_category_Theater', 'currency_AUD', 'currency_EUR', 'currency_GBP',
       'currency_HKD', 'currency_MXN', 'currency_USD'],
      dtype='object')


In [13]:
print('次元数={:d}'.format(X_df_train.shape[1]))

次元数=31


# 特徴選択後の説明変数でロジスティック回帰を実施
* ハイパーパラメータ$\alpha$をランダムサーチで探索する

In [14]:
from scipy import stats
from sklearn.model_selection import RandomizedSearchCV

n_split = 5 # 分割数

accuracy_total = 0.0
split_num = 1

X_train = X_df_train.values
y_train = y_df_train.values

# ロジスティック回帰モデルで訓練性能を交差検証で算出する
# L2正則化項のハイパーパラメータalphaの値は説明変数が変わった場合は、再度探索した方がよい
params = {'alpha' : stats.expon(scale = 1e-3)}
clf = RandomizedSearchCV(SGDClassifier(loss = 'log', fit_intercept = True, penalty = 'l2', 
                                 random_state = 1234, max_iter = 10000, tol = 1e-3),
                  params, cv = 5, n_jobs = 3)
clf.fit(X_train, y_train)
print("score={:.3f}".format(clf.best_score_ * 100))
print("best_params={:}".format(clf.best_params_))

score=65.580
best_params={'alpha': 0.0007274594464945687}


In [15]:
# 汎化性能を算出
X_test = X_df_test.values
y_test = y_df_test.values

y_est = clf.predict(X_test)
calc_general_perf(y_est, y_test)

正解率 = 65.735
適合率(precision) = 60.379
再現率(recall) = 42.384
F1値(F1-score) = 49.806


# 決定木のハイパーパラメータをランダムサーチで探索する
ランダムサーチに指定する確率密度関数として指数分布 $\lambda * \exp{(- \frac{x}{\lambda})}$ を用いる。

$\lambda$ は期待値で、ランダムサーチの探索範囲の中央値付近になるように指定する。  
グリッドサーチ探索時にmax_depthは10が最適値だったので、$\lambda = 10$とする指数分布を探索時に指定する。

min_samples_splitとmin_samples_leafはグリッドサーチで探索する。

※ハイパーパラメータの探索は3つ指定したうちの中央値が選択されるところまで実施するのがよい

In [16]:
from scipy import stats
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV

X_train = X_df_train.values
y_train = y_df_train.values

# min_samples_splitやmin_samples_leafも探索した方がよい
params = {"criterion": ["gini", "entropy"], "max_depth": stats.expon(scale = 13),
         "min_samples_split": range(2, 11, 1),
         "min_samples_leaf": range(5, 15, 1)}
clf = RandomizedSearchCV(DecisionTreeClassifier(random_state = 1234),
                   params, cv = 5, n_iter = 100, n_jobs = 3, random_state = 1234)
clf.fit(X_train, y_train)
print("score={:.3f}".format(clf.best_score_ * 100))
print("best_params={:}".format(clf.best_params_))

score=65.784
best_params={'criterion': 'entropy', 'max_depth': 14.325722671024236, 'min_samples_leaf': 14, 'min_samples_split': 5}


In [17]:
# 最適なハイパーパラメータを用いて決定木で学習を行う
clf = DecisionTreeClassifier(criterion = "gini", max_depth = 14.325722671024236, random_state = 1234,
                            min_samples_split = 5, min_samples_leaf = 14)
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini',
                       max_depth=14.325722671024236, max_features=None,
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=14,
                       min_samples_split=5, min_weight_fraction_leaf=0.0,
                       presort=False, random_state=1234, splitter='best')

In [18]:
# 検証データの汎化性能を確認する
y_est = clf.predict(X_test)

calc_general_perf(y_est, y_test)

正解率 = 65.915
適合率(precision) = 60.011
再現率(recall) = 45.026
F1値(F1-score) = 51.449


# ランダムフォレストのハイパーパラメータ探索
ランダムサーチでランダムフォレストのハイパーパラメータを探索する。

In [20]:
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

X_train = X_df_train.values
y_train = y_df_train.values

params = {"criterion": ["gini", "entropy"], "max_depth": stats.expon(scale = 23),
         "min_samples_split": range(8, 15, 1),
         "min_samples_leaf": range(2, 5, 1)}
clf = RandomizedSearchCV(RandomForestClassifier(n_estimators = 100, random_state = 1234, n_jobs = 6),
                        params, cv = 5, n_jobs = 6, n_iter = 20, random_state = 1234)
clf.fit(X_train, y_train)
print("score={:.3f}".format(clf.best_score_ * 100))
print("best_params={:}".format(clf.best_params_))

score=67.272
best_params={'criterion': 'gini', 'max_depth': 28.02039107282901, 'min_samples_leaf': 3, 'min_samples_split': 13}


In [21]:
# 最適なハイパーパラメータを用いてランダムフォレストで学習を行う
clf = RandomForestClassifier(n_estimators = 100, random_state = 1234,
                            criterion = "gini", max_depth = 28.02039107282901,
                            min_samples_split = 13, min_samples_leaf = 3, n_jobs = 6)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=28.02039107282901, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=3,
                       min_samples_split=13, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=6, oob_score=False,
                       random_state=1234, verbose=0, warm_start=False)

In [22]:
# 検証データで汎化性能を確認する
X_test = X_df_test.values
y_test = y_df_test.values

y_est = clf.predict(X_test)
calc_general_perf(y_est, y_test)

正解率 = 67.512
適合率(precision) = 62.598
再現率(recall) = 47.209
F1値(F1-score) = 53.826


# 説明変数の重要度を算出

In [23]:
df_feature = pd.DataFrame({'RF説明変数重要度': []})
for i in range(X_df_train.shape[1]):
    df_feature.loc[X_df_train.columns[i]] = {'RF説明変数重要度': clf.feature_importances_[i]}
    
display(df_feature.sort_values('RF説明変数重要度', ascending = False))

Unnamed: 0,RF説明変数重要度
usd_goal_ln,0.341318
period,0.197747
name_length,0.165547
main_category_Music,0.030794
main_category_Theater,0.027559
main_category_Technology,0.027202
main_category_Fashion,0.019881
main_category_Comics,0.01906
main_category_Food,0.014681
currency_USD,0.014072


以下の3つの変数の合計で重要度の70％を占める。  
3つの変数でほぼ説明ができる？
* usd_goal_ln
* period
* name_length

## 主成分分析の寄与率とランダムフォレストの説明変数重要度と比較する

In [24]:
X_train = np.array(X_df_train)
X_train

array([[-0.16995903, -0.31047727,  1.42930202, ...,  0.        ,
         0.        ,  1.        ],
       [-1.5401285 , -1.09782535, -1.76867463, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.92602104,  0.87054484,  0.30060438, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.16472064,  1.97283214,  0.92765863, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.29479584, -0.31047727, -0.82809326, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.02049305, -0.6254165 ,  0.99036405, ...,  0.        ,
         0.        ,  1.        ]])

In [25]:
pca = PCA(n_components = 31)
pca.fit(X_train)

PCA(copy=True, iterated_power='auto', n_components=31, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [29]:
df_feature = pd.DataFrame({'RF説明変数重要度': [], '寄与率': []})

print("合計=", pca.explained_variance_ratio_.sum())
for i in range(X_df_train.shape[1]):
    df_feature.loc[X_df_train.columns[i]] = {'RF説明変数重要度': clf.feature_importances_[i],
                                             '寄与率': pca.explained_variance_ratio_[i]}
    
display(df_feature.sort_values('RF説明変数重要度', ascending = False))

合計= 1.0


Unnamed: 0,RF説明変数重要度,寄与率
usd_goal_ln,0.341318,0.280574
period,0.197747,0.233804
name_length,0.165547,0.191882
main_category_Music,0.030794,0.003273
main_category_Theater,0.027559,0.002273
main_category_Technology,0.027202,0.002572
main_category_Fashion,0.019881,0.006334
main_category_Comics,0.01906,0.013731
main_category_Food,0.014681,0.005735
currency_USD,0.014072,0.000335


ランダムフォレストの説明変数重要度Top3は、主成分分析の第3主成分までと一致している。  
第4主成分以降は必ずしも一致しない。

# アダブーストのハイパーパラメータ探索
ランダムサーチでアダブーストのハイパーパラメータを探索する

In [32]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy import stats

X_train = X_df_train.values
y_train = y_df_train.values

# 決定木のハイパーパラメータをランダムサーチで探索する
params = {"base_estimator__criterion": ["gini", "entropy"], 
          "base_estimator__max_depth": stats.expon(scale = 28),
          "base_estimator__min_samples_split": range(10, 17, 1), 
          "base_estimator__min_samples_leaf": range(2, 6, 1)}
clf = RandomizedSearchCV(AdaBoostClassifier(
    base_estimator = DecisionTreeClassifier(random_state = 1234),
    n_estimators = 100, random_state = 1234), params, cv = 5, n_jobs = 6)
clf.fit(X_train, y_train)
print("score={:.3f}".format(clf.best_score_ * 100))
print("best_params={:}".format(clf.best_params_))

score=65.987
best_params={'base_estimator__criterion': 'gini', 'base_estimator__max_depth': 7.163517498109106, 'base_estimator__min_samples_leaf': 2, 'base_estimator__min_samples_split': 13}


In [34]:
# 最適なハイパーパラメータを用いて学習を行う
clf = AdaBoostClassifier(DecisionTreeClassifier(random_state = 1234, max_depth = 7.163517498109106,
                                               criterion = "gini",
                                               min_samples_split = 13, min_samples_leaf = 2),
                        n_estimators = 100, random_state = 1234)
clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(class_weight=None,
                                                         criterion='gini',
                                                         max_depth=7.163517498109106,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=2,
                                                         min_samples_split=13,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort=False,
                                                         random_state=1234,
            

In [35]:
# 検証データで汎化性能を確認する
X_test = X_df_test.values
y_test = y_df_test.values

y_est = clf.predict(X_test)

calc_general_perf(y_est, y_test)

正解率 = 66.511
適合率(precision) = 59.879
再現率(recall) = 50.028
F1値(F1-score) = 54.512


# k-最近傍法で分類を行う
* k個の多数決で決定する手法であるため、kはハイパーパラメータとなる
* 最初にkをグリッドサーチで探索し、最適に近いパラメータを絞り込む
* グリッドサーチで絞り込んだ最適値を中央値とした乱数生成でランダムサーチを行い、kの最適値を確定する

In [41]:
from sklearn.model_selection import GridSearchCV

X_train = X_df_train.values
y_train = y_df_train.values

# まずはグリッドサーチで最適値を探す
params = {'n_neighbors': [100, 200, 400]}
clf = GridSearchCV(KNeighborsClassifier(n_jobs = 6), params, cv = 5, n_jobs = 6)
clf.fit(X_train, y_train)
print("score={:.3f}".format(clf.best_score_ * 100))
print("best_params={:}".format(clf.best_params_))

score=66.482
best_params={'n_neighbors': 200}


In [44]:
# n_neighbors = 200を中央値とする乱数生成でランダムサーチで最適値を探す
from sklearn.model_selection import RandomizedSearchCV
from scipy import stats

params = {'n_neighbors': stats.randint(190, 210)}
clf = RandomizedSearchCV(KNeighborsClassifier(n_jobs = 6), params, cv = 5, n_jobs = 6)
clf.fit(X_train, y_train)
print("score={:.3f}".format(clf.best_score_ * 100))
print("best_params={:}".format(clf.best_params_))

score=66.495
best_params={'n_neighbors': 193}


In [46]:
# n_neighborsの最適値を使用して、訓練データを学習させる
clf = KNeighborsClassifier(n_neighbors = 193, n_jobs = 3)
clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=3, n_neighbors=193, p=2,
                     weights='uniform')

In [47]:
# 検証データで汎化性能を確認する
X_test = X_df_test.values
y_test = y_df_test.values

y_est = clf.predict(X_test)
calc_general_perf(y_est, y_test)

正解率 = 66.799
適合率(precision) = 61.443
再現率(recall) = 46.243
F1値(F1-score) = 52.771


## 全結合型ニューラルネットワークのハイパーパラメータチューニング
計算時間がかかるので、グリッドサーチで行う。

In [48]:
# データの準備
from tensorflow.keras.utils import to_categorical

X_train = X_df_train.values
y_train = y_df_train.values
X_test = X_df_test.values
y_test = y_df_test.values

# one-hotベクトルに変換
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

In [53]:
from tensorflow.compat.v1.keras import Sequential
from tensorflow.compat.v1.keras.layers import Dense, Dropout, Activation
from tensorflow.compat.v1.keras.optimizers import SGD, RMSprop, Adam
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

input_dim = X_df_train.shape[1]

# ニューラルネットワークモデル構築関数
def NNmodel(activation = 'relu', optimizer = 'sgd', out_dim = 5, input_dim = input_dim):
    model = Sequential()
    # 入力層の次元はX_trainのカラム数に合わせること
    model.add(Dense(out_dim, activation = activation, input_dim = input_dim))
    model.add(Dense((out_dim - 1), activation = activation))
    model.add(Dense(2, activation='softmax'))#最終層のactivationは変更しないこと

    # 最適化手法
    # ADAMまたはRMSPropを使えるようにしておく
    #sgd = SGD(lr=0.01, momentum=0.9, nesterov=False)
    if optimizer == 'rmsprop':
        selected_optimizer = RMSprop(lr=0.01)
    elif optimizer == 'adam':
        selected_optimizer = Adam(lr=0.01)
    elif optimizer == 'sgd':
        selected_optimizer = SGD(lr=0.01, momentum=0.9, nesterov=False)
        
    model.compile(loss = 'categorical_crossentropy',
                  optimizer = selected_optimizer,
                  metrics = ['accuracy'])
    return model

model = KerasClassifier(build_fn = NNmodel, verbose = 1)
param_grid = {
    'activation': ['relu', 'tanh'],
    'out_dim': [5, 10],
    'nb_epoch': [20, 40],
    'batch_size': [10, 20]
}
clf = GridSearchCV(estimator = model, param_grid = param_grid, cv = 5, n_jobs = 3)
clf.fit(X_train, y_train)
print("score={:.3f}".format(clf.best_score_ * 100))
print("best_params={:}".format(clf.best_params_))



Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
score=66.159
best_params={'activation': 'relu', 'batch_size': 10, 'nb_epoch': 20, 'out_dim': 10}


In [56]:
# 最適なハイパーパラメータで汎化性能を確認する
best_estimator = clf.best_estimator_

y_pred = best_estimator.predict(X_test)
y_test_check = y_df_test.values
calc_general_perf(y_pred, y_test_check)

正解率 = 66.532
適合率(precision) = 60.516
再現率(recall) = 47.649
F1値(F1-score) = 53.317


# 汎化性能まとめ

## DAY3の結果
| 分類手法 | 正解率 | 適合率 | 再現率 | F1値 |
| :---: | :---: | :--- | :---: | :---: |
| ロジスティック回帰 | 65.439 | 59.945 | 41.696 | 49.182 |
| 決定木 | 65.709 | 59.601 | 45.030 | 51.301 |
| ランダムフォレスト | 67.028 | 61.789 | 46.638 | 53.155 |
| アダブースト | 66.500 | 59.912 | 49.806 | 54.806 |
| 全結合ニューラルネットワーク | 66.252 | 59.353 | 50.325 | 54.467 |

## DAY4の結果
正解率はどの手法でもDAY3より向上した。  
**太字** がDAY3よりも向上した指標となる。

| 分類手法 | 正解率 | 適合率 | 再現率 | F1値 |
| :---: | :---: | :--- | :---: | :---: |
| ロジスティック回帰 | **65.735** | **60.379** | **42.384** | **49.806** |
| 決定木 | **65.915** | **60.011** | 45.026 | **51.449** |
| ランダムフォレスト | **67.512** | **62.598** | **47.209** | **53.826** |
| アダブースト | **66.511** | 59.879 | **50.028** | 54.512 |
| 全結合ニューラルネットワーク | **66.532** | **60.516** | 47.649 | 53.317 |
| k-最近傍法 | 66.799 | 61.443 | 46.243 | 52.771 |
