In [None]:
# モジュールのインポート
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
%matplotlib inline

from sklearn.model_selection import KFold

import lightgbm as lgb
import catboost as cb
import xgboost as xgb
from sklearn import datasets
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split


# data_road
train_data = pd.read_csv("../data/train.tsv", delimiter='\t')
test_data = pd.read_csv("../data/test.tsv", delimiter='\t')
x, y = train_data.iloc[:, 3:21], train_data["LeagueIndex"]-1
x_train, x_val, y_train, y_val = train_test_split(x, y)
x_test= test_data.iloc[:, 2:20]

In [None]:
# カテゴリー変数
categorical_features = ['sepal_cat', 'petal_cat']


# 5-fold CVモデルの学習
# 5つのモデルを保存するリストの初期化
models = []

# 学習データの数だけの数列（0行から最終行まで連番）
row_no_list = list(range(len(y)))

# KFoldクラスをインスタンス化（これを使って5分割する）
K_fold = StratifiedKFold(n_splits=5, shuffle=True,  random_state=42)

# KFoldクラスで分割した回数だけ実行（ここでは5回）
for train_cv_no, eval_cv_no in K_fold.split(row_no_list, y):
    # ilocで取り出す行を指定
    X_train_cv = x.iloc[train_cv_no, :]
    y_train_cv = pd.Series(y).iloc[train_cv_no]
    X_eval_cv = x.iloc[eval_cv_no, :]
    y_eval_cv = pd.Series(y).iloc[eval_cv_no]

    # 学習用
    lgb_train = lgb.Dataset(X_train_cv, y_train_cv)
    # 検証用
    lgb_eval = lgb.Dataset(X_eval_cv, y_eval_cv)

    # パラメータを設定
    params = {
              'objective': 'multiclass',      # 目的関数：多値分類、マルチクラス分類
              'metric': 'multi_logloss',      # 分類モデルの性能を測る指標
              'num_class': 8,                 # 目的変数のクラス数
             }

    # 学習
    evaluation_results = {}                                     # 学習の経過を保存する箱
    model = lgb.train(params,                                   # 上記で設定したパラメータ
                      lgb_train,
                      valid_names=['train', 'valid'],           # 学習経過で表示する名称
                      valid_sets=[lgb_train, lgb_eval],         # モデル検証のデータセット
                      evals_result=evaluation_results)

    # テストデータで予測する
    y_pred = model.predict(x_val, num_iteration=model.best_iteration)
    y_pred_max = np.argmax(y_pred, axis=1)

    # Accuracy を計算する
    accuracy = sum(y_val == y_pred_max) / len(y_val)
    print('accuracy:', accuracy)

    # 学習が終わったモデルをリストに入れておく
    models.append(model)

In [None]:
# 学習過程の可視化
plt.plot(evaluation_results['train']['multi_logloss'], label='train')
plt.plot(evaluation_results['valid']['multi_logloss'], label='valid')
plt.ylabel('Log loss')
plt.xlabel('Boosting round')
plt.title('Training performance')
plt.legend()
plt.savefig('cv_logloss.jpg')
plt.show()

In [15]:
params = {'loss_function': 'MultiClass'}
catb_model = cb.CatBoost(params)
k = 5
cv = KFold(n_splits=k, random_state=0, shuffle=True)
accuracy_list = []
for train_index, test_index in cv.split(x):
    print(train_index)
    # get train and test data
    train_x, test_x = x.iloc[train_index], x.iloc[test_index]
    train_y, test_y = y[train_index], y[test_index]
    # fit model
    catb_model.fit(x_train, y_train)
    # predict test data
    pred_y = catb_model.predict(test_x)
    pred_y_max = np.argmax(pred_y, axis=1)
    # loss
    score = accuracy_score(test_y, pred_y_max)
    accuracy_list.append(score)
    print('score:{0:.4f}'.format(score))

print(f"MSE({k}FoldCV): {np.mean(accuracy_list)}")
print(f"std: {np.std(accuracy_list)}")



[   0    1    2 ... 1694 1695 1696]
Learning rate set to 0.080104
0:	learn: 1.9995066	total: 63.6ms	remaining: 1m 3s
1:	learn: 1.9463151	total: 69.2ms	remaining: 34.5s
2:	learn: 1.8947269	total: 74ms	remaining: 24.6s
3:	learn: 1.8526974	total: 79ms	remaining: 19.7s
4:	learn: 1.8082643	total: 83.6ms	remaining: 16.6s
5:	learn: 1.7704926	total: 88.2ms	remaining: 14.6s
6:	learn: 1.7351764	total: 92.8ms	remaining: 13.2s
7:	learn: 1.7007734	total: 97.3ms	remaining: 12.1s
8:	learn: 1.6727437	total: 102ms	remaining: 11.2s
9:	learn: 1.6383035	total: 107ms	remaining: 10.6s
10:	learn: 1.6087862	total: 111ms	remaining: 10s
11:	learn: 1.5798991	total: 116ms	remaining: 9.55s
12:	learn: 1.5553060	total: 121ms	remaining: 9.16s
13:	learn: 1.5316376	total: 125ms	remaining: 8.8s
14:	learn: 1.5112511	total: 129ms	remaining: 8.47s
15:	learn: 1.4887303	total: 134ms	remaining: 8.22s
16:	learn: 1.4713567	total: 138ms	remaining: 8s
17:	learn: 1.4504388	total: 143ms	remaining: 7.81s
18:	learn: 1.4352431	total: 

In [16]:
import csv

cat_pred = catb_model.predict(x_test)
cat_pred_sca = np.argmax(cat_pred, axis=1)
with open("../result/catboost_test.csv", "w") as csv_file:
    for pred_low, test_id in zip(cat_pred_sca, test_data['Unnamed: 0']):
        writer = csv.writer(csv_file)
        low = [test_id, int(pred_low + 1)]
        writer.writerow(low)

    csv_file.close()

In [None]:
param = {'task': 'train',                # 学習、トレーニング ⇔　予測predict
          'boosting_type': 'gbdt',        # 勾配ブースティング
          'objective': 'multiclass',      # 目的関数：多値分類、マルチクラス分類
          'metric': 'multi_logloss',      # 分類モデルの性能を測る指標
          'num_class': 8,                 # 目的変数のクラス数
         }

In [14]:
k = 5
cv = KFold(n_splits=k, random_state=0, shuffle=True)
accuracy_list = []
for train_index, test_index in cv.split(x):
    print(train_index)
    # get train and test data
    train_x, test_x = x.iloc[train_index], x.iloc[test_index]
    train_y, test_y = y[train_index], y[test_index]

    dtrain = lgb.Dataset(train_x, train_y, free_raw_data=False)
    dtest = lgb.Dataset(test_x)
    # fit model
    lgb_model = lgb.train(param,                                   # 上記で設定したパラメータ
                  dtrain)
    # predict test data
    pred_y = lgb_model.predict(test_x)
    pred_y_max = np.argmax(pred_y, axis=1)
    # loss
    score = accuracy_score(test_y, pred_y_max)
    accuracy_list.append(score)
    print('score:{0:.4f}'.format(score))

print(f"MSE({k}FoldCV): {np.mean(accuracy_list)}")
print(f"std: {np.std(accuracy_list)}")

[100]	valid's multi_logloss: 0.012269
[110]	valid's multi_logloss: 0.00791872
[120]	valid's multi_logloss: 0.00513383
[130]	valid's multi_logloss: 0.00332584
[140]	valid's multi_logloss: 0.00216991
[150]	valid's multi_logloss: 0.00143269
[160]	valid's multi_logloss: 0.00094537
[170]	valid's multi_logloss: 0.000618455
[180]	valid's multi_logloss: 0.000401945
[190]	valid's multi_logloss: 0.000266626
[200]	valid's multi_logloss: 0.000179561
[210]	valid's multi_logloss: 0.000121062
[220]	valid's multi_logloss: 8.38575e-05
[230]	valid's multi_logloss: 6.04581e-05
[240]	valid's multi_logloss: 4.54794e-05
[250]	valid's multi_logloss: 3.64348e-05
[260]	valid's multi_logloss: 3.08001e-05
[270]	valid's multi_logloss: 2.70508e-05
[280]	valid's multi_logloss: 2.43361e-05
[290]	valid's multi_logloss: 2.21969e-05
[300]	valid's multi_logloss: 2.05101e-05
[310]	valid's multi_logloss: 1.91149e-05
[320]	valid's multi_logloss: 1.80336e-05
[330]	valid's multi_logloss: 1.7087e-05
[340]	valid's multi_loglos