In [14]:
# モジュールのインポート
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
%matplotlib inline

from sklearn.model_selection import KFold

import lightgbm as lgb
import catboost as cb
import xgboost as xgb
from sklearn import datasets
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split


# data_road
train_data = pd.read_csv("../data/train.tsv", delimiter='\t')
test_data = pd.read_csv("../data/test.tsv", delimiter='\t')
x_1, y_1 = train_data.iloc[:, 3:22], train_data["LeagueIndex"]-1
x_2, y_2 = train_data.drop("ComplexUnitsMade", axis=1).iloc[:, 3:22], train_data["LeagueIndex"]-1
x_test= test_data.iloc[:, 2:21]

In [17]:
def catboost_train_1(X_train_cv, y_train_cv, X_eval_cv, y_eval_cv, X_test, y_test):
    # データを格納する
    # 学習用
    CatBoost_train = cb.Pool(X_train_cv, label=y_train_cv)
    # 検証用
    CatBoost_eval = cb.Pool(X_eval_cv, label=y_eval_cv)

    # パラメータを設定
    params = {
        'loss_function': 'MultiClass',    # 多値分類問題
        'num_boost_round': 200,          # 学習の回数
        'early_stopping_rounds': 10       # アーリーストッピングの回数
    }

    # 学習
    catb = cb.CatBoost(params)
    catb.fit(CatBoost_train, eval_set=[CatBoost_eval], verbose=False)

    # テストデータで予測
    y_pred = catb.predict(X_test, prediction_type='Probability')
    y_pred_max = np.argmax(y_pred, axis=1)

    # Accuracy の計算
    accuracy = sum(y_test == y_pred_max) / len(y_test)
    print('CatBoost Accuracy:', accuracy)

    return(catb, y_pred_max, accuracy)

def catboost_train_2(X_train_cv, y_train_cv, X_eval_cv, y_eval_cv, X_test, y_test):
    # データを格納する
    # 学習用
    CatBoost_train = cb.Pool(X_train_cv, label=y_train_cv)
    # 検証用
    CatBoost_eval = cb.Pool(X_eval_cv, label=y_eval_cv)

    # パラメータを設定
    params = {
        'loss_function': 'MultiClass',    # 多値分類問題
        'num_boost_round': 200,          # 学習の回数
        'early_stopping_rounds': 10       # アーリーストッピングの回数
    }

    # 学習
    catb = cb.CatBoost(params)
    catb.fit(CatBoost_train, eval_set=[CatBoost_eval], verbose=False)

    # テストデータで予測
    y_pred = catb.predict(X_test, prediction_type='Probability')
    y_pred_max = np.argmax(y_pred, axis=1)

    # Accuracy の計算
    accuracy = sum(y_test == y_pred_max) / len(y_test)
    print('CatBoost Accuracy:', accuracy)

    return(catb, y_pred_max, accuracy)

In [21]:
# Voting
# 各5つのモデルを保存するリストの初期化
catb_models_1 = []
catb_models_2 = []
# 各5つのモデルの正答率を保存するリストの初期化
catb_accuracies_1 = []
catb_accuracies_2 = []
# 学習のカウンター
loop_counts = 1

features = ['Age', 'HoursPerWeek','TotalHours', 'APM']


# 学習データとテストデータに分ける
X_train, X_test, y_train, y_test = train_test_split(x_1, y_1,
                                                    test_size=0.2,
                                                    random_state=1,
                                                    stratify=y_1)
# 学習データとテストデータに分ける
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(x_2, y_2,
                                                    test_size=0.2,
                                                    random_state=1,
                                                    stratify=y_2)

# 各5つのモデルの予測を保存する配列の初期化（5seed*5cv*3モデル）
first_preds = np.zeros((len(y_test), 5*5*2))


# ５つのシード値で予測
for seed_no in range(5):

    # 学習データの数だけの数列（0行から最終行まで連番）
    row_no_list = list(range(len(y_train)))

    # KFoldクラスをインスタンス化（これを使って5分割する）
    K_fold = StratifiedKFold(n_splits=5, shuffle=True,  random_state= seed_no)

    # KFoldクラスで分割した回数だけ実行（ここでは5回）
    for train_cv_no, eval_cv_no in K_fold.split(row_no_list, y_train):
        # ilocで取り出す行を指定
        X_train_cv = X_train.iloc[train_cv_no, :]
        y_train_cv = pd.Series(y_train).iloc[train_cv_no]
        X_eval_cv = X_train.iloc[eval_cv_no, :]
        y_eval_cv = pd.Series(y_train).iloc[eval_cv_no]
         # ilocで取り出す行を指定
        X_train_cv_1 = X_train_1.iloc[train_cv_no, :]
        y_train_cv_1 = pd.Series(y_train_1).iloc[train_cv_no]
        X_eval_cv_1 = X_train_1.iloc[eval_cv_no, :]
        y_eval_cv_1 = pd.Series(y_train_1).iloc[eval_cv_no]


        # CatBoostの学習を実行
        catb_1, catb_pred_1, catb_accuracy_1 = catboost_train_1(X_train_cv, y_train_cv,
                                                        X_eval_cv, y_eval_cv,
                                                        X_test, y_test)

        # CatBoostの学習を実行
        catb_2, catb_pred_2, catb_accuracy_2 = catboost_train_2(X_train_cv_1, y_train_cv_1,
                                                        X_eval_cv_1, y_eval_cv_1,
                                                        X_test_1, y_test_1)

        # 学習が終わったモデルをリストに入れておく

        catb_models_1.append(catb_1)
        catb_models_2.append(catb_2)

        first_preds[:, loop_counts-1 ] = catb_pred_1
        first_preds[:, loop_counts-1 + 25] = catb_pred_2


        # 実行回数のカウント
        loop_counts += 1

CatBoost Accuracy: 0.42058823529411765
CatBoost Accuracy: 0.4411764705882353
CatBoost Accuracy: 0.4235294117647059
CatBoost Accuracy: 0.4264705882352941
CatBoost Accuracy: 0.43823529411764706
CatBoost Accuracy: 0.4470588235294118
CatBoost Accuracy: 0.4323529411764706
CatBoost Accuracy: 0.45588235294117646
CatBoost Accuracy: 0.4235294117647059
CatBoost Accuracy: 0.4147058823529412
CatBoost Accuracy: 0.4441176470588235
CatBoost Accuracy: 0.4411764705882353
CatBoost Accuracy: 0.4117647058823529
CatBoost Accuracy: 0.4088235294117647
CatBoost Accuracy: 0.4264705882352941
CatBoost Accuracy: 0.4294117647058823
CatBoost Accuracy: 0.4147058823529412
CatBoost Accuracy: 0.4088235294117647
CatBoost Accuracy: 0.4088235294117647
CatBoost Accuracy: 0.4176470588235294
CatBoost Accuracy: 0.4264705882352941
CatBoost Accuracy: 0.4088235294117647
CatBoost Accuracy: 0.39705882352941174
CatBoost Accuracy: 0.4441176470588235
CatBoost Accuracy: 0.4323529411764706
CatBoost Accuracy: 0.42058823529411765
CatBoos

In [22]:
# 単独のモデルでの、テストデータの正答率
print('CatBoost Accuracy: ', np.array(catb_accuracies_1).mean())
print('CatBoost std: ', np.array(catb_accuracies_1).std())

print('CatBoost Accuracy: ', np.array(catb_accuracies_2).mean())
print('CatBoost std: ', np.array(catb_accuracies_2).std())

CatBoost Accuracy:  nan
CatBoost std:  nan
CatBoost Accuracy:  nan
CatBoost std:  nan


  print('CatBoost Accuracy: ', np.array(catb_accuracies_1).mean())
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)
  print('CatBoost Accuracy: ', np.array(catb_accuracies_2).mean())
