In [38]:
# import library
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import time
from sklearn.model_selection import KFold
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd

In [65]:
# data_road
train_data = pd.read_csv("../data/train.tsv", delimiter='\t')
test_data = pd.read_csv("../data/test.tsv", delimiter='\t')
x, y = train_data.iloc[:, 3:22], train_data["LeagueIndex"]-1
x_train, x_val, y_train, y_val = train_test_split(x, y)
x_test= test_data.iloc[:, 2:21]

In [40]:
start_time = time.time()

dtrain = xgb.DMatrix(x_train, label=y_train)
dvalid = xgb.DMatrix(x_val, label=y_val)
dtest = xgb.DMatrix(x_test)

xgb_params = {'objective': 'multi:softmax', 'num_class': 8}

evals = [(dtrain, 'train'), (dvalid, 'vali')]

xgb_model = xgb.train(xgb_params,
                      dtrain,
                      evals=evals,
                      )

print('time:{}'.format(time.time()-start_time))

[0]	train-mlogloss:1.69619	vali-mlogloss:1.87588
[1]	train-mlogloss:1.45451	vali-mlogloss:1.74494
[2]	train-mlogloss:1.26938	vali-mlogloss:1.66721
[3]	train-mlogloss:1.12187	vali-mlogloss:1.60728
[4]	train-mlogloss:1.01341	vali-mlogloss:1.55670
[5]	train-mlogloss:0.91535	vali-mlogloss:1.51581
[6]	train-mlogloss:0.82879	vali-mlogloss:1.48947
[7]	train-mlogloss:0.74896	vali-mlogloss:1.46545
[8]	train-mlogloss:0.69265	vali-mlogloss:1.44883
[9]	train-mlogloss:0.64568	vali-mlogloss:1.43354
time:0.14955592155456543


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [41]:
from sklearn.metrics import accuracy_score

pred = xgb_model.predict(dvalid)
score = accuracy_score(y_val, pred)
print('score:{0:.4f}'.format(score))

score:0.3859


In [42]:
start = time.time()

lgb_train = lgb.Dataset(x_train, y_train)
lgb_val = lgb.Dataset(x_val, y_val, reference=lgb_train)
lgb_test = lgb.Dataset(x_test)

params = {'objective' : 'multiclass','num_class' : 8}

lgb_model = lgb.train(params=params,
                        train_set=lgb_train,
                        valid_sets=[lgb_train, lgb_val],
                        valid_names=['Train', 'Valid'])

print('elapsed_time:{}'.format(time.time()-start))

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3804
[LightGBM] [Info] Number of data points in the train set: 1272, number of used features: 18
[LightGBM] [Info] Start training from score -2.914239
[LightGBM] [Info] Start training from score -2.235691
[LightGBM] [Info] Start training from score -1.870231
[LightGBM] [Info] Start training from score -1.392604
[LightGBM] [Info] Start training from score -1.437919
[LightGBM] [Info] Start training from score -1.796488
[LightGBM] [Info] Start training from score -4.057303
[LightGBM] [Info] Start training from score -4.152613
[1]	Train's multi_logloss: 1.56793	Valid's multi_logloss: 1.66311
[2]	Train's multi_logloss: 1.43366	Valid's multi_logloss: 1.60153
[3]	Train's multi_logloss: 1.32371	Valid's multi_logloss: 1.56154
[4]	Train's multi_logloss: 1.22993	Valid's multi_logloss: 1.5224
[5]	Train's multi_logloss: 1.14771	Valid's multi_logloss: 1.49126
[6]	Train's multi_logloss: 1.07235	Valid's multi_loglos

In [43]:
from sklearn.metrics import accuracy_score

lgb_pred = lgb_model.predict(x_val)
lgb_pred_low = np.argmax(lgb_pred, axis=1)
score = accuracy_score(y_val, lgb_pred_low)
print('score:{0:.4f}'.format(score))

score:0.4212


In [44]:
import csv

y_pred = lgb_model.predict(x_test)
y_pred_max = np.argmax(y_pred, axis=1)
with open("../result/lightgbm_test.csv", "w") as csv_file:
    for pred_low, test_id in zip(y_pred_max, test_data['Unnamed: 0']):
        writer = csv.writer(csv_file)
        low = [test_id, int(pred_low+1)]
        writer.writerow(low)

    csv_file.close()


In [45]:
cat_pred = lgb_model.predict(x_val)
cat_pred_low = np.argmax(lgb_pred, axis=1)
score = accuracy_score(y_val, cat_pred_low)
print('score:{0:.4f}'.format(score))

score:0.4212


In [46]:
start = time.time()

# 専用の型に変換
catb_train = cb.Pool(x_train, label=y_train)
catb_valid = cb.Pool(x_val, label=y_val)
catb_test = cb.Pool(x_test)

# パラメータを設定
params = {'loss_function': 'MultiClass'}

# 学習
catb_model = cb.CatBoost(params)
catb_model.fit(catb_train,
               eval_set=[catb_valid],
               verbose=False)

print('elapsed_time:{}'.format(time.time()-start))

elapsed_time:4.55399489402771


In [47]:
catb_model = cb.CatBoost(params)

In [63]:
k = 5
cv = KFold(n_splits=k, random_state=0, shuffle=True)
accuracy_list = []
for train_index, test_index in cv.split(x):
    print(train_index)
    # get train and test data
    train_x, test_x = x.iloc[train_index], x.iloc[test_index]
    train_y, test_y = y[train_index], y[test_index]
    # fit model
    catb_model.fit(x_train, y_train)
    # predict test data
    pred_y = catb_model.predict(test_x)
    pred_y_max = np.argmax(pred_y, axis=1)
    # loss
    score = accuracy_score(test_y, pred_y_max)
    accuracy_list.append(score)
    print('score:{0:.4f}'.format(score))

print(f"MSE({k}FoldCV): {np.mean(accuracy_list)}")
print(f"std: {np.std(accuracy_list)}")

[   0    1    2 ... 1694 1695 1696]


AttributeError: 'Booster' object has no attribute 'train'

In [None]:
import csv

cat_pred = catb_model.predict(catb_test)
cat_pred_sca = np.argmax(cat_pred, axis=1)
with open("../result/catboost_test.csv", "w") as csv_file:
    for pred_low, test_id in zip(cat_pred_sca, test_data['Unnamed: 0']):
        writer = csv.writer(csv_file)
        low = [test_id, int(pred_low+1)]
        writer.writerow(low)

    csv_file.close()

In [49]:
def k_fold_cv(x, y, model, k):
    cv = KFold(n_splits=k, random_state=0, shuffle=True)
    mse_list = []
    for train_index, test_index in cv.split(x):
        # get train and test data
        train_x, test_x = x.iloc[train_index], x.iloc[test_index]
        train_y, test_y = y[train_index], y[test_index]
        # fit model
        model.fit(x_train, y_train)
        # predict test data
        pred_y = model.predict(test_x)
        # loss
        mse = np.mean((pred_y - test_y)**2)
        mse_list.append(mse)
    print(f"MSE({k}FoldCV): {np.mean(mse_list)}")
    print(f"std: {np.std(mse_list)}")

In [50]:
k_fold_cv(x, y, cb.CatBoost(params), 10)

Learning rate set to 0.080104
0:	learn: 2.0055136	total: 12.4ms	remaining: 12.4s
1:	learn: 1.9531187	total: 17.9ms	remaining: 8.92s
2:	learn: 1.9027107	total: 22.5ms	remaining: 7.48s
3:	learn: 1.8599557	total: 27.5ms	remaining: 6.84s
4:	learn: 1.8193125	total: 32.1ms	remaining: 6.39s
5:	learn: 1.7797707	total: 36.7ms	remaining: 6.08s
6:	learn: 1.7438664	total: 41.1ms	remaining: 5.82s
7:	learn: 1.7083342	total: 45.3ms	remaining: 5.61s
8:	learn: 1.6820964	total: 49.3ms	remaining: 5.43s
9:	learn: 1.6484629	total: 53.2ms	remaining: 5.26s
10:	learn: 1.6211164	total: 57.2ms	remaining: 5.14s
11:	learn: 1.5933619	total: 61.2ms	remaining: 5.04s
12:	learn: 1.5702986	total: 65.2ms	remaining: 4.95s
13:	learn: 1.5488700	total: 69.4ms	remaining: 4.89s
14:	learn: 1.5272725	total: 73.3ms	remaining: 4.81s
15:	learn: 1.5073898	total: 77.5ms	remaining: 4.76s
16:	learn: 1.4890911	total: 81.6ms	remaining: 4.71s
17:	learn: 1.4675770	total: 85.6ms	remaining: 4.67s
18:	learn: 1.4517997	total: 90.1ms	remaining

ValueError: operands could not be broadcast together with shapes (170,8) (170,) 