In [2]:
# import library
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import time
from sklearn.model_selection import KFold
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd

In [3]:
# data_road
train_data = pd.read_csv("../data/train.tsv", delimiter='\t')
test_data = pd.read_csv("../data/test.tsv", delimiter='\t')
x, y = train_data.iloc[:, 3:21], train_data["LeagueIndex"]-1
x_train, x_val, y_train, y_val = train_test_split(x, y)
x_test= test_data.iloc[:, 2:20]

In [4]:
start_time = time.time()

dtrain = xgb.DMatrix(x_train, label=y_train)
dvalid = xgb.DMatrix(x_val, label=y_val)
dtest = xgb.DMatrix(x_test)

xgb_params = {'objective': 'multi:softmax', 'num_class': 8}

evals = [(dtrain, 'train'), (dvalid, 'vali')]

xgb_model = xgb.train(xgb_params,
                      dtrain,
                      evals=evals,
                      )

print('time:{}'.format(time.time()-start_time))

[0]	train-mlogloss:1.69932	vali-mlogloss:1.85520
[1]	train-mlogloss:1.45983	vali-mlogloss:1.73357
[2]	train-mlogloss:1.28681	vali-mlogloss:1.64832
[3]	train-mlogloss:1.14916	vali-mlogloss:1.58188
[4]	train-mlogloss:1.02678	vali-mlogloss:1.54110
[5]	train-mlogloss:0.94172	vali-mlogloss:1.50049
[6]	train-mlogloss:0.85502	vali-mlogloss:1.47051
[7]	train-mlogloss:0.78824	vali-mlogloss:1.44621
[8]	train-mlogloss:0.73124	vali-mlogloss:1.42759
[9]	train-mlogloss:0.66798	vali-mlogloss:1.41847
time:0.16503310203552246


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [5]:
from sklearn.metrics import accuracy_score

pred = xgb_model.predict(dvalid)
score = accuracy_score(y_val, pred)
print('score:{0:.4f}'.format(score))

score:0.3976


In [6]:
start = time.time()

lgb_train = lgb.Dataset(x_train, y_train)
lgb_val = lgb.Dataset(x_val, y_val, reference=lgb_train)
lgb_test = lgb.Dataset(x_test)

params = {'objective' : 'multiclass','num_class' : 8}

lgb_model = lgb.train(params=params,
                        train_set=lgb_train,
                        valid_sets=[lgb_train, lgb_val],
                        valid_names=['Train', 'Valid'])

print('elapsed_time:{}'.format(time.time()-start))

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3798
[LightGBM] [Info] Number of data points in the train set: 1272, number of used features: 18
[LightGBM] [Info] Start training from score -2.914239
[LightGBM] [Info] Start training from score -2.221092
[LightGBM] [Info] Start training from score -1.830226
[LightGBM] [Info] Start training from score -1.405343
[LightGBM] [Info] Start training from score -1.451252
[LightGBM] [Info] Start training from score -1.787054
[LightGBM] [Info] Start training from score -4.103823
[LightGBM] [Info] Start training from score -4.315132
[1]	Train's multi_logloss: 1.56998	Valid's multi_logloss: 1.64814
[2]	Train's multi_logloss: 1.43776	Valid's multi_logloss: 1.58928
[3]	Train's multi_logloss: 1.3304	Valid's multi_logloss: 1.54484
[4]	Train's multi_logloss: 1.23709	Valid's multi_logloss: 1.51022
[5]	Train's multi_logloss: 1.15443	Valid's multi_logloss: 1.47981
[6]	Train's multi_logloss: 1.08227	Valid's multi_loglos

In [7]:
from sklearn.metrics import accuracy_score

lgb_pred = lgb_model.predict(x_val)
lgb_pred_low = np.argmax(lgb_pred, axis=1)
score = accuracy_score(y_val, lgb_pred_low)
print('score:{0:.4f}'.format(score))

score:0.4306


In [8]:
import csv

y_pred = lgb_model.predict(x_test)
y_pred_max = np.argmax(y_pred, axis=1)
with open("../result/lightgbm_test.csv", "w") as csv_file:
    for pred_low, test_id in zip(y_pred_max, test_data['Unnamed: 0']):
        writer = csv.writer(csv_file)
        low = [test_id, int(pred_low+1)]
        writer.writerow(low)

    csv_file.close()


In [17]:
cat_pred = lgb_model.predict(x_val)
cat_pred_low = np.argmax(lgb_pred, axis=1)
score = accuracy_score(y_val, cat_pred_low)
print('score:{0:.4f}'.format(score))

score:0.4306


In [12]:
start = time.time()

# 専用の型に変換
catb_train = cb.Pool(x_train, label=y_train)
catb_valid = cb.Pool(x_val, label=y_val)
catb_test = cb.Pool(x_test)

# パラメータを設定
params = {'loss_function': 'MultiClass'}

# 学習
catb_model = cb.CatBoost(params)
catb_model.fit(catb_train,
               eval_set=[catb_valid],
               verbose=False)

print('elapsed_time:{}'.format(time.time()-start))

elapsed_time:4.368575811386108


In [14]:
print(cat_pred.shape)

(1698, 8)


In [16]:
import csv

cat_pred = catb_model.predict(catb_test)
cat_pred_sca = np.argmax(cat_pred, axis=1)
with open("../result/catboost_test.csv", "w") as csv_file:
    for pred_low, test_id in zip(cat_pred_sca, test_data['Unnamed: 0']):
        writer = csv.writer(csv_file)
        low = [test_id, int(pred_low+1)]
        writer.writerow(low)

    csv_file.close()

In [None]:
def k_fold_cv(x, y, model, k):
    cv = KFold(n_splits=k, random_state=0, shuffle=True)
    mse_list = []
    for train_index, test_index in cv.split(x):
        # get train and test data
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # fit model
        model.fit(x_train, y_train)
        # predict test data
        y_pred = model.predict(x_test)
        # loss
        mse = np.mean((y_pred - y_test)**2)
        mse_list.append(mse)
    print(f"MSE({k}FoldCV): {np.mean(mse_list)}")
    print(f"std: {np.std(mse_list)}")