In [10]:
import lightgbm as lgb
import pandas as pd
import numpy as np

In [2]:
train_data = pd.read_csv("../data/train.tsv", delimiter='\t')
test_data = pd.read_csv("../data/test.tsv", delimiter='\t')

In [3]:
train_x, train_y = train_data.iloc[:, 3:21], train_data["LeagueIndex"]-1
test_x = test_data.iloc[:, 2:20]

In [5]:
dtrain = lgb.Dataset(train_x, train_y, free_raw_data=False)
dtest = lgb.Dataset(test_x)

In [18]:
param = {'task': 'train',                # 学習、トレーニング ⇔　予測predict
          'boosting_type': 'gbdt',        # 勾配ブースティング
          'objective': 'multiclass',      # 目的関数：多値分類、マルチクラス分類
          'metric': 'multi_logloss',      # 分類モデルの性能を測る指標
          'num_class': 8,                 # 目的変数のクラス数
         }

In [23]:
# 学習
evaluation_results = {}                                     # 学習の経過を保存する箱
model = lgb.train(param,                                   # 上記で設定したパラメータ
                  dtrain,                                # 使用するデータセット
                  num_boost_round=10000,                     # 学習の回数
                  valid_names=['train', 'valid'],           # 学習経過で表示する名称
                  valid_sets=[dtrain, dtrain],         # モデル検証のデータセット
                  evals_result=evaluation_results,          # 学習の経過を保存
                  early_stopping_rounds=20,                 # アーリーストッピング
                  verbose_eval=10)                          # 学習の経過の表示(10回毎)

# 最もスコアが良いときのラウンドを保存
optimum_boost_rounds = model.best_iteration

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3892
[LightGBM] [Info] Number of data points in the train set: 1697, number of used features: 18
[LightGBM] [Info] Start training from score -2.904018
[LightGBM] [Info] Start training from score -2.243660
[LightGBM] [Info] Start training from score -1.812600
[LightGBM] [Info] Start training from score -1.415594
[LightGBM] [Info] Start training from score -1.465355
[LightGBM] [Info] Start training from score -1.766736
[LightGBM] [Info] Start training from score -4.178521
[LightGBM] [Info] Start training from score -4.140780
Training until validation scores don't improve for 20 rounds
[10]	valid's multi_logloss: 0.887192
[20]	valid's multi_logloss: 0.545889
[30]	valid's multi_logloss: 0.35721
[40]	valid's multi_logloss: 0.241949
[50]	valid's multi_logloss: 0.165627
[60]	valid's multi_logloss: 0.113814
[70]	valid's multi_logloss: 0.0791467
[80]	valid's multi_logloss: 0.055375
[90]	valid's multi_logloss:

In [24]:
# テストデータで予測
y_pred = model.predict(train_x, num_iteration=model.best_iteration)
y_pred_max = np.argmax(y_pred, axis=1)

# Accuracy の計算
accuracy = sum(train_y == y_pred_max) / len(train_y)
print('accuracy:', accuracy)

print(y_pred_max+1)

accuracy: 1.0
[5 3 4 ... 5 2 3]


In [1]:
# テストデータで予測
y_pred = model.predict(test_x, num_iteration=model.best_iteration)
y_pred_max = np.argmax(y_pred, axis=1)
y_pred_max

NameError: name 'model' is not defined

In [26]:
import csv

with open("../result/lightgbm_test.csv", "w") as csv_file:
    for pred_low, test_id in zip(y_pred_max, test_data['Unnamed: 0']):
        writer = csv.writer(csv_file)
        low = [test_id, int(pred_low+1)]
        writer.writerow(low)

    csv_file.close()
