# LightGBMでベンチマークを作る
- とりあえずでベンチマークを作ります
- 初手LightGBM
- 前処理は特にしない
- KFoldでクロスバリデーションする

## 1.必要なライブラリを読み込む

In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold

## 2.データのロード

In [None]:
# Load the three competition CSV files: the training table, the test table,
# and the sample_solution file that is later filled in and written out.
train = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv')
submission = pd.read_csv('../input/tabular-playground-series-sep-2021/sample_solution.csv')

## 3.ひとまず様子をみる

In [None]:
# Inspect the training DataFrame.
train

In [None]:
# Inspect the test DataFrame.
test

In [None]:
# Inspect the sample submission DataFrame.
submission

In [None]:
# List every column name of the training frame, one per line.
for name in train.columns:
    print(name)

## 4. 学習に使うカラムを決める
とりあえず全部突っ込む

In [None]:
# Columns fed to the model: 'id' followed by the features 'f1' .. 'f118'.
train_columns = ['id'] + [f'f{number}' for number in range(1, 119)]

# Name of the column the model is trained to predict.
target_column = 'claim'

In [None]:
# Counting the NaNs in each row improves accuracy, so add row-wise summary
# features. Train and test are stacked so both get identical treatment.
n_train = train.shape[0]  # remembered now because `train` is rebound below
X = pd.concat([train, test])

# Row statistics are taken over the feature columns only. 'id' is a row
# identifier; leaving it in would mix the identifier into every statistic
# (mean, median, max, abs_max, min, skew).
stat_columns = [column for column in train_columns if column != 'id']

# One snapshot taken before imputation, so every statistic below sees the
# original missing values.
feature_values = X[stat_columns]
missing_mask = feature_values.isna()

X['n_missing'] = missing_mask.sum(axis=1).astype('int')
X['n_missing_std'] = missing_mask.std(axis=1).astype('float')
X['mean_orig'] = feature_values.mean(axis=1)

X['med'] = feature_values.median(axis=1)
X['max'] = feature_values.max(axis=1)
X['abs_max'] = feature_values.abs().max(axis=1)
X['min'] = feature_values.min(axis=1)
X['skew'] = feature_values.skew(axis=1)

# Impute remaining NaNs with the per-column mean.
# NOTE(review): the means are computed over train and test combined.
X[train_columns] = X[train_columns].fillna(X[train_columns].mean())

# Register the engineered columns as model inputs.
train_columns.extend(['n_missing', 'n_missing_std', 'mean_orig', 'med', 'max', 'abs_max', 'min', 'skew'])

for column in train_columns:
    print(column)

# Split the stacked frame back into its train and test parts by position.
train = X.iloc[:n_train]
test = X.iloc[n_train:]

## 5. 学習・評価

In [None]:
def lgb_train(x, y, test, categorical_features=None, params=None, split_size=5):
    """Train LightGBM with K-fold cross-validation and return 0/1 test labels.

    One model is trained per fold; each model's predictions for ``test`` are
    summed, and the sum is thresholded at half the number of folds (i.e. the
    mean prediction is compared against 0.5).

    Args:
        x: Training features (indexed positionally with ``.iloc``).
        y: Training target, aligned row-for-row with ``x``.
        test: Features to predict on, with the same columns as ``x``.
        categorical_features: Forwarded to ``lgb.Dataset`` as
            ``categorical_feature``.
        params: LightGBM parameter dict. Defaults to ``{'objective': 'rmse'}``.
        split_size: Number of folds passed to ``KFold`` as ``n_splits``.

    Returns:
        A float NumPy array with one entry per row of ``test``: 1.0 where the
        summed fold predictions exceed ``split_size / 2``, otherwise 0.0.
    """
    # Build the default per call instead of sharing one mutable dict object
    # across all calls.
    if params is None:
        params = {'objective': 'rmse'}

    # LightGBM 4.0 removed the `verbose_eval` / `early_stopping_rounds`
    # keyword arguments of `lgb.train`; callbacks are used instead. Older
    # releases expose the logging callback as `print_evaluation`.
    log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

    # Running sum of the per-fold predictions for the test rows.
    test_predictions = np.zeros(test.shape[0])
    kfold = KFold(n_splits=split_size, random_state=1, shuffle=True)

    # Split the data that was passed in (`x`), not the module-level `train`.
    for fold, (train_index, value_index) in enumerate(kfold.split(x)):
        # Carve out this fold's training and validation rows.
        x_train, x_val = x.iloc[train_index], x.iloc[value_index]
        y_train, y_val = y.iloc[train_index], y.iloc[value_index]

        # Fit with early stopping (50 rounds), logging every 250 rounds.
        train_set = lgb.Dataset(x_train, y_train, categorical_feature=categorical_features)
        valid_set = lgb.Dataset(x_val, y_val, reference=train_set, categorical_feature=categorical_features)
        model = lgb.train(
            params=params,
            num_boost_round=1000,
            train_set=train_set,
            valid_sets=[train_set, valid_set],
            callbacks=[lgb.early_stopping(50), log_evaluation(250)],
        )

        # Accumulate this fold's predictions for the test rows.
        test_predictions += model.predict(test)

        # Report 1 - mean absolute error on the held-out fold only, so the
        # score is not inflated by rows the model was trained on.
        fold_score = 1 - abs(y_val - model.predict(x_val)).sum() / x_val.shape[0]
        print(f'fold{fold}のスコア：{fold_score}')

    # Label a row 1 when the summed predictions exceed half the fold count,
    # otherwise 0.
    return np.where(test_predictions > split_size / 2, 1.0, 0.0)


# Run 11-fold training with the binary objective and store the result of
# lgb_train in the submission's target column.
submission[target_column] = lgb_train(
    train[train_columns],
    train[target_column],
    test[train_columns],
    params={'objective': 'binary'},
    split_size=11,
)

In [None]:
# Inspect the submission DataFrame after the target column has been filled in.
submission

## 6. submissionに出力する

In [None]:
# Write only the id and target columns to submission.csv, without the index.
submission.to_csv('submission.csv', columns=['id', target_column], index=False)