In [1]:
import os
import sys
import datetime
import logging
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV

# Use seaborn's 'whitegrid' style for any plots drawn in this notebook.
sns.set_style('whitegrid')
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime"
# abort that can occur when several libraries each load their own OpenMP —
# confirm it is still needed in the target environment.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [2]:
# Locations of the input CSVs, relative to the notebook's working directory.
TRAIN_DATA_PATH = '../input/train.csv'
TEST_DATA_PATH = '../input/test.csv'

# Raw inputs as read from disk; later cells derive the feature and target
# frames from these.
origin_train = pd.read_csv(TRAIN_DATA_PATH)
origin_test = pd.read_csv(TEST_DATA_PATH)

In [3]:
# Feature columns are everything after the first two columns of the training
# frame.
# NOTE(review): assumes those first two columns are the row id and 'target' —
# confirm against the CSV header.
features = origin_train.columns[2:]
# The label is kept as a one-column DataFrame (not a Series).
target = pd.DataFrame(origin_train, columns=['target'])
print(target.shape)
# Train and test frames restricted to the same feature columns, in the same
# order.
df_train = pd.DataFrame(origin_train, columns=features)
df_test = pd.DataFrame(origin_test, columns=features)

(200000, 1)


In [4]:
def new_feature(df_train, df_test, feature_num, n_features=200):
    """Build pairwise-sum features for one base column, for train and test.

    For every ``i`` with ``feature_num < i < n_features`` a column named
    ``var_{feature_num}_{i}`` is created, holding
    ``var_{feature_num} + var_{i}``.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames that contain ``var_{feature_num}`` and every ``var_{i}`` in the
        range described above.
    feature_num : int
        Index of the base column the new features are derived from.
    n_features : int, default 200
        Exclusive upper bound on the partner column index. The default
        reproduces the value that used to be hard-coded.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(new_train, new_test)``. Each keeps the row index of its input.
        When there is no partner column (``feature_num >= n_features - 1``)
        the frames have no columns.
    """
    base_name = f'var_{feature_num}'
    partner_ids = range(feature_num + 1, n_features)

    def _pair_sums(df):
        # Collect every column first and build the frame in a single call:
        # inserting columns one at a time into an empty DataFrame fragments
        # it (pandas emits a PerformanceWarning for this pattern).
        base = df[base_name]
        sums = {
            f'var_{feature_num}_{i}': base + df[f'var_{i}']
            for i in partner_ids
        }
        return pd.DataFrame(sums, index=df.index)

    return _pair_sums(df_train), _pair_sums(df_test)

In [5]:
def stack_data(feature_num):
    """Return the generated pairwise features for ``var_{feature_num}``.

    Intended as the entry point for training on generated features combined
    with the original columns. As written, nothing is concatenated with the
    original data: the frames produced by ``new_feature`` from the
    notebook-level ``df_train`` / ``df_test`` are returned unchanged. To train
    without stacking, ``new_feature`` can also be called directly.

    Returns
    -------
    tuple
        ``(train, test, new_train)``; ``train`` and ``new_train`` are the
        same DataFrame object.
    """
    generated_train, generated_test = new_feature(df_train, df_test, feature_num)
    return generated_train, generated_test, generated_train

In [36]:
def new_feature_csv(feature_num, new_train, new_test, output_dir='../output'):
    """Write the generated train/test features for one base column to CSV.

    If storage allows, this can be called for every ``feature_num`` so that
    the per-feature files can be combined and trained on later. It is
    optional: skip the call when training directly on the in-memory frames.

    Parameters
    ----------
    feature_num : int
        Base column index; used only to build the file names.
    new_train, new_test : pandas.DataFrame
        Frames to persist. The row index is written as the first CSV column.
    output_dir : str, default '../output'
        Destination directory. It is created if it does not exist, since
        ``to_csv`` raises when the parent directory is missing.
    """
    os.makedirs(output_dir, exist_ok=True)
    new_train.to_csv(os.path.join(output_dir, f'train_new_var_{feature_num}.csv'))
    new_test.to_csv(os.path.join(output_dir, f'test_new_var_{feature_num}.csv'))

In [6]:
# Log output: every run of this cell writes to a new, timestamped file.
# FileHandler does not create missing directories, so make sure it exists.
os.makedirs("../log", exist_ok=True)

logger = logging.getLogger()
now = datetime.datetime.now()

# Re-running this cell would otherwise stack another FileHandler on the root
# logger, so each record would also be appended to the files of earlier runs.
# Detach and close the handlers this cell attached previously.
for stale_handler in [
    h for h in logger.handlers
    if isinstance(h, logging.FileHandler)
    and os.path.basename(h.baseFilename).startswith("make_new_feature_log")
]:
    logger.removeHandler(stale_handler)
    stale_handler.close()

handler = logging.FileHandler("../log/make_new_feature_log{0:%Y%m%d%H%M%S}.csv".format(now), encoding="UTF-8")
handler.setLevel(logging.INFO)
logger.addHandler(handler)
logger.setLevel(logging.INFO)

In [35]:
# LightGBM settings. They do not depend on the feature being processed, so
# they are defined once instead of being rebuilt on every iteration.
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.4,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.05,
    'learning_rate': 0.01,
    'max_depth': -1,
    'metric':'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary',
    'verbosity': 1
}
num_round = 30000

# random_state is intentionally not passed: with shuffle=False it has no
# effect on the splits, and recent scikit-learn versions raise ValueError
# when it is combined with shuffle=False.
folds = StratifiedKFold(n_splits=10, shuffle=False)

# Stop at var_198: var_199 has no higher-numbered partner column, so
# new_feature would return an empty frame and training would fail.
# NOTE(review): the loop starts at 3 as in the original; presumably
# var_0..var_2 were processed in an earlier run — confirm.
for num in range(3, 199):
    feature_num = num

    logger.info(f"START: var_{feature_num}")

    train, test, new_train = stack_data(feature_num)
    # Persist the newly generated features to CSV.
    new_feature_csv(feature_num, train, test)

    features = new_train.columns

    oof = np.zeros(len(train))           # out-of-fold predictions on train
    predictions = np.zeros(len(test))    # fold-averaged predictions on test
    fold_importances = []

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
        print("Fold {}".format(fold_))
        trn_data = lgb.Dataset(train.iloc[trn_idx][features], label=target.iloc[trn_idx])
        val_data = lgb.Dataset(train.iloc[val_idx][features], label=target.iloc[val_idx])

        # NOTE(review): verbose_eval / early_stopping_rounds are keyword
        # arguments of older LightGBM releases; newer releases expect the
        # equivalent callbacks instead — confirm the installed version.
        clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 3000)
        oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)

        fold_importances.append(pd.DataFrame({
            "Feature": features,
            "importance": clf.feature_importance(),
            "fold": fold_ + 1,
        }))

        # Each fold contributes an equal share to the test prediction.
        predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

    # Compute the out-of-fold AUC once and reuse it for stdout, the log file
    # and the submission file name.
    cv_auc = roc_auc_score(target, oof)
    print("CV score: {:<8.5f}".format(cv_auc))
    logger.info("CV score: {:<8.5f}".format(cv_auc))

    # Concatenate the per-fold frames once instead of growing a DataFrame
    # inside the fold loop.
    feature_importance = pd.concat(fold_importances, axis=0).sort_values('importance', ascending=False)

    # CSV with the collected per-fold feature importances.
    feature_importance.to_csv(f"../output/new_feature_importance_var{feature_num}.csv", index=False)

    score = round(cv_auc, 4)
    sub = pd.DataFrame({"ID_code": origin_test.ID_code.values})
    sub["target"]=predictions
    sub.to_csv(f"submission_new_var{feature_num}_{score}.csv", index=False)

KeyboardInterrupt: 