In [1]:
import os
import time
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV
import datetime

sns.set_style('whitegrid')
os.environ['KMP_DUPLICATE_LIB_OK']='True'

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.
  from numpy.core.umath_tests import inner1d


In [2]:
TRAIN_DATA_PATH = '../input/train.csv'
TEST_DATA_PATH = '../input/test.csv'

origin_train = pd.read_csv(TRAIN_DATA_PATH)
origin_test = pd.read_csv(TEST_DATA_PATH)
target = origin_train["target"]

In [4]:
def get_new_feature():
    """
    targetとの相関が高い新しい特徴量を作成する
    """
    
    #相関0.０７以上
    border= 0.07
    
    best_corr_var = pd.read_csv(f"../output/best_corr_var.csv").rename(columns={"Unnamed: 0":"ID_code", "0":"corr"})
    #相関で並び替え→上位を取得
    var_name_li = best_corr_var.sort_values("corr", ascending=False)[best_corr_var["corr"]>border]["ID_code"]
    id_code_li = []
    for var_name in var_name_li.values:
        id_code_li.append(var_name.replace("var_", "").split("_"))
    
    #新しい特徴量の作成
    new_train = pd.DataFrame([])
    new_test = pd.DataFrame([])
    for id_code_pair in id_code_li:
        var1 = f"var_{id_code_pair[0]}"
        var2 = f"var_{id_code_pair[1]}"
        new_train_feature = origin_train[var1] + origin_train[var2]
        new_test_feature = origin_test[var1] + origin_test[var2]
        
        new_train = pd.concat([new_train, new_train_feature], axis=1)
        new_test = pd.concat([new_test, new_test_feature], axis=1)
    
    return new_train, new_test, var_name_li

In [5]:
new_train, new_test, var_name_li = get_new_feature()

  # This is added back by InteractiveShellApp.init_path()


In [6]:
new_train.columns = var_name_li.values
new_test.columns = var_name_li.values

In [64]:
target = origin_train["target"]
train = pd.concat([origin_train, new_train], axis=1).drop("ID_code", axis=1).drop("target", axis=1)
test = pd.concat([origin_test, new_test], axis=1).drop("ID_code", axis=1)

In [65]:
features = train.columns

In [143]:
def augment(x,y,t=2):
    xs,xn = [],[]
    for i in range(t):
        mask = y>0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xs.append(x1)

    for i in range(t//2):
        mask = y==0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xn.append(x1)

    xs = np.vstack(xs)
    xn = np.vstack(xn)
    ys = np.ones(xs.shape[0])
    yn = np.zeros(xn.shape[0])
    x = np.vstack([x,xs,xn])
    y = np.concatenate([y,ys,yn])
    return x,y

In [67]:
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.4,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.05,
    'learning_rate': 0.01,
    'max_depth': -1,  
    'metric':'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary', 
    'verbosity': 1
}

In [None]:
folds = StratifiedKFold(n_splits=10, shuffle=False, random_state=44000)
oof = np.zeros(len(train))
predictions = np.zeros(len(test))
feature_importance = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print("Fold {}".format(fold_))
    #argument
    trn_train, trn_target= augment(np.array(train.iloc[trn_idx]), np.array(target.iloc[trn_idx]))
    trn_data = lgb.Dataset(trn_train, label=trn_target)
    val_train, val_target= augment(np.array(train.iloc[val_idx]), np.array(target.iloc[val_idx]))
    val_data = lgb.Dataset(val_train, label=val_target)

    num_round = 30000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 3000)
    oof[val_idx] = clf.predict(train.iloc[val_idx], num_iteration=clf.best_iteration)
    
    fold_importance = pd.DataFrame()
    fold_importance["Feature"] = features
    fold_importance["importance"] = clf.feature_importance()
    fold_importance["fold"] = fold_ + 1
    feature_importance = pd.concat([feature_importance, fold_importance], axis=0)
    
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))

Fold 0
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.900532	valid_1's auc: 0.891371
[2000]	training's auc: 0.922232	valid_1's auc: 0.911308
[3000]	training's auc: 0.932501	valid_1's auc: 0.920469
[4000]	training's auc: 0.938705	valid_1's auc: 0.925576
[5000]	training's auc: 0.94299	valid_1's auc: 0.929202
[6000]	training's auc: 0.946567	valid_1's auc: 0.931944
[7000]	training's auc: 0.949742	valid_1's auc: 0.934223
[8000]	training's auc: 0.952622	valid_1's auc: 0.936341
[9000]	training's auc: 0.955274	valid_1's auc: 0.938086
[10000]	training's auc: 0.957754	valid_1's auc: 0.939778
[11000]	training's auc: 0.959881	valid_1's auc: 0.940973
[12000]	training's auc: 0.96197	valid_1's auc: 0.942136
[13000]	training's auc: 0.963743	valid_1's auc: 0.943049
[14000]	training's auc: 0.965583	valid_1's auc: 0.944008
[15000]	training's auc: 0.96718	valid_1's auc: 0.944717
[16000]	training's auc: 0.968705	valid_1's auc: 0.945351
[17000]	training's auc: 0.970

In [None]:
cols = (feature_importance[["Feature", "importance"]]
        .groupby("Feature")
        .mean()
        .sort_values(by="importance", ascending=False)[:6000].index)
best_features = feature_importance.loc[feature_importance.Feature.isin(cols)]

plt.figure(figsize=(14,30))
sns.barplot(x="importance", y="Feature", data=best_features.sort_values(by="importance",ascending=False))
plt.title('Features importance (averaged/folds)')
plt.tight_layout()
plt.savefig('FI.png')

In [None]:
feature_importance.to_csv(f"importamce_1.csv", index=False)

In [None]:
score = round(roc_auc_score(target, oof), 4)
sub = pd.DataFrame({"ID_code": test.index})
sub["target"]=predictions
sub.to_csv(f"submission_new_var{file_name}_{score}.csv", index=False)