In [2]:
import os
import time
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV
import datetime

# Use seaborn's white-grid theme for every figure drawn later in the notebook.
sns.set_style('whitegrid')
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime" abort
# that can occur when several packages each load their own libomp — confirm it
# is still required in the current environment.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.
  from numpy.core.umath_tests import inner1d


In [3]:
# Locations of the raw CSV files, relative to the notebook.
TRAIN_DATA_PATH = '../input/train.csv'
TEST_DATA_PATH = '../input/test.csv'

# Read both raw tables once; later cells derive every feature from these frames.
origin_train, origin_test = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_DATA_PATH, TEST_DATA_PATH)
)
# Label column of the training table.
target = origin_train["target"]

In [31]:
def get_new_feature(file_name, border=0.07, train_df=None, test_df=None):
    """
    Build pairwise-sum features for the variable pairs most correlated with the target.

    The CSV at ``file_name`` is read with pandas. Its ``"Unnamed: 0"`` column is
    used as the pair name and its ``"0"`` column as the correlation value.
    A pair name is parsed by removing every ``"var_"`` and splitting the rest on
    ``"_"``; the first two pieces are the ids of the two source columns
    (e.g. ``"var_12_81"`` -> ``var_12`` and ``var_81``).

    Parameters
    ----------
    file_name : str or path-like
        Path of the correlation CSV.
    border : float, default 0.07
        Only pairs whose correlation is strictly greater than this are kept.
    train_df, test_df : pandas.DataFrame, optional
        Frames holding the ``var_<id>`` columns. When omitted, the notebook-level
        ``origin_train`` / ``origin_test`` frames are used.

    Returns
    -------
    new_train, new_test : pandas.DataFrame
        One column per kept pair (the sum of the two source columns), named after
        the pair and ordered by descending correlation.
    var_name_li : pandas.Series
        Names of the kept pairs, in the same order as the columns.
    """
    if train_df is None:
        train_df = origin_train
    if test_df is None:
        test_df = origin_test

    best_corr_var = pd.read_csv(file_name).rename(columns={"Unnamed: 0": "ID_code", "0": "corr"})

    # Sort first, then build the mask from the *sorted* frame so the boolean key
    # is already aligned with the rows it filters (no implicit reindexing).
    ranked = best_corr_var.sort_values("corr", ascending=False)
    var_name_li = ranked.loc[ranked["corr"] > border, "ID_code"]

    # Collect every new column first and concatenate once, instead of growing
    # the frames column by column inside the loop.
    train_parts, test_parts = [], []
    for var_name in var_name_li.values:
        id_code_pair = var_name.replace("var_", "").split("_")
        var1 = f"var_{id_code_pair[0]}"
        var2 = f"var_{id_code_pair[1]}"
        train_parts.append((train_df[var1] + train_df[var2]).rename(var_name))
        test_parts.append((test_df[var1] + test_df[var2]).rename(var_name))

    if not train_parts:
        # Nothing passed the threshold: return column-less frames that still
        # line up row-wise with their source tables.
        return pd.DataFrame(index=train_df.index), pd.DataFrame(index=test_df.index), var_name_li

    new_train = pd.concat(train_parts, axis=1)
    new_test = pd.concat(test_parts, axis=1)

    return new_train, new_test, var_name_li

In [5]:
# Correlation tables written by an earlier step.
# NOTE(review): judging by the names, ADD ranks pair sums and MULTI pair
# products; get_new_feature only builds sums, so ADD is the matching input.
ADD = "../output/best_add_corr_var.csv"
MULTI = "multi_best_corr_var.csv"

# Pass the correlation CSV itself: the bare "../output/" directory that was
# passed before is not a file pd.read_csv can parse.
new_train, new_test, var_name_li = get_new_feature(ADD)

  # This is added back by InteractiveShellApp.init_path()


In [8]:
# Label the generated columns of both frames with the pair names they were built from.
new_train.columns = new_test.columns = var_name_li.values

In [9]:
# Labels for the training rows.
target = origin_train["target"]

# Model inputs: raw columns plus the generated pair features, with the
# identifier (and, for train, the label) removed.
train = (
    pd.concat([origin_train, new_train], axis=1)
    .drop(columns=["ID_code", "target"])
)
test = (
    pd.concat([origin_test, new_test], axis=1)
    .drop(columns=["ID_code"])
)

In [10]:
# Column labels of the training matrix; reused later to label feature
# importances and to select the same columns from `test` at prediction time.
features = train.columns

In [11]:
def _shuffled_copies(rows, n_copies):
    """Return ``n_copies`` copies of ``rows``, each with every column independently permuted."""
    copies = []
    for _ in range(n_copies):
        block = rows.copy()
        ids = np.arange(block.shape[0])
        for c in range(block.shape[1]):
            # `ids` is re-shuffled in place for each column, so every column
            # ends up with its own permutation of the rows.
            np.random.shuffle(ids)
            block[:, c] = block[ids, c]
        copies.append(block)
    return copies


def augment(x, y, t=2):
    """
    Append synthetic rows made by shuffling each column within a class.

    ``t`` shuffled copies of the rows with ``y > 0`` and ``t // 2`` shuffled
    copies of the rows with ``y == 0`` are appended after the original rows.
    Within a copy every column is permuted independently, so per-column values
    of the class are preserved while row-wise combinations are broken up.

    Parameters
    ----------
    x : numpy.ndarray
        2-D feature matrix (rows are samples, columns are features).
    y : numpy.ndarray
        1-D label array aligned with the rows of ``x``.
    t : int, default 2
        Number of positive copies; ``t // 2`` negative copies are made.
        Values below 2 (no negative copies) and 0 (no copies at all) are
        supported and simply append fewer / no rows.

    Returns
    -------
    x_aug, y_aug : numpy.ndarray
        Original rows followed by the positive copies (label 1.0) and then the
        negative copies (label 0.0).

    Notes
    -----
    Randomness comes from NumPy's global RNG (``np.random.shuffle``); seed it
    with ``np.random.seed`` beforehand for reproducible output.
    """
    # Positives are generated before negatives to keep the RNG draw order stable.
    xs = _shuffled_copies(x[y > 0], t)
    xn = _shuffled_copies(x[y == 0], t // 2)

    # Stack from lists of parts so that an empty set of copies is not an error.
    x_parts = [x] + xs + xn
    y_parts = (
        [y]
        + [np.ones(block.shape[0]) for block in xs]
        + [np.zeros(block.shape[0]) for block in xn]
    )
    return np.vstack(x_parts), np.concatenate(y_parts)

In [12]:
# Parameter set handed to lgb.train() in the cross-validation loop.
param = dict(
    # row / column sub-sampling
    bagging_freq=5,
    bagging_fraction=0.4,
    boost_from_average='false',
    boost='gbdt',
    feature_fraction=0.05,
    # learning schedule and tree shape
    learning_rate=0.01,
    max_depth=-1,
    metric='auc',
    min_data_in_leaf=80,
    min_sum_hessian_in_leaf=10.0,
    num_leaves=13,
    # execution
    num_threads=8,
    tree_learner='serial',
    # task and logging
    objective='binary',
    verbosity=1,
)

In [13]:
# Stratified 10-fold cross-validation. With shuffle=False the split is already
# deterministic, so no random_state is passed: current scikit-learn raises a
# ValueError when random_state is set while shuffle=False.
folds = StratifiedKFold(n_splits=10, shuffle=False)
oof = np.zeros(len(train))          # out-of-fold predictions for the training rows
predictions = np.zeros(len(test))   # fold-averaged predictions for the test rows
fold_importances = []               # one importance table per fold, concatenated after the loop
num_round = 30000

# augment() draws from NumPy's global RNG; seed it so a re-run reproduces the
# same synthetic rows.
np.random.seed(44000)

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print("Fold {}".format(fold_))
    # augment
    trn_train, trn_target = augment(np.array(train.iloc[trn_idx]), np.array(target.iloc[trn_idx]))
    trn_data = lgb.Dataset(trn_train, label=trn_target)
    # NOTE(review): the validation fold is augmented as well, so the AUC used
    # for early stopping is measured on partly synthetic rows; the final CV
    # score below uses the real rows only. Confirm this is intended.
    val_train, val_target = augment(np.array(train.iloc[val_idx]), np.array(target.iloc[val_idx]))
    val_data = lgb.Dataset(val_train, label=val_target)

    # NOTE(review): `verbose_eval` / `early_stopping_rounds` are keyword
    # arguments of older LightGBM releases; newer releases expect callbacks
    # instead — confirm against the installed version before upgrading.
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 3000)
    oof[val_idx] = clf.predict(train.iloc[val_idx], num_iteration=clf.best_iteration)

    fold_importance = pd.DataFrame()
    fold_importance["Feature"] = features
    fold_importance["importance"] = clf.feature_importance()
    fold_importance["fold"] = fold_ + 1
    fold_importances.append(fold_importance)

    # Each fold contributes an equal share to the test prediction.
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

# Single concatenation of all folds (rows keep their per-fold 0..n-1 index).
feature_importance = pd.concat(fold_importances, axis=0)

print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))

Fold 0
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.900402	valid_1's auc: 0.891418
[2000]	training's auc: 0.92255	valid_1's auc: 0.912423
[3000]	training's auc: 0.932799	valid_1's auc: 0.921616
[4000]	training's auc: 0.938845	valid_1's auc: 0.926666
[5000]	training's auc: 0.943178	valid_1's auc: 0.930142
[6000]	training's auc: 0.946693	valid_1's auc: 0.93281
[7000]	training's auc: 0.949958	valid_1's auc: 0.935154
[8000]	training's auc: 0.952885	valid_1's auc: 0.937261
[9000]	training's auc: 0.955631	valid_1's auc: 0.939146
[10000]	training's auc: 0.957958	valid_1's auc: 0.940626
[11000]	training's auc: 0.960015	valid_1's auc: 0.941883
[12000]	training's auc: 0.962113	valid_1's auc: 0.943147
[13000]	training's auc: 0.963929	valid_1's auc: 0.944057
[14000]	training's auc: 0.965644	valid_1's auc: 0.944839
[15000]	training's auc: 0.967323	valid_1's auc: 0.945625
[16000]	training's auc: 0.968775	valid_1's auc: 0.946226
[17000]	training's auc: 0.97

[13000]	training's auc: 0.963631	valid_1's auc: 0.941954
[14000]	training's auc: 0.965402	valid_1's auc: 0.942747
[15000]	training's auc: 0.967034	valid_1's auc: 0.943448
[16000]	training's auc: 0.968456	valid_1's auc: 0.943985
[17000]	training's auc: 0.969948	valid_1's auc: 0.944663
[18000]	training's auc: 0.971343	valid_1's auc: 0.945226
[19000]	training's auc: 0.97266	valid_1's auc: 0.945774
[20000]	training's auc: 0.973917	valid_1's auc: 0.946213
[21000]	training's auc: 0.97509	valid_1's auc: 0.946558
[22000]	training's auc: 0.976245	valid_1's auc: 0.946935
[23000]	training's auc: 0.9774	valid_1's auc: 0.947348
[24000]	training's auc: 0.978477	valid_1's auc: 0.947728
[25000]	training's auc: 0.979475	valid_1's auc: 0.94797
[26000]	training's auc: 0.980482	valid_1's auc: 0.948314
[27000]	training's auc: 0.981418	valid_1's auc: 0.948569
[28000]	training's auc: 0.982334	valid_1's auc: 0.948847
[29000]	training's auc: 0.983182	valid_1's auc: 0.949152
[30000]	training's auc: 0.983995	val

[26000]	training's auc: 0.980245	valid_1's auc: 0.954308
[27000]	training's auc: 0.98118	valid_1's auc: 0.95461
[28000]	training's auc: 0.982061	valid_1's auc: 0.954873
[29000]	training's auc: 0.982881	valid_1's auc: 0.9551
[30000]	training's auc: 0.983716	valid_1's auc: 0.955337
Did not meet early stopping. Best iteration is:
[30000]	training's auc: 0.983716	valid_1's auc: 0.955337
Fold 9
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.900289	valid_1's auc: 0.900298
[2000]	training's auc: 0.922317	valid_1's auc: 0.918785
[3000]	training's auc: 0.932289	valid_1's auc: 0.926817
[4000]	training's auc: 0.938429	valid_1's auc: 0.931419
[5000]	training's auc: 0.942716	valid_1's auc: 0.934535
[6000]	training's auc: 0.946422	valid_1's auc: 0.936895
[7000]	training's auc: 0.949495	valid_1's auc: 0.938807
[8000]	training's auc: 0.952273	valid_1's auc: 0.940566
[9000]	training's auc: 0.954872	valid_1's auc: 0.942172
[10000]	training's auc: 0.957289	valid_

In [None]:
# Rank features by their mean importance across folds and keep at most the top 6000.
cols = (
    feature_importance[["Feature", "importance"]]
    .groupby("Feature")
    .mean()
    .sort_values(by="importance", ascending=False)
    .iloc[:6000]
    .index
)
# Per-fold rows of the selected features; barplot aggregates them per feature.
best_features = feature_importance[feature_importance["Feature"].isin(cols)]

plt.figure(figsize=(14, 30))
sns.barplot(
    x="importance",
    y="Feature",
    data=best_features.sort_values(by="importance", ascending=False),
)
plt.title('Features importance (averaged/folds)')
plt.tight_layout()
plt.savefig('FI.png')

In [14]:
# Persist the per-fold importance table (file name kept exactly as before).
feature_importance.to_csv("importamce_1.csv", index=False)

In [29]:
# Out-of-fold AUC rounded to four decimals; it is embedded in the submission file name.
score = round(roc_auc_score(target, oof), 4)
# Submission table: test identifiers next to the fold-averaged predictions.
sub = pd.DataFrame({"ID_code": origin_test["ID_code"], "target": predictions})
print(sub)
    

            ID_code    target
0            test_0  0.062381
1            test_1  0.167545
2            test_2  0.326036
3            test_3  0.094398
4            test_4  0.044269
5            test_5  0.000279
6            test_6  0.002709
7            test_7  0.349089
8            test_8  0.000551
9            test_9  0.001562
10          test_10  0.243709
11          test_11  0.011710
12          test_12  0.037668
13          test_13  0.020761
14          test_14  0.003260
15          test_15  0.020668
16          test_16  0.481568
17          test_17  0.044625
18          test_18  0.139586
19          test_19  0.001752
20          test_20  0.671551
21          test_21  0.056589
22          test_22  0.002022
23          test_23  0.071797
24          test_24  0.004492
25          test_25  0.041191
26          test_26  0.110753
27          test_27  0.003336
28          test_28  0.256337
29          test_29  0.017935
...             ...       ...
199970  test_199970  0.051231
199971  te

In [30]:
# Write the submission without the row index; the file name carries the
# rounded out-of-fold AUC stored in `score`.
sub.to_csv(f"submission_new_{score}.csv", index=False)