###### Reference
https://www.kaggle.com/gpreda/santander-eda-and-prediction  
https://www.kaggle.com/deepak525/sctp-lightgbm-lb-0-899  
target 1的标签占比约10%  
没有缺失值

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_style('whitegrid')
import os
import gc
import time
import lightgbm as lgb
import xgboost as xgb

from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

from datetime import datetime as dt
import warnings
warnings.filterwarnings('ignore')

In [2]:
print(os.listdir("../input"))

['test.csv', 'sample_submission.csv', 'train.csv']


In [3]:
train_df = pd.read_csv('../input/train.csv').drop("ID_code",axis=1)
test_df = pd.read_csv('../input/test.csv').drop("ID_code",axis=1)

In [4]:
df_train = train_df.drop(["target"],axis=1).values

unique_samples = []
unique_count = np.zeros_like(df_train)
for feature in range(df_train.shape[1]):
    _, index_, count_ = np.unique(df_train[:, feature], return_counts=True, return_index=True)
    unique_count[index_[count_ == 1], feature] += 1

real_samples_indexes = np.argwhere(np.sum(unique_count, axis=1) > 0)[:, 0]
synthetic_samples_indexes = np.argwhere(np.sum(unique_count, axis=1) == 0)[:, 0]

In [5]:
def process_data(train_df, test_df):
    idx = [c for c in train_df.columns if c not in ['ID_code', 'target']]
    df = pd.concat([train_df,test_df.ix[real_samples_indexes]])
    for feat in idx:
        train_df[feat + "_count"] = train_df[feat].map(df[feat].value_counts(dropna=True)) 
        test_df[feat + "_count"] = test_df[feat].map(df[feat].value_counts(dropna=True))
        
        train_df[feat + "_sum"] = train_df[feat].map(df.groupby(feat)[feat].sum()) 
        test_df[feat + "_sum"] = test_df[feat].map(df.groupby(feat)[feat].sum()) 
        
        train_df[feat+"_copy"] = train_df[feat] * (train_df[feat + "_count"] > 1).astype(int)
        test_df[feat+"_copy"] = test_df[feat] * (test_df[feat + "_count"] > 1).astype(int)
        train_df[feat+"_copy"] = train_df[feat+"_copy"].replace(0,df[feat].mean())
        test_df[feat+"_copy"] = test_df[feat+"_copy"].replace(0,df[feat].mean())
        
    return train_df, test_df

In [6]:
train_df, test_df = process_data(train_df, test_df)

In [10]:
params ={
    'bagging_freq': 5,
    'bagging_fraction': 0.33,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.6,
    'learning_rate': 0.003,
    'max_depth': -1,
    'metric':'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    'device':'gpu',  
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
    'num_threads': 2,
    'tree_learner': 'serial',
    'objective': 'binary',
    'verbosity': 1
        }

In [11]:
n_fold = 5
folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=4590)
train_features = [c for c in train_df.columns if c not in ['ID_code', 'target']]
y = train_df["target"]

In [12]:
oof_preds = np.zeros(train_df.shape[0])
sub_preds = np.zeros(test_df.shape[0])
clfs = list()

for fold_n, (train_index, valid_index) in enumerate(folds.split(train_df,y)):
    print('Fold', fold_n+1, 'started at', time.ctime())
    X_train, y_train = train_df.iloc[train_index], y.iloc[train_index]
    X_valid, y_valid = train_df.iloc[valid_index], y.iloc[valid_index]
        
    train_data = lgb.Dataset(X_train[train_features], label=y_train,)
    valid_data = lgb.Dataset(X_valid[train_features], label=y_valid)
        
    model = lgb.train(params,
                      train_data,
                      num_boost_round=100000,
                      valid_sets = [valid_data],
                      verbose_eval=2000,
                      early_stopping_rounds=3500,)
    
    oof_preds[valid_index] += model.predict(X_valid[train_features], num_iteration=model.best_iteration)
    sub_preds += model.predict(test_df[train_features], num_iteration=model.best_iteration)/n_fold
    del X_train, X_valid, y_train, y_valid,train_data,valid_data    
    gc.collect()
print('AUC is ', roc_auc_score(y, oof_preds))    

Fold 1 started at Mon Apr  1 15:06:57 2019
Training until validation scores don't improve for 3500 rounds.
[2000]	valid_0's auc: 0.877781
[4000]	valid_0's auc: 0.902781
[6000]	valid_0's auc: 0.912999
[8000]	valid_0's auc: 0.918099
[10000]	valid_0's auc: 0.920899
[12000]	valid_0's auc: 0.922555
[14000]	valid_0's auc: 0.923476
[16000]	valid_0's auc: 0.923996
[18000]	valid_0's auc: 0.924292
[20000]	valid_0's auc: 0.924431
[22000]	valid_0's auc: 0.924558
[24000]	valid_0's auc: 0.924634
[26000]	valid_0's auc: 0.924543
Early stopping, best iteration is:
[23315]	valid_0's auc: 0.924662
Fold 2 started at Mon Apr  1 15:39:51 2019
Training until validation scores don't improve for 3500 rounds.
[2000]	valid_0's auc: 0.868286
[4000]	valid_0's auc: 0.896369
[6000]	valid_0's auc: 0.907681
[8000]	valid_0's auc: 0.913678
[10000]	valid_0's auc: 0.91719
[12000]	valid_0's auc: 0.919192
[14000]	valid_0's auc: 0.920474
[16000]	valid_0's auc: 0.921243
[18000]	valid_0's auc: 0.921626
[20000]	valid_0's auc: 0

In [14]:
auc = roc_auc_score(y, oof_preds)
oof_train = oof_preds
oof_test = sub_preds
submission = pd.read_csv("../input/sample_submission.csv")
submission["target"] = oof_test
submission.to_csv("./sub/submission_%s.csv"%auc, index=False)

train_prob  = pd.DataFrame(oof_train)
train_prob.columns = ['class_1']
train_prob.to_csv("./oof/lgb_oof_train_%s.csv"%auc,index=False)

test_prob= pd.DataFrame(oof_test)
test_prob.columns = ['class_1']
test_prob.to_csv("./oof/lgb_oof_test_%s.csv"%auc,index=False)
        

In [None]:
feat_importance_ = pd.DataFrame({"feature":train_features, 
                                 "importance":np.mean([clf.feature_importance(importance_type="gain") for clf in clfs],axis=0)})

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Display/plot feature importance
def display_importances(feature_importance_df_):
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(
        by="importance", ascending=False)[:100].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]

    plt.figure(figsize=(8, 20))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances.png')
    
display_importances(feat_importance_)