In [188]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import lightgbm as lgb
import os
import shutil
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Let pandas render up to 500 columns so wide frames are not truncated on display.
pd.set_option('display.max_columns', 500)
#import pandas_profiling

In [189]:
# Read the training and test tables from the shared data directory (train first).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

In [190]:
# Seed NumPy's global generator. augmentation() shuffles with np.random.shuffle,
# so this only gives repeatable output when the cells run in order on a fresh kernel.
random_state = 42
np.random.seed(seed=random_state)

In [191]:

def augmentation(x=None, y=None, upsample_times=2):
    """Oversample the positive class by shuffling each feature column independently.

    Rows of ``x`` whose label in ``y`` equals 1 form the positive set. One
    synthetic "round" is produced by permuting every column of that set on its
    own, so a synthetic row mixes feature values taken from different positive
    rows while each column keeps exactly the same multiset of values.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature table. Columns must be numeric, because synthetic rows are
        assembled in a float array.
    y : pandas.Series or numpy.ndarray
        Labels aligned with the rows of ``x``. Only rows labelled 1 or 0 are
        kept; rows with any other label are dropped.
    upsample_times : int, default 2
        Number of synthetic rounds appended after the original positives.
        ``0`` is a special case: the positives are *replaced* by a single
        shuffled round instead of being kept.

    Returns
    -------
    aug_x : pandas.DataFrame
        Positive rows (original and/or synthetic) followed by the untouched
        negative rows. Synthetic rows carry a fresh 0..n-1 index, so index
        labels in the result are not unique.
    aug_y : numpy.ndarray
        Labels in the same row order as ``aug_x`` (ones, then the negatives).

    Raises
    ------
    TypeError
        If ``x`` or ``y`` is omitted.
    ValueError
        If ``upsample_times`` is negative (it would otherwise yield features
        and labels of different lengths).

    Notes
    -----
    Randomness comes from NumPy's global generator (``np.random.shuffle``);
    call ``np.random.seed`` beforehand for repeatable output.
    """
    if x is None or y is None:
        raise TypeError("augmentation() requires both x and y")
    if upsample_times < 0:
        raise ValueError(
            "upsample_times must be >= 0, got {}".format(upsample_times)
        )

    x_pos = x[y == 1]
    x_neg = x[y == 0]
    y_neg = y[y == 0]

    # Private, writable float copy of the positives. Shuffling happens here,
    # never on an array that may be a (possibly read-only) view into a DataFrame.
    work = x_pos.to_numpy(dtype=float, copy=True)

    def _shuffled_round():
        """Permute every column of ``work`` in place and return it as a new frame."""
        for col in range(work.shape[1]):
            np.random.shuffle(work[:, col])
        # Copy so later rounds cannot alter a frame that was already emitted.
        return pd.DataFrame(work.copy(), columns=x_pos.columns)

    if upsample_times == 0:
        positive_parts = [_shuffled_round()]
    else:
        positive_parts = [x_pos.copy()]
        positive_parts.extend(_shuffled_round() for _ in range(upsample_times))

    n_positive_rows = sum(len(part) for part in positive_parts)

    # pd.concat replaces DataFrame.append, which newer pandas no longer provides.
    aug_x_pos = pd.concat(positive_parts + [x_neg])
    aug_y_pos = np.append(np.ones(n_positive_rows), y_neg)

    return aug_x_pos, aug_y_pos
    
    
    
    

In [192]:
# train_df[var_list] = np.exp(train_df[var_list])
# test_df[var_list] = np.exp(test_df[var_list])

In [204]:
# Run configuration.
upsample_times = 12   # synthetic positive rounds requested from augmentation()
n_folds = 5           # number of StratifiedKFold splits
random_seed = 26      # shuffle seed for the fold split

# Experiment label; it is embedded in the output file names written later on.
model = f'augment_{upsample_times}_times'
model_name = "{0}_{1}_folds".format(model, n_folds)

print("Model: {}".format(model_name))

Model: augment_12_times_5_folds


In [205]:
#exclusion = ['ID_code', 'target']+ rank_var_list
# Every column except the row identifier and the label is used as a model feature.
exclusion = ['ID_code', 'target']
feats = train_df.columns[~train_df.columns.isin(exclusion)].tolist()

In [206]:
# Model inputs: feature matrices for train/test, the label column, and the test ids.
X = train_df[feats]
y = train_df['target']
X_test = test_df[feats]
test_ids = test_df['ID_code'].values

# Stratified, shuffled fold splitter driven by the run configuration.
clfs = []
folds = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_seed)

# Column vectors: one out-of-fold prediction per training row, and an
# accumulator for the per-fold predictions of every test row.
oof_preds = np.zeros((train_df.shape[0], 1))
test_preds = np.zeros((test_df.shape[0], 1))

# LightGBM settings: binary objective evaluated with AUC.
parameters = {
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': 'true',       # ask LightGBM to compensate for class imbalance
    #'scale_pos_weight': 400,     # alternative weighting, disabled
    #'device' : 'gpu' ,           # GPU training, disabled
    'boosting': 'gbdt',
    'num_leaves': 31, #31
    'feature_fraction': 0.5,      # fraction of features sampled per tree
    'bagging_fraction': 0.7,      # fraction of rows sampled when bagging ...
    'bagging_freq': 10,           # ... which is redone every 10 iterations
    'learning_rate': 0.05, #0.05
    'verbose': 30                 # LightGBM log verbosity level
    #'min_data_in_leaf': 200      # disabled
}

# Train one LightGBM model per stratified fold. Only the training split of each
# fold is augmented; the validation split and the test set are scored as-is.
# Per-fold importances are collected in a list and concatenated once at the end
# instead of re-copying a growing DataFrame on every iteration.
fold_importances = []
for fold_, (trn_, val_) in enumerate(folds.split(X, y)):
    print("Current Fold: {}".format(fold_+1))

    # split() yields positional row numbers, so both X and y are indexed with
    # .iloc; label-based y[trn_] is only equivalent while the index is 0..n-1.
    trn_x, trn_y = X.iloc[trn_, :], y.iloc[trn_]
    val_x, val_y = X.iloc[val_, :], y.iloc[val_]

    # Upsample positives in the training split only, so the validation AUC is
    # measured on real rows.
    trn_x, trn_y = augmentation(trn_x, trn_y, upsample_times)

    trn_lgb = lgb.Dataset(trn_x, trn_y)
    val_lgb = lgb.Dataset(val_x, val_y)

    # NOTE(review): early_stopping_rounds / verbose_eval are keyword arguments
    # of older LightGBM releases; LightGBM >= 4.0 expects callbacks instead.
    # Confirm the installed version before upgrading.
    clf = lgb.train(parameters,
                    train_set=trn_lgb,
                    valid_sets=[trn_lgb, val_lgb],
                    num_boost_round=3000,
                    early_stopping_rounds=50,
                    verbose_eval=False)

    val_pred = clf.predict(val_x)
    test_fold_pred = clf.predict(X_test)

    print("AUC = {}".format(roc_auc_score(val_y, val_pred)))

    # Out-of-fold predictions land in the rows of this fold's validation split;
    # test predictions are summed here and averaged after the loop.
    oof_preds[val_, :] = val_pred.reshape((-1, 1))
    test_preds += test_fold_pred.reshape((-1, 1))

    fold_importances.append(pd.DataFrame({
        "feature": feats,
        "importance": clf.feature_importance(),
        "fold": fold_ + 1,
    }))

# One row per (feature, fold), in fold order.
feature_importance_df = pd.concat(fold_importances, axis=0)
    
    
# Turn the summed per-fold test predictions into a mean, then score the
# out-of-fold predictions against the training labels.
test_preds /= n_folds
roc_score = roc_auc_score(y, np.ravel(oof_preds))
print("Overall AUC = {}".format(roc_score))


# Fill the sample-submission template and name the file after the model and its CV score.
print("Saving submission file")
submission_path = '../submissions/{}_{}.csv'.format(model_name,str(roc_score))
sample = pd.read_csv('../data/sample_submission.csv')
sample.target = test_preds.astype(float)
sample.ID_code = test_ids
sample.to_csv(submission_path, index=False)

#display_importances(feature_importance_df)


Current Fold: 1
AUC = 0.8973327478944458
Current Fold: 2
AUC = 0.8954601661656423
Current Fold: 3
AUC = 0.8999342503712677
Current Fold: 4
AUC = 0.9002596892110999
Current Fold: 5
AUC = 0.8983123935624849
Overall AUC = 0.8982546281301024
Saving submission file


In [202]:
%%javascript
// Push this notebook's path into the Python kernel as the variable NOTEBOOK_FULL_PATH.
// NOTE(review): relies on the global `IPython.notebook` object, presumably only
// available in the classic Notebook front end; confirm before running elsewhere.
var nb = IPython.notebook;
var kernel = IPython.notebook.kernel;
// Python assignment built as text; the value is base_url followed by notebook_path.
// NOTE(review): a path containing a single quote would break the quoted literal.
var command = "NOTEBOOK_FULL_PATH = '" + nb.base_url + nb.notebook_path + "'";
// Ask the kernel to run the assignment.
// NOTE(review): this looks asynchronous with respect to cell execution; verify
// that NOTEBOOK_FULL_PATH exists before a later cell reads it (e.g. under Run All).
kernel.execute(command);

<IPython.core.display.Javascript object>

In [203]:


# Snapshot this notebook into ../models, tagged with the model name and CV score.
# NOTEBOOK_FULL_PATH is set by the %%javascript cell; only its file name is used,
# so the source path resolves relative to the kernel's working directory.
notebook_file = os.path.basename(NOTEBOOK_FULL_PATH)
snapshot_path = '../models/{}_{}.ipynb'.format(model_name, str(roc_score))
shutil.copyfile(notebook_file, snapshot_path)


'../models/augment_9_times_5_folds_0.8977223362480411.ipynb'