In [1]:
import numpy as np
import pandas as pd 
import os
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

In [2]:
# Whole dataset in a single frame.
data = pd.read_csv("whole_data.csv")

# Feature matrices: the first row of each file is used as the header.
x_train, x_test = (
    pd.read_csv(csv_path) for csv_path in ("x_train.csv", "x_test.csv")
)
# Label files: header=None, so the first row is read as data.
y_train, y_test = (
    pd.read_csv(csv_path, header=None) for csv_path in ("y_train.csv", "y_test.csv")
)

# Print the dimensions of every split as a quick sanity check.
for caption, frame in (
    ("x_train shape: ", x_train),
    ("y_train shape: ", y_train),
    ("x_test shape: ", x_test),
    ("y_test shape: ", y_test),
):
    print(caption, frame.shape)

x_train shape:  (100000, 200)
y_train shape:  (100000, 1)
x_test shape:  (100000, 200)
y_test shape:  (100000, 1)


# Feature Engineering - Statistics

In [3]:
# Add per-row summary statistics (sum, min, max, ..., percentiles) as features.
#
# `idx` / `features` capture the raw column names of x_train *before* any
# statistic column is appended, so every statistic is computed over the raw
# columns only. x_test is assumed to carry the same column names.
idx = features = x_train.columns.values[:]

# Percentile levels exported as 'perc_<q>' columns, in insertion order.
PERCENTILES = (5, 10, 25, 50, 75, 85, 95, 99)

for df in [x_test, x_train]:
    # Snapshot of the raw columns; taken once so the columns appended below
    # can never leak into later statistics.
    raw = df[idx]
    raw_values = raw.values

    df['sum'] = raw.sum(axis=1)
    df['min'] = raw.min(axis=1)
    df['max'] = raw.max(axis=1)
    df['mean'] = raw.mean(axis=1)
    df['std'] = raw.std(axis=1)
    df['skew'] = raw.skew(axis=1)
    df['kurt'] = raw.kurtosis(axis=1)
    df['med'] = raw.median(axis=1)

    # Vectorised equivalent of the former row-wise np.ma.average(row):
    # with no mask and no weights that is the plain arithmetic mean.
    df['ma'] = raw_values.mean(axis=1)

    # One vectorised call computes every percentile for every row instead of
    # one Python-level lambda call per row and per percentile.
    # Result shape: (len(PERCENTILES), n_rows).
    row_percentiles = np.percentile(raw_values, PERCENTILES, axis=1)
    for q, column in zip(PERCENTILES, row_percentiles):
        df['perc_{}'.format(q)] = column

In [4]:
# Report the dimensions of each split and the number of raw feature names.
for tag, shape in (
    ('x_train.shape', x_train.shape),
    ('x_test.shape', x_test.shape),
    ('y_train.shape', y_train.shape),
    ('y_test.shape', y_test.shape),
):
    print(tag, shape)
print('features.len', len(features))

x_train.shape (100000, 217)
x_test.shape (100000, 217)
y_train.shape (100000, 1)
y_test.shape (100000, 1)
features.len 200


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.utils import shuffle
    
def train_test_split(data, random_state=0):
    """Split `data` 50/50 into stratified train and test sets.

    NOTE: this function shadows `sklearn.model_selection.train_test_split`,
    which is imported just above it in the same cell.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'target' column. An 'ID_code' column, if present, is
        excluded from the features.
    random_state : int or None, default 0
        Seed used for both the initial shuffle and the stratified splitter,
        so that repeated calls produce the same split.

    Returns
    -------
    x_train, y_train, x_test, y_test
        Feature frames (every column except 'ID_code' and 'target') and the
        matching 'target' Series.
    """
    # Seed the shuffle as well; an unseeded shuffle makes the split differ
    # between runs even though the splitter itself is seeded.
    whole_data = shuffle(data, random_state=random_state)
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=random_state)

    # n_splits=1, so the splitter yields exactly one (train, test) index pair.
    train_index, test_index = next(sss.split(whole_data, whole_data['target'].values))
    print("TRAIN:", train_index, "TEST:", test_index)
    train, test = whole_data.iloc[train_index], whole_data.iloc[test_index]

    # Select features by name rather than by position, so the result does not
    # depend on 'ID_code' and 'target' being the first two columns.
    features = [column for column in whole_data.columns if column not in ['ID_code', 'target']]
    x_train = train[features]
    y_train = train['target']
    x_test = test[features]
    y_test = test['target']

    print("x_train shape: ", x_train.shape)
    print("y_train shape: ", y_train.shape)
    print("x_test shape: ", x_test.shape)
    print("y_test shape: ", y_test.shape)
    print("features", len(features))

    # Class balance of each split (count of 0s, then count of 1s).
    print(y_train[y_train==0].shape)
    print(y_train[y_train==1].shape)
    print("-----------------------------")
    print(y_test[y_test==0].shape)
    print(y_test[y_test==1].shape)

    return x_train, y_train, x_test, y_test

In [7]:
import time
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb


def eda_modle_eva(params, x_train, y_train, x_test,
                  num_folds=5, num_round=1000000, early_stopping_rounds=3500):
    """Train LightGBM with stratified K-fold CV and predict on `x_test`.

    Parameters
    ----------
    params : dict
        LightGBM training parameters, passed unchanged to `lgb.train`.
    x_train : pandas.DataFrame
        Training features; columns named 'ID_code' or 'target' are ignored.
    y_train : pandas.Series or single-column pandas.DataFrame
        Training labels, row-aligned with `x_train`.
    x_test : pandas.DataFrame
        Frame to predict on; must contain the feature columns of `x_train`.
    num_folds : int, default 5
        Number of stratified folds.
    num_round : int, default 1000000
        Upper bound on boosting rounds per fold.
    early_stopping_rounds : int, default 3500
        Rounds without validation improvement before a fold stops.

    Returns
    -------
    predictions : numpy.ndarray
        Mean of the per-fold predictions on `x_test`.
    oof : numpy.ndarray
        Out-of-fold predictions for every row of `x_train`.
    feature_importance_df : pandas.DataFrame
        Columns 'Feature', 'importance', 'fold'; one row per feature per fold
        (all folds, not just the last one).
    clf : lightgbm.Booster
        The booster trained on the last fold.

    Side effects
    ------------
    Saves each fold's model to '<fold:03d>.txt' in the working directory.
    """
    # shuffle=False gives deterministic folds, so no random_state is passed:
    # it would have no effect, and recent scikit-learn versions raise a
    # ValueError when random_state is combined with shuffle=False.
    folds = StratifiedKFold(n_splits=num_folds, shuffle=False)
    oof = np.zeros(len(x_train))
    predictions = np.zeros(len(x_test))
    fold_importances = []

    features = [column for column in x_train.columns if column not in ['ID_code', 'target']]
    train_features = x_train[features]
    test_features = x_test[features]
    # Flatten so that a Series and an (n, 1) DataFrame are handled alike.
    labels = np.ravel(y_train.values)

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_features.values, labels)):
        print("Fold {}".format(fold_))
        trn_data = lgb.Dataset(train_features.iloc[trn_idx], label=labels[trn_idx])
        val_data = lgb.Dataset(train_features.iloc[val_idx], label=labels[val_idx])

        # NOTE(review): newer LightGBM releases appear to expect logging and
        # early stopping to be configured through `callbacks` instead of these
        # keyword arguments - confirm against the installed version.
        clf = lgb.train(params, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = early_stopping_rounds)
        oof[val_idx] = clf.predict(train_features.iloc[val_idx], num_iteration=clf.best_iteration)

        clf.save_model('{:03d}.txt'.format(fold_))

        fold_importance_df = pd.DataFrame()
        fold_importance_df["Feature"] = features
        fold_importance_df["importance"] = clf.feature_importance()
        fold_importance_df["fold"] = fold_ + 1
        fold_importances.append(fold_importance_df)

        # Each fold contributes an equal share to the averaged test prediction.
        predictions += clf.predict(test_features, num_iteration=clf.best_iteration) / folds.n_splits

    # Concatenate once after the loop instead of growing a frame per fold.
    feature_importance_df = pd.concat(fold_importances, axis=0)

    return predictions, oof, feature_importance_df, clf

In [8]:
# Baseline LightGBM configuration for the first CV run.
params = {
    # Core Parameters
    'objective': 'binary',
    'boosting': 'gbdt',
    'learning_rate': 0.01,
    'num_leaves': 31,
    'tree_learner': 'serial',
    'num_threads': 0,
    'seed': 42,
    
    # Learning Control Parameters
    'max_depth': -1,
    'min_data_in_leaf': 20,
    'min_sum_hessian_in_leaf': 1e-3,  
    'bagging_fraction': 1.0,
    'bagging_freq': 0,
    'feature_fraction': 1.0,
    'lambda_l1': 0.0,
    'lambda_l2': 0.0,
    'bagging_seed': 42,
    
    # Others
    # The key must be exactly 'verbosity': with a trailing space it is a
    # different, unrecognised key rather than the verbosity setting.
    'verbosity': 1,
    'boost_from_average': True,
    'metric': 'auc',
}

# Run the cross-validated training and collect test / out-of-fold predictions.
predictions, oof, fold_importance_df, clf = eda_modle_eva(params, x_train, y_train, x_test)

Fold 0
Training until validation scores don't improve for 3500 rounds.
[1000]	training's auc: 0.96117	valid_1's auc: 0.869474
[2000]	training's auc: 0.987171	valid_1's auc: 0.882404
[3000]	training's auc: 0.996267	valid_1's auc: 0.885565
[4000]	training's auc: 0.998927	valid_1's auc: 0.885669
[5000]	training's auc: 0.999688	valid_1's auc: 0.885642
[6000]	training's auc: 0.999895	valid_1's auc: 0.885239
[7000]	training's auc: 0.999959	valid_1's auc: 0.884706
Early stopping, best iteration is:
[3616]	training's auc: 0.998278	valid_1's auc: 0.88587
Fold 1
Training until validation scores don't improve for 3500 rounds.
[1000]	training's auc: 0.960572	valid_1's auc: 0.876451
[2000]	training's auc: 0.986973	valid_1's auc: 0.89002
[3000]	training's auc: 0.99624	valid_1's auc: 0.893014
[4000]	training's auc: 0.998923	valid_1's auc: 0.893056
[5000]	training's auc: 0.999676	valid_1's auc: 0.892869
[6000]	training's auc: 0.99989	valid_1's auc: 0.892503
Early stopping, best iteration is:
[3285]	tr

In [9]:
from sklearn.metrics import roc_auc_score
# AUC of the out-of-fold predictions on the training set (cross-validation score).
print("CV score: {:<8.5f}".format(roc_auc_score(y_train, oof)))
# AUC of the fold-averaged predictions on the held-out test set; labelled
# separately so the two numbers cannot be confused.
print("Test score: {:<8.5f}".format(roc_auc_score(y_test, predictions)))

CV score: 0.88650 
CV score: 0.89391 
