# Deep Learning / LGBM for unbalanced datasets

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, BatchNormalization
from sklearn.model_selection import KFold
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from keras import optimizers
from keras import backend as K
import keras as k
import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [25]:
%%time
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
print('Training df shape', df_train.shape)
print('Test df shape', df_test.shape)

Training df shape (200000, 202)
Test df shape (200000, 201)
Wall time: 30 s


In [26]:
per = len(df_train[df_train['target']>0]) / df_train.shape[0]
print('Percentage of transactions made : ', per*100)

Percentage of transactions made :  10.049


In [27]:
df_ones = df_train[df_train['target']>0]
print('Ones : ', df_ones.shape[0])
df_zeros = df_train[df_train['target']==0].sample(frac=0.109)
print('Zeros : ', df_zeros.shape[0])
df_sampling = pd.concat([df_ones, df_zeros])
print(df_sampling.shape)

Ones :  20098
Zeros :  19609
(39707, 202)


# Simple Feature Engineering

In [28]:
%%time
idx = features = df_train.columns.values[2:202]
for i, df in enumerate([df_train, df_test, df_sampling]):
    df['sum'] = df[idx].sum(axis=1)
    df['min'] = df[idx].sum(axis=1)
    df['max'] = df[idx].max(axis=1)
    df['mean'] = df[idx].mean(axis=1)
    df['std'] = df[idx].std(axis=1)
    df['skew'] = df[idx].skew(axis=1)
    df['kurt'] = df[idx].kurtosis(axis=1)
    df['med'] = df[idx].median(axis=1)
    
    print('Creating percentiles features for df: {}/{}'.format(i+1,3))
    df['perc_5'] =  df[idx].apply(lambda x: np.percentile(x, 10), axis=1)
    df['perc_10'] =  df[idx].apply(lambda x: np.percentile(x, 10), axis=1)
    df['perc_25'] =  df[idx].apply(lambda x: np.percentile(x, 25), axis=1)
    df['perc_50'] =  df[idx].apply(lambda x: np.percentile(x, 50), axis=1)
    df['perc_75'] =  df[idx].apply(lambda x: np.percentile(x, 75), axis=1)
    df['perc_95'] =  df[idx].apply(lambda x: np.percentile(x, 99), axis=1)
    df['perc_99'] =  df[idx].apply(lambda x: np.percentile(x, 99), axis=1)    

Creating percentiles features for df: 1/3
Creating percentiles features for df: 2/3
Creating percentiles features for df: 3/3
Wall time: 11min 22s


In [31]:
X = df_train.iloc[:,2:]
y = df_train['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=6666)

X_smple = df_sampling.iloc[:,2:]
y_smple = df_sampling['target']
X_train_smple, X_test_smple, y_train_smple, y_test_smple = train_test_split(X_smple, y_smple, test_size=0.3, random_state=66)

# 1.0 LGBM Model

In [34]:
#Model LGBM 
def create_model_lgbm(X_train,y_train,X_val=None,y_val=None):
    dtrain = lgb.Dataset(X_train,label=y_train)
    dval = lgb.Dataset(X_val,label=y_val)
    param = {
    'bagging_freq': 3,
    'bagging_fraction': 0.5,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.1,
    'learning_rate': 0.01,
    'max_depth': 2,  
    'metric':'auc',
    'min_data_in_leaf': 100,
    'num_leaves': 35,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary', 
    'verbosity': 1
    }
    if not X_val is None:
        valid_sets = (dtrain,dval)
        valid_names = ['train','valid']
    else:
        valid_sets = (dtrain)
        valid_names = ['train']
    model = lgb.train(param,dtrain,num_boost_round=50000,valid_sets=valid_sets,valid_names=['train','valid'],
                      verbose_eval=1000,
                     early_stopping_rounds=1000)
    return model

lgbm_X = X
lgbm_y = y
lgbm_test_x = X_test
lgbm_test_y = y_test
val_pred = np.zeros(len(lgbm_X))
full_pred = np.zeros(len(X))
test_pred_lgbm = np.zeros(len(lgbm_test_y))
target_pred = np.zeros(len(df_test))
kf = KFold(n_splits=5,random_state=67)
for _fold, (trn_idx, val_idx) in enumerate(kf.split(lgbm_X.values, lgbm_y.values)):   
        Xtrn, ytrn = lgbm_X.iloc[trn_idx], lgbm_y.iloc[trn_idx]
        Xval, y_val = lgbm_X.iloc[val_idx], lgbm_y.iloc[val_idx]
        print("Fold num:{}".format(_fold + 1))
        clf = create_model_lgbm(Xtrn,ytrn,Xval,y_val)
        val_pred[val_idx] = clf.predict(lgbm_X.iloc[val_idx])
        test_pred_lgbm  += clf.predict(lgbm_test_x) / kf.n_splits
        full_pred += clf.predict(X) / kf.n_splits
        target_pred += clf.predict(df_test.iloc[:,1:]) / kf.n_splits
    
print('Full Training Data Score : ', roc_auc_score(y, full_pred))
print('Val CV score : ', roc_auc_score(lgbm_y, val_pred))
print('Test CV score : ', roc_auc_score(lgbm_test_y, test_pred_lgbm))

Fold num:1
Training until validation scores don't improve for 1000 rounds.
[1000]	train's auc: 0.835568	valid's auc: 0.822852
[2000]	train's auc: 0.868401	valid's auc: 0.852864
[3000]	train's auc: 0.884656	valid's auc: 0.867356
[4000]	train's auc: 0.894401	valid's auc: 0.875657
[5000]	train's auc: 0.900871	valid's auc: 0.881049
[6000]	train's auc: 0.905744	valid's auc: 0.885144
[7000]	train's auc: 0.909325	valid's auc: 0.888263
[8000]	train's auc: 0.912164	valid's auc: 0.8906
[9000]	train's auc: 0.914421	valid's auc: 0.89245
[10000]	train's auc: 0.916374	valid's auc: 0.893909
[11000]	train's auc: 0.917959	valid's auc: 0.895066
[12000]	train's auc: 0.919378	valid's auc: 0.896049
[13000]	train's auc: 0.920671	valid's auc: 0.896709
[14000]	train's auc: 0.921794	valid's auc: 0.897166
[15000]	train's auc: 0.922837	valid's auc: 0.897622
[16000]	train's auc: 0.923792	valid's auc: 0.89797
[17000]	train's auc: 0.924719	valid's auc: 0.898186
[18000]	train's auc: 0.925611	valid's auc: 0.898331
[1

KeyboardInterrupt: 

In [None]:
num_features = 60
indxs = np.argsort(clf.feature_importance())[:num_features]
feature_imp = pd.DataFrame(sorted(zip(clf.feature_importance()[indxs], 
                                      X.columns[indxs])), columns=['Value','Feature'])

plt.figure(figsize=(20,10))
sns.barplot(x='Value', y='Feature', data=feature_imp.sort_values(by='Value', ascending=False))
plt.title('Top {} LightGBM Features'.format(num_features))
plt.tight_layout()
plt.show()

# 2.0 Keras DNN Model

In [None]:
from keras.kallbacks import EarlyStopping

def auc(y_true, y_pred):
    return tf.py_func(roc_auc_score, (y_true, y_pred), tf.double)

def create_model_nn(in_dim, layer_size=120):
    model = Sequential()
    model.add(Dense(layer_size, input_dim=in_dim))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(layer_size))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(layer_size))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(layer_size))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dense(1, activation='sigmoid'))
    adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics = [auc])
    
    return model

model_nn = create_model_nn(X_train.shape[1])
callback = EarlyStopping('val_loss', patience=6, verbose=0, mode='auto')
history = model_nn.fit(X_train, y_train, 
                       validation_data = [X_test, y_test], 
                       epochs=100, batch_size=256, 
                       verbose=1, callbacks = [callback])

target_pred_nn = model_nn.predict(df_test.iloc[:,1:])[:,0]
print('\nNon-Sampled Validation Max score : {}'.format(np.max(history.history['val_auc'])))