<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Libraries" data-toc-modified-id="Libraries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Libraries</a></span></li><li><span><a href="#Split-Train/Test/Validation" data-toc-modified-id="Split-Train/Test/Validation-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Split Train/Test/Validation</a></span></li><li><span><a href="#Baseline-model" data-toc-modified-id="Baseline-model-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Baseline model</a></span></li><li><span><a href="#Models-Training" data-toc-modified-id="Models-Training-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Models Training</a></span><ul class="toc-item"><li><span><a href="#Light-Gradient-Boosting-Machine-(LGBM)" data-toc-modified-id="Light-Gradient-Boosting-Machine-(LGBM)-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Light Gradient Boosting Machine (LGBM)</a></span></li></ul></li></ul></div>

# Libraries

In [1]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

# Split Train/Test/Validation

In [2]:
# Load the full dataset from disk
data = pd.read_csv('data/eda_dataset_imputed.csv')

# Hold out 30% of the rows, then cut that hold-out in half so that
# test and validation each receive 15% of the original data.
# Fixed random_state values keep both splits reproducible across runs.
app_train, test_1 = train_test_split(
    data,
    test_size=0.30,
    random_state=64,
)
app_test, app_validation = train_test_split(
    test_1,
    test_size=0.5,
    random_state=56,
)

In [3]:
# (rows, columns) of each split, in train / test / validation order
tuple(split.shape for split in (app_train, app_test, app_validation))

((215257, 450), (46127, 450), (46127, 450))

# Models Training

In [4]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, mean_absolute_error
from sklearn.utils.multiclass import type_of_target

import gc

def _separate_labels(frame):
    """Split one dataset into (features without SK_ID_CURR/TARGET, int label array)."""
    labels = np.array(frame['TARGET'].astype(int))
    return frame.drop(['SK_ID_CURR', 'TARGET'], axis=1), labels


def model(model_func, features, test_features, params, validation_features=None, n_folds = 5): 
    """Train a model with K-fold cross validation and score it on held-out data.
    
    For every fold `model_func` is fitted on the fold's training part and 
    evaluated on its validation part. The test (and optional validation) 
    predictions returned by each fold are averaged over the folds and then 
    scored against the true labels.
    
    Parameters
    --------
        model_func (callable): 
            called once per fold as 
            model_func(train_features, train_labels, valid_features, 
                       valid_labels, test_features, validation_features, params)
            and must return the 7-tuple 
            (valid_auc, train_auc, valid_mae, train_mae, 
             feature_importances, test_predictions, validation_predictions).
        features (pd.DataFrame): 
            dataframe of training features to use 
            for training a model. Must include the SK_ID_CURR and TARGET columns.
        test_features (pd.DataFrame): 
            dataframe of testing features to use
            for making predictions with the model. 
            Must include the SK_ID_CURR and TARGET columns.
        params: 
            passed through untouched to `model_func`.
        validation_features (pd.DataFrame, optional): 
            extra hold-out set with the same columns; when None no 
            validation predictions or metrics are produced.
        n_folds (int): 
            number of cross validation folds.
        
    Return
    --------
        feature_importances (pd.DataFrame): 
            dataframe with the feature importances averaged over the folds.
        metrics (pd.DataFrame): 
            one row per fold with the train/valid ROC AUC and MAE reported by 
            `model_func`, plus an 'overall' row. In the 'overall' row the 
            train columns are the mean of the fold scores, while the valid 
            columns hold the score of the fold-averaged predictions on 
            `test_features`.
        validation_metrics (pd.DataFrame or None): 
            single-row dataframe with the ROC AUC and MAE of the fold-averaged 
            predictions on `validation_features`; None when no validation 
            set was given.
        
    """
    
    has_validation = validation_features is not None
    
    # Separate the labels from the features (ids and target are dropped)
    features, labels = _separate_labels(features)
    test_features, test_labels = _separate_labels(test_features)
    if has_validation:
        validation_features, validation_labels = _separate_labels(validation_features)
    
    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)
    if has_validation:
        print('Validation Data Shape: ', validation_features.shape)
    
    # Extract feature names before the column labels are lost
    feature_names = list(features.columns)
    
    # Convert to np arrays
    features = np.array(features)
    test_features = np.array(test_features)
    if has_validation:
        validation_features = np.array(validation_features)
        
    # Create the kfold object
    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = 50)
    
    # Accumulators for the fold-averaged feature importances and predictions
    feature_importance_values = np.zeros(len(feature_names))
    test_predictions = np.zeros(test_features.shape[0])
    if has_validation:
        validation_predictions = np.zeros(validation_features.shape[0])
    
    # Lists for recording validation and training scores
    valid_scores_auc = []
    train_scores_auc = []
    valid_scores_mae = []
    train_scores_mae = []
    
    # Iterate through each fold
    for train_indices, valid_indices in k_fold.split(features):
        
        # Training data for the fold
        train_features, train_labels = features[train_indices], labels[train_indices]
        # Validation data for the fold
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]
        
        (valid_score_auc, train_score_auc, valid_score_mae, train_score_mae,
         feature_importance_values_l, test_predictions_l, validation_predictions_l) = model_func(
            train_features, train_labels, valid_features, valid_labels,
            test_features, validation_features, params)

        # Each fold contributes 1/n_splits of the final averages
        feature_importance_values += feature_importance_values_l / k_fold.n_splits
        test_predictions += test_predictions_l / k_fold.n_splits
        if has_validation:
            validation_predictions += validation_predictions_l / k_fold.n_splits
        
        valid_scores_auc.append(valid_score_auc)
        train_scores_auc.append(train_score_auc)
        valid_scores_mae.append(valid_score_mae)
        train_scores_mae.append(train_score_mae)
        
        # Clean up memory
        gc.enable()
        del train_features, valid_features
        gc.collect()
        
    # Make the feature importance dataframe
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})
    
    # Overall scores of the fold-averaged predictions on the test set
    valid_auc = roc_auc_score(test_labels, test_predictions)
    valid_mae = mean_absolute_error(test_labels, test_predictions)
    
    # Add the overall scores to the metrics
    valid_scores_auc.append(valid_auc)
    train_scores_auc.append(np.mean(train_scores_auc))
    valid_scores_mae.append(valid_mae)
    train_scores_mae.append(np.mean(train_scores_mae))
    
    # Needed for creating dataframe of validation scores
    fold_names = list(range(n_folds))
    fold_names.append('overall')
    
    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train auc': train_scores_auc,
                            'valid auc': valid_scores_auc,
                            'train mae': train_scores_mae,
                            'valid mae': valid_scores_mae}) 

    if has_validation:
        validation_metrics = pd.DataFrame({
            'auc': [roc_auc_score(validation_labels, validation_predictions)],
            'mae': [mean_absolute_error(validation_labels, validation_predictions)]}) 
    else:
        validation_metrics = None

    return feature_importances, metrics, validation_metrics

### Neural Network

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout, BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

from sklearn import metrics
from keras import backend as K
import tensorflow as tf

def auc(y_true, y_pred):
    """Keras metric wrapper around TensorFlow's `tf.metrics.auc`.

    Returns the second element of the pair produced by `tf.metrics.auc`
    after initialising TensorFlow's local variables in the Keras session.

    NOTE(review): the local variables are only initialised here, so the
    reported value appears to accumulate over batches and epochs rather
    than reset each epoch — confirm before comparing it with
    `roc_auc_score`.
    """
    _, auc_update = tf.metrics.auc(y_true, y_pred)
    K.get_session().run(tf.local_variables_initializer())
    return auc_update

def train_NNClassifier(train_features, train_labels, valid_features, valid_labels, test_features, validation_features, params):
    """Fit a five-hidden-layer Keras binary classifier and score it.
    
    Parameters
    --------
        train_features, train_labels: 
            2-D feature array and labels the network is fitted on.
        valid_features, valid_labels: 
            data used as Keras `validation_data` (drives early stopping) 
            and for the returned validation scores.
        test_features: 
            feature array to predict on.
        validation_features: 
            optional second feature array to predict on; may be None.
        params (dict): 
            must provide 'layer1' .. 'layer5' (units per hidden layer), 
            'dropout' (dropout rate) and 'epochs' (maximum epochs).
        
    Return
    --------
        valid_score_auc, train_score_auc (float): ROC AUC on the 
            validation and training data.
        valid_score_mae, train_score_mae (float): mean absolute error 
            on the validation and training data.
        feature_importance_values (np.ndarray): all zeros, one entry 
            per input feature (placeholder, nothing is computed here).
        test_predictions (np.ndarray): predictions for `test_features`.
        validation_predictions (np.ndarray or None): predictions for 
            `validation_features`, None when it was not supplied.
        
    """
    # Create the model
    print( 'Setting up neural network...' )
    n_inputs = train_features.shape[1]
    dropout_rate = params['dropout']
    
    nn = Sequential()
    # Input width follows the data instead of a hard-coded column count
    nn.add(Dense(units = params['layer1'], input_dim = n_inputs))
    nn.add(PReLU())
    nn.add(Dropout(dropout_rate))
    # Remaining hidden layers share one pattern: Dense -> PReLU -> BatchNorm -> Dropout
    for layer_key in ('layer2', 'layer3', 'layer4', 'layer5'):
        nn.add(Dense(units = params[layer_key]))
        nn.add(PReLU())
        nn.add(BatchNormalization())
        nn.add(Dropout(dropout_rate))
    nn.add(Dense(1, activation='sigmoid'))
    nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['mae', auc])
    
    # Callbacks
    callbacks = [EarlyStopping(monitor='val_loss', patience=100)]

    # Train the model
    nn.fit(train_features, train_labels, 
           validation_data=(valid_features, valid_labels), 
           epochs=params['epochs'], 
           callbacks=callbacks,
           batch_size=32,
           verbose=2)

    # No feature importances are computed for the network; return zeros
    # so the result has the shape the cross validation driver accumulates
    feature_importance_values = np.zeros(n_inputs)
    
    # Make predictions
    test_predictions = nn.predict(test_features).flatten().clip(0,1)
    if validation_features is not None:
        validation_predictions = nn.predict(validation_features).flatten().clip(0,1) 
    else:
        validation_predictions = None
        
    # Predict on the fold's own data for scoring
    t_p = nn.predict(train_features).flatten().clip(0,1)
    v_p = nn.predict(valid_features).flatten().clip(0,1)
    
    # Score the fitted network
    valid_score_auc = roc_auc_score(valid_labels, v_p)
    train_score_auc = roc_auc_score(train_labels, t_p)
    valid_score_mae = mean_absolute_error(valid_labels, v_p)
    train_score_mae = mean_absolute_error(train_labels, t_p)

    return valid_score_auc, train_score_auc, valid_score_mae, train_score_mae, feature_importance_values, test_predictions, validation_predictions


Using TensorFlow backend.
  return f(*args, **kwds)


In [None]:
# Hidden-layer widths, epoch cap and dropout rate read by train_NNClassifier
params = dict(
    layer1=400,
    layer2=160,
    layer3=64,
    layer4=26,
    layer5=12,
    epochs=1000,
    dropout=0.3,
)

# No validation_features are passed, so validation_metrics comes back as None.
# NOTE: the name `metrics` rebinds the `sklearn.metrics` module imported in the
# Neural Network cell.
fi, metrics, validation_metrics = model(
    train_NNClassifier,
    app_train,
    app_test,
    params=params,
)
print('Baseline metrics')
print(metrics)

Training Data Shape:  (215257, 448)
Testing Data Shape:  (46127, 448)
Setting up neural network...
Train on 172205 samples, validate on 43052 samples
Epoch 1/1000
 - 70s - loss: 0.3151 - mean_absolute_error: 0.1743 - auc: 0.5058 - val_loss: 0.2791 - val_mean_absolute_error: 0.1499 - val_auc: 0.5052
Epoch 2/1000
 - 68s - loss: 0.2840 - mean_absolute_error: 0.1519 - auc: 0.5053 - val_loss: 0.2786 - val_mean_absolute_error: 0.1518 - val_auc: 0.5063
Epoch 3/1000
 - 68s - loss: 0.2821 - mean_absolute_error: 0.1508 - auc: 0.5080 - val_loss: 0.2779 - val_mean_absolute_error: 0.1505 - val_auc: 0.5100
Epoch 4/1000
 - 68s - loss: 0.2813 - mean_absolute_error: 0.1502 - auc: 0.5117 - val_loss: 0.2776 - val_mean_absolute_error: 0.1464 - val_auc: 0.5135
Epoch 5/1000
 - 68s - loss: 0.2806 - mean_absolute_error: 0.1495 - auc: 0.5159 - val_loss: 0.2774 - val_mean_absolute_error: 0.1463 - val_auc: 0.5179
Epoch 6/1000
 - 69s - loss: 0.2804 - mean_absolute_error: 0.1492 - auc: 0.5199 - val_loss: 0.2774 - 

Epoch 55/1000
 - 67s - loss: 0.2771 - mean_absolute_error: 0.1474 - auc: 0.5749 - val_loss: 0.2755 - val_mean_absolute_error: 0.1448 - val_auc: 0.5750
Epoch 56/1000
 - 66s - loss: 0.2773 - mean_absolute_error: 0.1474 - auc: 0.5752 - val_loss: 0.2759 - val_mean_absolute_error: 0.1471 - val_auc: 0.5753
Epoch 57/1000
 - 68s - loss: 0.2771 - mean_absolute_error: 0.1477 - auc: 0.5755 - val_loss: 0.2760 - val_mean_absolute_error: 0.1445 - val_auc: 0.5756
Epoch 58/1000
 - 66s - loss: 0.2772 - mean_absolute_error: 0.1475 - auc: 0.5758 - val_loss: 0.2754 - val_mean_absolute_error: 0.1464 - val_auc: 0.5760
Epoch 59/1000
 - 69s - loss: 0.2772 - mean_absolute_error: 0.1476 - auc: 0.5761 - val_loss: 0.2755 - val_mean_absolute_error: 0.1464 - val_auc: 0.5763
Epoch 60/1000
 - 68s - loss: 0.2770 - mean_absolute_error: 0.1477 - auc: 0.5764 - val_loss: 0.2757 - val_mean_absolute_error: 0.1452 - val_auc: 0.5766
Epoch 61/1000
 - 69s - loss: 0.2768 - mean_absolute_error: 0.1474 - auc: 0.5767 - val_loss: 0.

Epoch 110/1000
 - 66s - loss: 0.2750 - mean_absolute_error: 0.1473 - auc: 0.5898 - val_loss: 0.2747 - val_mean_absolute_error: 0.1423 - val_auc: 0.5900
Epoch 111/1000
 - 65s - loss: 0.2749 - mean_absolute_error: 0.1468 - auc: 0.5901 - val_loss: 0.2745 - val_mean_absolute_error: 0.1459 - val_auc: 0.5902
Epoch 112/1000
 - 65s - loss: 0.2750 - mean_absolute_error: 0.1471 - auc: 0.5903 - val_loss: 0.2745 - val_mean_absolute_error: 0.1456 - val_auc: 0.5904
Epoch 113/1000
 - 69s - loss: 0.2749 - mean_absolute_error: 0.1468 - auc: 0.5905 - val_loss: 0.2749 - val_mean_absolute_error: 0.1428 - val_auc: 0.5906
Epoch 114/1000
 - 65s - loss: 0.2749 - mean_absolute_error: 0.1469 - auc: 0.5907 - val_loss: 0.2747 - val_mean_absolute_error: 0.1452 - val_auc: 0.5908
Epoch 115/1000
 - 69s - loss: 0.2747 - mean_absolute_error: 0.1469 - auc: 0.5910 - val_loss: 0.2749 - val_mean_absolute_error: 0.1444 - val_auc: 0.5911
Epoch 116/1000
 - 67s - loss: 0.2748 - mean_absolute_error: 0.1469 - auc: 0.5912 - val_l

Epoch 164/1000
 - 66s - loss: 0.2733 - mean_absolute_error: 0.1466 - auc: 0.5996 - val_loss: 0.2749 - val_mean_absolute_error: 0.1404 - val_auc: 0.5997
Epoch 165/1000
 - 67s - loss: 0.2738 - mean_absolute_error: 0.1463 - auc: 0.5998 - val_loss: 0.2744 - val_mean_absolute_error: 0.1487 - val_auc: 0.5998
Epoch 166/1000
 - 68s - loss: 0.2735 - mean_absolute_error: 0.1466 - auc: 0.5999 - val_loss: 0.2758 - val_mean_absolute_error: 0.1475 - val_auc: 0.6000
Epoch 167/1000
 - 66s - loss: 0.2733 - mean_absolute_error: 0.1465 - auc: 0.6001 - val_loss: 0.2753 - val_mean_absolute_error: 0.1421 - val_auc: 0.6001
Epoch 168/1000
 - 74s - loss: 0.2733 - mean_absolute_error: 0.1464 - auc: 0.6002 - val_loss: 0.2751 - val_mean_absolute_error: 0.1416 - val_auc: 0.6003
Epoch 169/1000
 - 69s - loss: 0.2733 - mean_absolute_error: 0.1464 - auc: 0.6004 - val_loss: 0.2745 - val_mean_absolute_error: 0.1438 - val_auc: 0.6004
Epoch 170/1000
 - 69s - loss: 0.2733 - mean_absolute_error: 0.1461 - auc: 0.6005 - val_l

Epoch 2/1000
 - 68s - loss: 0.2830 - mean_absolute_error: 0.1513 - auc: 0.5007 - val_loss: 0.2815 - val_mean_absolute_error: 0.1445 - val_auc: 0.5036
Epoch 3/1000
 - 68s - loss: 0.2809 - mean_absolute_error: 0.1499 - auc: 0.5066 - val_loss: 0.2815 - val_mean_absolute_error: 0.1548 - val_auc: 0.5095
Epoch 4/1000
 - 68s - loss: 0.2804 - mean_absolute_error: 0.1496 - auc: 0.5118 - val_loss: 0.2812 - val_mean_absolute_error: 0.1535 - val_auc: 0.5136
Epoch 5/1000
 - 69s - loss: 0.2799 - mean_absolute_error: 0.1493 - auc: 0.5157 - val_loss: 0.2811 - val_mean_absolute_error: 0.1472 - val_auc: 0.5171
Epoch 6/1000
 - 68s - loss: 0.2796 - mean_absolute_error: 0.1485 - auc: 0.5187 - val_loss: 0.2810 - val_mean_absolute_error: 0.1507 - val_auc: 0.5202
Epoch 7/1000
 - 68s - loss: 0.2793 - mean_absolute_error: 0.1484 - auc: 0.5215 - val_loss: 0.2807 - val_mean_absolute_error: 0.1488 - val_auc: 0.5231
Epoch 8/1000
 - 68s - loss: 0.2791 - mean_absolute_error: 0.1484 - auc: 0.5245 - val_loss: 0.2806 - 

In [None]:
# Wider variant of the network: same keys as before, larger hidden layers
params = dict(
    layer1=600,
    layer2=300,
    layer3=150,
    layer4=80,
    layer5=20,
    epochs=1000,
    dropout=0.3,
)

# No validation_features are passed, so validation_metrics comes back as None.
# NOTE: the name `metrics` rebinds the `sklearn.metrics` module imported in the
# Neural Network cell.
fi, metrics, validation_metrics = model(
    train_NNClassifier,
    app_train,
    app_test,
    params=params,
)
print('Baseline metrics')
print(metrics)