In [1]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('data/eda_dataset_imputed.csv')

app_train, test_1 = train_test_split(data, test_size=0.30, random_state=64)
app_test, app_validation = train_test_split(test_1, test_size=0.5, random_state=56)

In [3]:
train_labels = app_train['TARGET']
test_labels = app_test['TARGET']

In [4]:
from sklearn.preprocessing import MinMaxScaler, Imputer

# Drop the target from the training data
if 'TARGET' in app_train:
    train = app_train.drop(['TARGET'], axis=1)
else:
    train = app_train.copy()

    
if 'TARGET' in app_test:
    test = app_test.drop(['TARGET'], axis=1)
else:
    test = app_test.copy()

    
# Feature names
features = list(train.columns)

# Median imputation of missing values
imputer = Imputer(strategy = 'median')

# Scale each feature to 0-1
scaler = MinMaxScaler(feature_range = (0, 1))

# Fit on the training data
imputer.fit(train)

# Transform both training and testing data
train = imputer.transform(train)
test = imputer.transform(test)

# Repeat with the scaler
scaler.fit(train)
train = scaler.transform(train)
test = scaler.transform(test)

print('Training data shape: ', train.shape)
print('Testing data shape: ', test.shape)

Training data shape:  (215257, 449)
Testing data shape:  (46127, 449)


In [80]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, mean_absolute_error
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout, BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

import gc

def modelNeuralNetwork(features, test_features, params, n_folds = 5): 
    """Train and test a Neural Network model using cross validation. 
    
    Parameters
    --------
        features (pd.DataFrame): 
            dataframe of training features to use 
            for training a model. Must include the TARGET column.
        test_features (pd.DataFrame): 
            dataframe of testing features to use
            for making predictions with the model. 
        
    Return
    --------
        feature_importances (pd.DataFrame): 
            dataframe with the feature importances from the model.
        valid_metrics (pd.DataFrame): 
            dataframe with training and validation metrics (ROC AUC) for each fold and overall.
        
    """
    
    # Extract the ids
    train_ids = features['SK_ID_CURR']
    test_ids = test_features['SK_ID_CURR']
    
    # Extract the labels for training
    labels = np.array(features['TARGET'].astype(int))
    
    # Remove the ids and target
    features = features.drop(['SK_ID_CURR', 'TARGET'], axis=1)
    test_features = test_features.drop(['SK_ID_CURR', 'TARGET'], axis=1)
    
    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)
    
    # Extract feature names
    feature_names = list(features.columns)
    
    # Convert to np arrays
    features = np.array(features)
    test_features = np.array(test_features)
    
    # Create the kfold object
    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = 50)
    
    # Empty array for feature importances
    feature_importance_values = np.zeros(len(feature_names))
    
    # Empty array for test predictions
    test_predictions = np.zeros(test_features.shape[0])
    
    # Empty array for out of fold validation predictions
    out_of_fold = np.zeros(features.shape[0])
    
    # Lists for recording validation and training scores
    valid_scores_auc = []
    train_scores_auc = []
    valid_scores_mae = []
    train_scores_mae = []
    
    # Iterate through each fold
    i = 0
    for train_indices, valid_indices in k_fold.split(features):
        i+=1
        print("Training model", i)
        # Training data for the fold
        train_features, train_labels = features[train_indices], labels[train_indices]
        # Validation data for the fold
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]
        
        # Create the model
        print( 'Setting up neural network...' )
        nn = Sequential()
        nn.add(Dense(units = 400 , input_dim = 448))
        nn.add(PReLU())
        nn.add(Dropout(params['dropout']))
        nn.add(Dense(units = 160 ))
        nn.add(PReLU())
        nn.add(BatchNormalization())
        nn.add(Dropout(params['dropout']))
        nn.add(Dense(units = 64))
        nn.add(PReLU())
        nn.add(BatchNormalization())
        nn.add(Dropout(params['dropout']))
        nn.add(Dense(units = 26))
        nn.add(PReLU())
        nn.add(BatchNormalization())
        nn.add(Dropout(params['dropout']))
        nn.add(Dense(units = 12))
        nn.add(PReLU())
        nn.add(BatchNormalization())
        nn.add(Dropout(params['dropout']))
        nn.add(Dense(1, activation='sigmoid'))
        nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['mae'])
        
        # Callbacks
        callbacks = [EarlyStopping(monitor='val_loss', patience=2)]
        
        # Train the model
        nn.fit(train_features, train_labels, 
               validation_data=(valid_features, valid_labels), 
               epochs=params['epochs'], 
               callbacks=callbacks,
               verbose=2)
        
        # Record the best iteration
        # best_iteration = model.best_iteration
        
        # Record the feature importances
        #feature_importance_values += model.feature_importances_ / k_fold.n_splits
        
        # Make predictions
        #test_predictions += model.predict_proba(test_features, num_iteration = best_iteration)[:, 1] / k_fold.n_splits
        test_predictions += nn.predict(test_features).flatten().clip(0,1) / k_fold.n_splits
        
        # Record the out of fold predictions
        out_of_fold[valid_indices] = nn.predict(valid_features).flatten().clip(0,1)
        
        t_p = nn.predict(train_features).flatten().clip(0,1)
        v_p = nn.predict(valid_features).flatten().clip(0,1)
        
        # Record the best score
        valid_score_auc = roc_auc_score(valid_labels, v_p)
        train_score_auc = roc_auc_score(train_labels, t_p)
        valid_score_mae = mean_absolute_error(valid_labels, v_p)
        train_score_mae = mean_absolute_error(train_labels, t_p)
        
        valid_scores_auc.append(valid_score_auc)
        train_scores_auc.append(train_score_auc)
        valid_scores_mae.append(valid_score_mae)
        train_scores_mae.append(train_score_mae)
        
        print("valid's mae: ", valid_score_mae,"	valid's auc: ", valid_score_auc, "	train's mae: ", train_score_mae, "	train's auc: ", train_score_auc)
        # Clean up memory
        gc.enable()
        del nn, train_features, valid_features
        gc.collect()
        
    # Make the feature importance dataframe
    #feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})
    
    # Overall validation score
    valid_auc = roc_auc_score(labels, out_of_fold)
    valid_mae = mean_absolute_error(labels, out_of_fold)
    
    # Add the overall scores to the metrics
    valid_scores_auc.append(valid_auc)
    train_scores_auc.append(np.mean(train_scores_auc))
    valid_scores_mae.append(valid_mae)
    train_scores_mae.append(np.mean(train_scores_mae))
    
    # Needed for creating dataframe of validation scores
    fold_names = list(range(n_folds))
    fold_names.append('overall')
    
    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train auc': train_scores_auc,
                            'valid auc': valid_scores_auc,
                            'train mae': train_scores_mae,
                            'valid mae': valid_scores_mae}) 
    
    return None, metrics #feature_importances, metrics

In [52]:
def plot_feature_importances(df):
    """
    Plot importances returned by a model. This can work with any measure of
    feature importance provided that higher importance is better. 
    
    Args:
        df (dataframe): feature importances. Must have the features in a column
        called `features` and the importances in a column called `importance
        
    Returns:
        shows a plot of the 15 most importance features
        
        df (dataframe): feature importances sorted by importance (highest to lowest) 
        with a column for normalized importance
        """
    
    # Sort features according to importance
    df = df.sort_values('importance', ascending = False).reset_index()
    
    # Normalize the feature importances to add up to one
    df['importance_normalized'] = df['importance'] / df['importance'].sum()

    # Make a horizontal bar chart of feature importances
    plt.figure(figsize = (10, 6))
    ax = plt.subplot()
    
    # Need to reverse the index to plot most important on top
    ax.barh(list(reversed(list(df.index[:15]))), 
            df['importance_normalized'].head(15), 
            align = 'center', edgecolor = 'k')
    
    # Set the yticks and labels
    ax.set_yticks(list(reversed(list(df.index[:15]))))
    ax.set_yticklabels(df['feature'].head(15))
    
    # Plot labeling
    plt.xlabel('Normalized Importance'); plt.title('Feature Importances')
    plt.show()
    
    return df

In [None]:
params = {
    "epochs": 100,
    "dropout": 0.3
}

fi, metrics = modelNeuralNetwork(app_train, app_test, params=params)
print('Baseline metrics')
print(metrics)


Training Data Shape:  (215257, 448)
Testing Data Shape:  (46127, 448)
Training model 1
Setting up neural network...
Train on 172205 samples, validate on 43052 samples
Epoch 1/100
 - 68s - loss: 0.3147 - mean_absolute_error: 0.1742 - val_loss: 0.2791 - val_mean_absolute_error: 0.1501
Epoch 2/100
 - 66s - loss: 0.2841 - mean_absolute_error: 0.1520 - val_loss: 0.2781 - val_mean_absolute_error: 0.1510
Epoch 3/100
 - 66s - loss: 0.2820 - mean_absolute_error: 0.1508 - val_loss: 0.2781 - val_mean_absolute_error: 0.1524
Epoch 4/100
 - 66s - loss: 0.2811 - mean_absolute_error: 0.1500 - val_loss: 0.2780 - val_mean_absolute_error: 0.1503
Epoch 5/100
 - 66s - loss: 0.2807 - mean_absolute_error: 0.1495 - val_loss: 0.2776 - val_mean_absolute_error: 0.1497
Epoch 6/100
 - 66s - loss: 0.2803 - mean_absolute_error: 0.1493 - val_loss: 0.2776 - val_mean_absolute_error: 0.1425
Epoch 7/100
 - 66s - loss: 0.2802 - mean_absolute_error: 0.1489 - val_loss: 0.2776 - val_mean_absolute_error: 0.1525
Epoch 8/100
 -