In [15]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

In [16]:
# Read the imputed EDA dataset from disk.
data = pd.read_csv('data/eda_dataset_imputed.csv')

# Hold out 30% of the rows, then cut that hold-out in half to obtain the
# test and validation partitions. Both splits are seeded for repeatability.
app_train, test_1 = train_test_split(
    data,
    test_size=0.30,
    random_state=64,
)
app_test, app_validation = train_test_split(
    test_1,
    test_size=0.5,
    random_state=56,
)

In [17]:
# TARGET columns of the train and test partitions, kept as pandas Series.
train_labels, test_labels = app_train['TARGET'], app_test['TARGET']

In [27]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, mean_absolute_error
from sklearn.utils.multiclass import type_of_target

import gc

def model(model_funcs, features, test_features, params_funcs, validation_features=None, n_folds = 5):
    """Train an averaged ensemble with K-fold cross validation and score it.

    For every fold, each function in ``model_funcs`` is trained on the fold's
    training rows (the fold's held-out rows are handed to it as its ``valid``
    set) and asked to predict on the test set and, when supplied, on the
    validation set. Predictions are averaged across the model functions
    within a fold, and across folds for the overall score.

    Parameters
    --------
        model_funcs (sequence of callables):
            training functions called as
            ``f(train_features, train_labels, valid_features, valid_labels,
            test_features, validation_features, params)`` and returning
            ``(test_predictions, validation_predictions)``.
        features (pd.DataFrame):
            dataframe of training features.
            Must include the SK_ID_CURR and TARGET columns.
        test_features (pd.DataFrame):
            dataframe of testing features.
            Must include the SK_ID_CURR and TARGET columns.
        params_funcs (sequence of dict):
            one parameter object per entry of ``model_funcs``, in the same order.
        validation_features (pd.DataFrame, optional):
            extra held-out dataframe with the same columns as ``test_features``.
            When None, no validation metrics are computed.
        n_folds (int):
            number of cross validation folds.

    Return
    --------
        metrics (pd.DataFrame):
            one row per fold plus a final 'overall' row, with columns
            'fold', 'test auc' and 'test mae'; 'validation auc' and
            'validation mae' are included only when ``validation_features``
            is given.

    Raises
    --------
        ValueError: if ``model_funcs`` is empty or its length differs from
            that of ``params_funcs``.
    """

    # zip() would silently truncate on a length mismatch while the average
    # below still divides by the number of model functions, so check up front.
    model_funcs = list(model_funcs)
    params_funcs = list(params_funcs)
    if not model_funcs:
        raise ValueError('model_funcs must contain at least one training function')
    if len(model_funcs) != len(params_funcs):
        raise ValueError('model_funcs and params_funcs must have the same length')
    n_models = len(model_funcs)
    has_validation = validation_features is not None

    non_feature_columns = ['SK_ID_CURR', 'TARGET']

    # Extract the labels
    labels = np.array(features['TARGET'].astype(int))
    test_labels = np.array(test_features['TARGET'].astype(int))
    if has_validation:
        validation_labels = np.array(validation_features['TARGET'].astype(int))

    # Remove the ids and target
    features = features.drop(non_feature_columns, axis=1)
    test_features = test_features.drop(non_feature_columns, axis=1)
    if has_validation:
        validation_features = validation_features.drop(non_feature_columns, axis=1)

    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)
    if has_validation:
        print('Validation Data Shape: ', validation_features.shape)

    # Convert to np arrays
    features = np.array(features)
    test_features = np.array(test_features)
    if has_validation:
        validation_features = np.array(validation_features)

    # Create the kfold object
    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = 50)

    # Fold-averaged predictions on the held-out sets
    test_predictions = np.zeros(test_features.shape[0])
    if has_validation:
        validation_predictions = np.zeros(validation_features.shape[0])

    # Per-fold scores; the overall score is appended after the loop
    test_scores_auc = []
    test_scores_mae = []
    validation_scores_auc = []
    validation_scores_mae = []

    # Iterate through each fold
    for train_indices, valid_indices in k_fold.split(features):

        # Training data for the fold
        train_features, train_labels = features[train_indices], labels[train_indices]
        # Held-out data for the fold, passed to each model function as its valid set
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]

        # Ensemble (mean over model functions) predictions for this fold
        fold_test_predictions = np.zeros(test_features.shape[0])
        if has_validation:
            fold_validation_predictions = np.zeros(validation_features.shape[0])

        for model_func, params in zip(model_funcs, params_funcs):
            test_predictions_l, validation_predictions_l = model_func(train_features, train_labels, valid_features, valid_labels, test_features, validation_features, params)

            fold_test_predictions += test_predictions_l / n_models
            if has_validation:
                fold_validation_predictions += validation_predictions_l / n_models

        # Accumulate the fold's contribution to the overall prediction
        test_predictions += fold_test_predictions / k_fold.n_splits
        test_scores_auc.append(roc_auc_score(test_labels, fold_test_predictions))
        test_scores_mae.append(mean_absolute_error(test_labels, fold_test_predictions))

        if has_validation:
            validation_predictions += fold_validation_predictions / k_fold.n_splits
            validation_scores_auc.append(roc_auc_score(validation_labels, fold_validation_predictions))
            validation_scores_mae.append(mean_absolute_error(validation_labels, fold_validation_predictions))

        # Clean up memory
        gc.enable()
        del train_features, valid_features
        gc.collect()

    # Overall scores from the fold-averaged predictions
    test_scores_auc.append(roc_auc_score(test_labels, test_predictions))
    test_scores_mae.append(mean_absolute_error(test_labels, test_predictions))
    if has_validation:
        validation_scores_auc.append(roc_auc_score(validation_labels, validation_predictions))
        validation_scores_mae.append(mean_absolute_error(validation_labels, validation_predictions))

    # Row labels: one per fold plus the overall row
    fold_names = list(range(n_folds))
    fold_names.append('overall')

    # Assemble the metrics table; validation columns exist only when a
    # validation set was supplied.
    columns = {'fold': fold_names,
               'test auc': test_scores_auc}
    if has_validation:
        columns['validation auc'] = validation_scores_auc
    columns['test mae'] = test_scores_mae
    if has_validation:
        columns['validation mae'] = validation_scores_mae
    metrics = pd.DataFrame(columns)

    return metrics

In [24]:
import lightgbm as lgb

def train_LGBMClassifier(train_features, train_labels, valid_features, valid_labels, test_features, validation_features, params):
    """Fit a LightGBM binary classifier with early stopping and predict.

    Parameters
    --------
        train_features, train_labels:
            data the classifier is fitted on.
        valid_features, valid_labels:
            evaluation data reported under the name 'valid' during fitting.
        test_features:
            rows to predict on.
        validation_features:
            optional second set of rows to predict on; may be None.
        params (dict):
            must provide the keys "boosting", "learning_rate", "n_estimators",
            "reg_alpha", "reg_lambda", "drop_rate", "num_leaves", "max_depth",
            "max_bin" and "early_stopping_rounds".

    Return
    --------
        test_predictions (np.ndarray):
            second column of ``predict_proba`` on ``test_features``.
        validation_predictions (np.ndarray or None):
            same for ``validation_features``, or None when it was not given.
    """
    # Create the model. The local is named ``clf`` so it does not shadow the
    # notebook-level ``model`` function.
    # NOTE(review): subsample is hard-coded to 0.8 and no subsample_freq is
    # set; confirm against the LightGBM docs whether bagging is active here.
    clf = lgb.LGBMClassifier(application="binary", boosting_type=params["boosting"],
                      learning_rate=params["learning_rate"],n_estimators=params["n_estimators"],
                      reg_alpha = params["reg_alpha"], reg_lambda = params["reg_lambda"], 
                      drop_rate=params["drop_rate"],
                      num_leaves=params["num_leaves"], max_depth=params["max_depth"],
                      max_bin=params["max_bin"],
                      subsample = 0.8, n_jobs = -1, random_state = 50)

    # Train the model, evaluating on both the held-out and the training data
    clf.fit(train_features, train_labels, eval_metric = ['auc', 'mae'],
            eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
            eval_names = ['valid', 'train'],
            early_stopping_rounds = params["early_stopping_rounds"], verbose = 200)

    # Predict with the iteration chosen by early stopping
    best_iteration = clf.best_iteration_

    test_predictions = clf.predict_proba(test_features, num_iteration = best_iteration)[:, 1]
    validation_predictions = None
    if validation_features is not None:
        validation_predictions = clf.predict_proba(validation_features, num_iteration = best_iteration)[:, 1]

    return test_predictions, validation_predictions

In [25]:
import xgboost as xgb
def train_XGBoostClassifier(train_features, train_labels, valid_features, valid_labels, test_features, validation_features, params):
    """Fit an XGBoost binary classifier with early stopping and predict.

    Parameters
    --------
        train_features, train_labels:
            data the classifier is fitted on (also the first evaluation set).
        valid_features, valid_labels:
            second evaluation set passed to ``fit``.
        test_features:
            rows to predict on.
        validation_features:
            optional second set of rows to predict on; may be None.
        params (dict):
            must provide the keys "learning_rate", "n_estimators",
            "max_depth", "min_child_weight", "subsample",
            "colsample_bytree" and "early_stopping_rounds".

    Return
    --------
        test_predictions (np.ndarray):
            second column of ``predict_proba`` on ``test_features``.
        validation_predictions (np.ndarray or None):
            same for ``validation_features``, or None when it was not given.
    """
    # Create the model. The local is named ``clf`` so it does not shadow the
    # notebook-level ``model`` function.
    clf = xgb.XGBClassifier(learning_rate =params["learning_rate"], 
                            n_estimators=params["n_estimators"], 
                            max_depth=params["max_depth"], 
                            min_child_weight=params["min_child_weight"], 
                            subsample=params["subsample"], 
                            colsample_bytree=params["colsample_bytree"], 
                            objective= 'binary:logistic', 
                            nthread=4, 
                            scale_pos_weight=2, 
                            seed=27)

    # Train the model; the held-out fold is listed last in eval_set
    clf.fit(train_features, train_labels, eval_metric = ['mae', 'auc'],
            eval_set = [(train_features, train_labels), (valid_features, valid_labels)],
            early_stopping_rounds = params["early_stopping_rounds"], verbose = 200)

    # Make predictions.
    # NOTE(review): predict_proba is called without an explicit iteration
    # limit; whether trees added after the early-stopping optimum are used
    # depends on the installed xgboost version -- confirm.
    test_predictions = clf.predict_proba(test_features)[:, 1]
    validation_predictions = None
    if validation_features is not None:
        validation_predictions = clf.predict_proba(validation_features)[:, 1]

    return test_predictions, validation_predictions


In [30]:
# Hyper-parameters for train_LGBMClassifier. The "application" entry is not
# read by that function, which passes application="binary" itself.
lgbm_params = dict(
    boosting="gbdt",
    application="binary",
    learning_rate=0.01,
    reg_alpha=10,
    reg_lambda=10,
    n_estimators=10000,
    max_depth=5,
    num_leaves=200,
    max_bin=225,
    drop_rate=0.02,
    early_stopping_rounds=500,
)

# Hyper-parameters for train_XGBoostClassifier.
xgboost_params = dict(
    learning_rate=0.01,
    n_estimators=10000,
    max_depth=5,
    min_child_weight=40,
    subsample=0.7,
    colsample_bytree=0.6,
    early_stopping_rounds=500,
)

# Training functions and their parameter sets, paired by position.
models = [
    train_LGBMClassifier,
    train_XGBoostClassifier,
]
params = [
    lgbm_params,
    xgboost_params,
]

# Cross-validate the two-model ensemble and report the per-fold and overall scores.
metrics = model(
    models,
    app_train,
    app_test,
    params_funcs=params,
    validation_features=app_validation,
)
print(metrics)

Training Data Shape:  (215257, 448)
Testing Data Shape:  (46127, 448)
Validation Data Shape:  (46127, 448)
Training until validation scores don't improve for 500 rounds.
[200]	valid's l1: 0.140336	valid's auc: 0.742289	train's l1: 0.140712	train's auc: 0.746658
[400]	valid's l1: 0.138343	valid's auc: 0.750942	train's l1: 0.138334	train's auc: 0.761237
[600]	valid's l1: 0.1374	valid's auc: 0.754757	train's l1: 0.137016	train's auc: 0.769987
[800]	valid's l1: 0.136813	valid's auc: 0.756723	train's l1: 0.136048	train's auc: 0.776413
[1000]	valid's l1: 0.136445	valid's auc: 0.758149	train's l1: 0.135332	train's auc: 0.781594
[1200]	valid's l1: 0.136231	valid's auc: 0.758549	train's l1: 0.134781	train's auc: 0.786496
[1400]	valid's l1: 0.136066	valid's auc: 0.758931	train's l1: 0.134288	train's auc: 0.790999
[1600]	valid's l1: 0.135911	valid's auc: 0.759173	train's l1: 0.133824	train's auc: 0.795316
[1800]	valid's l1: 0.135798	valid's auc: 0.759327	train's l1: 0.133397	train's auc: 0.799566

[3400]	valid's l1: 0.134774	valid's auc: 0.745401	train's l1: 0.130168	train's auc: 0.833733
[3600]	valid's l1: 0.134702	valid's auc: 0.745393	train's l1: 0.129788	train's auc: 0.837357
[3800]	valid's l1: 0.134654	valid's auc: 0.745434	train's l1: 0.129435	train's auc: 0.840917
[4000]	valid's l1: 0.134609	valid's auc: 0.745391	train's l1: 0.129072	train's auc: 0.844241
Early stopping, best iteration is:
[3668]	valid's l1: 0.134684	valid's auc: 0.745448	train's l1: 0.129669	train's auc: 0.838521
[0]	validation_0-mae:0.49688	validation_0-auc:0.719306	validation_1-mae:0.496875	validation_1-auc:0.702367
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 500 rounds.
[200]	validation_0-mae:0.228388	validation_0-auc:0.750121	validation_1-mae:0.228342	validation_1-auc:0.726816
[400]	validation_0-mae:0.189752	validation_0-auc:0.763238	validation_1-mae:0.190245	validation_1-auc:0.734177
[600]	validatio

[1400]	validation_0-mae:0.176299	validation_0-auc:0.793877	validation_1-mae:0.180616	validation_1-auc:0.751086
[1600]	validation_0-mae:0.175543	validation_0-auc:0.798344	validation_1-mae:0.180253	validation_1-auc:0.751179
[1800]	validation_0-mae:0.174842	validation_0-auc:0.802737	validation_1-mae:0.17995	validation_1-auc:0.751108
[2000]	validation_0-mae:0.174071	validation_0-auc:0.806951	validation_1-mae:0.179553	validation_1-auc:0.751119
[2200]	validation_0-mae:0.173432	validation_0-auc:0.811275	validation_1-mae:0.179302	validation_1-auc:0.751184
[2400]	validation_0-mae:0.172797	validation_0-auc:0.815204	validation_1-mae:0.179027	validation_1-auc:0.751066
Stopping. Best iteration:
[2051]	validation_0-mae:0.173899	validation_0-auc:0.808089	validation_1-mae:0.179472	validation_1-auc:0.751224

      fold  test auc  test mae  validation auc  validation mae
0        0  0.753623  0.156608        0.753465        0.158393
1        1  0.753727  0.155767        0.753058        0.157511
2       