In [None]:
# Mount Google Drive at /content/drive; the dataset CSVs read later in this
# notebook are addressed by paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder

# File system management
import os

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# modeling
import lightgbm as lgb

# utilities
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

# memory management
import gc

# Calling Datasets

In [None]:
# Datasets: both CSV files live in the same Drive folder, so the folder is
# named once here and can be changed in a single place.
DATA_DIR = '/content/drive/MyDrive/Thesis/Home Credit Section/Final [Small]'

application_train = pd.read_csv(os.path.join(DATA_DIR, 'predictor_train_small.csv'))
application_test = pd.read_csv(os.path.join(DATA_DIR, 'predictor_test_small.csv'))

In [None]:
data_len = len(application_train)

# Positional split of the labelled data: the first third of the rows becomes
# the held-out set, the remaining rows are used for training. Each slice gets
# a fresh 0..n-1 index.
lr_test = application_train.iloc[: data_len // 3].reset_index(drop=True)
lr_train = application_train.iloc[data_len // 3 :].reset_index(drop=True)

# LightGBM

In [None]:
def _early_stopping_fit_kwargs(stopping_rounds, log_period):
    """Build the early-stopping / logging keyword arguments for ``LGBMClassifier.fit``."""
    # LightGBM 4.0 removed the `early_stopping_rounds` and `verbose` arguments
    # of fit() in favour of callbacks, while releases before 3.3 have no
    # `log_evaluation` callback. Use whichever spelling this install supports.
    if hasattr(lgb, 'log_evaluation'):
        return {'callbacks': [lgb.early_stopping(stopping_rounds),
                              lgb.log_evaluation(log_period)]}
    return {'early_stopping_rounds': stopping_rounds, 'verbose': log_period}


def model(features, test_features, encoding = 'ohe', n_folds = 5):

    """Train and test a light gradient boosting model using
    cross validation.

    One `LGBMClassifier` is fitted per fold with early stopping on the fold's
    validation split. Test predictions and feature importances are averaged
    over the folds; validation predictions are collected out-of-fold.

    Parameters
    --------
        features (pd.DataFrame):
            dataframe of training features to use
            for training a model. Must include the `SK_ID_CURR` and
            `TARGET` columns; every other column is used as a feature.
        test_features (pd.DataFrame):
            dataframe of testing features to use
            for making predictions with the model. Must include
            `SK_ID_CURR` and otherwise the same feature columns as `features`.
        encoding (str, default = 'ohe'):
            accepted for backward compatibility only. No categorical encoding
            is performed here, so the value is ignored and the feature
            columns are passed to LightGBM as they are.
        n_folds (int, default = 5):
            number of folds to use for cross validation.

    Return
    --------
        submission (pd.DataFrame):
            dataframe with `SK_ID_CURR` and `TARGET` probabilities
            predicted by the model.
        feature_importances (pd.DataFrame):
            dataframe with the feature importances from the model.
        metrics (pd.DataFrame):
            dataframe with training and validation metrics (ROC AUC) for each fold and overall.

    """
    # Extract the test ids (needed for the submission frame)
    test_ids = test_features['SK_ID_CURR']

    # Extract the labels for training. They are converted to an array so the
    # positional indices produced by KFold select the right rows whatever
    # index the input dataframe carries.
    labels = np.array(features['TARGET'])

    # Remove the ids and target
    features = features.drop(columns = ['SK_ID_CURR', 'TARGET'])
    test_features = test_features.drop(columns = ['SK_ID_CURR'])

    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)

    # Extract feature names
    feature_names = list(features.columns)

    # Convert to np arrays
    features = np.array(features)
    test_features = np.array(test_features)

    # Create the kfold object
    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = 50)

    # Empty array for feature importances
    feature_importance_values = np.zeros(len(feature_names))

    # Empty array for test predictions
    test_predictions = np.zeros(test_features.shape[0])

    # Empty array for out of fold validation predictions
    out_of_fold = np.zeros(features.shape[0])

    # Lists for recording validation and training scores
    valid_scores = []
    train_scores = []

    # Iterate through each fold
    for train_indices, valid_indices in k_fold.split(features):

        # Training data for the fold
        train_features, train_labels = features[train_indices], labels[train_indices]
        # Validation data for the fold
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]

        # Create the model (named `clf` so it does not shadow this function)
        clf = lgb.LGBMClassifier(n_estimators=10000, objective = 'binary', boosting_type='goss',
                                 class_weight = 'balanced', learning_rate = 0.05,
                                 reg_alpha = 0.1, reg_lambda = 0.1, n_jobs = -1, random_state = 50)

        # Train the model. The training pair in eval_set is the very same
        # objects passed as X/y, so LightGBM treats it as the training set and
        # early stopping is driven by the 'valid' split only. The stopping /
        # logging arguments are rebuilt per fold so no callback state is
        # carried over from the previous fit.
        clf.fit(train_features, train_labels, eval_metric = 'auc',
                eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
                eval_names = ['valid', 'train'],
                **_early_stopping_fit_kwargs(100, 200))

        # Record the best iteration
        best_iteration = clf.best_iteration_

        # Record the feature importances (averaged over the folds)
        feature_importance_values += clf.feature_importances_ / k_fold.n_splits

        # Make predictions (averaged over the folds)
        test_predictions += clf.predict_proba(test_features, num_iteration = best_iteration)[:, 1] / k_fold.n_splits

        # Record the out of fold predictions
        out_of_fold[valid_indices] = clf.predict_proba(valid_features, num_iteration = best_iteration)[:, 1]

        # Record the best score
        valid_scores.append(clf.best_score_['valid']['auc'])
        train_scores.append(clf.best_score_['train']['auc'])

        # Clean up memory
        gc.enable()
        del clf, train_features, valid_features
        gc.collect()

    # Make the submission dataframe
    submission = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': test_predictions})

    # Make the feature importance dataframe
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})

    # Overall validation score, computed on the out-of-fold predictions
    valid_auc = roc_auc_score(labels, out_of_fold)

    # Add the overall scores to the metrics (overall train = mean of the folds)
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))

    # Needed for creating dataframe of validation scores
    fold_names = list(range(n_folds))
    fold_names.append('overall')

    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores})

    return submission, feature_importances, metrics

In [None]:
train_labels = lr_train['TARGET']

# Split the labels off the held-out rows before predicting on them. The guard
# keeps this cell re-runnable: once TARGET has been removed from lr_test, a
# second run reuses the test_labels extracted the first time instead of
# failing with a KeyError.
if 'TARGET' in lr_test.columns:
    test_labels = lr_test['TARGET']
    lr_test = lr_test.drop(columns = ['TARGET'])

submission, feature_importances, metrics = model(lr_train, lr_test)

Training Data Shape:  (205008, 338)
Testing Data Shape:  (102503, 338)
[200]	train's auc: 0.848179	train's binary_logloss: 0.496498	valid's auc: 0.776271	valid's binary_logloss: 0.528403
[400]	train's auc: 0.895638	train's binary_logloss: 0.440131	valid's auc: 0.776949	valid's binary_logloss: 0.49224
[200]	train's auc: 0.848645	train's binary_logloss: 0.495863	valid's auc: 0.774436	valid's binary_logloss: 0.526363
[400]	train's auc: 0.89494	train's binary_logloss: 0.44011	valid's auc: 0.774901	valid's binary_logloss: 0.491227
[200]	train's auc: 0.846548	train's binary_logloss: 0.498808	valid's auc: 0.784813	valid's binary_logloss: 0.525422
[400]	train's auc: 0.893577	train's binary_logloss: 0.443385	valid's auc: 0.785853	valid's binary_logloss: 0.489753
[200]	train's auc: 0.847779	train's binary_logloss: 0.497153	valid's auc: 0.777091	valid's binary_logloss: 0.526386
[200]	train's auc: 0.846693	train's binary_logloss: 0.498668	valid's auc: 0.784934	valid's binary_logloss: 0.525806


In [None]:
# Show the ROC AUC table returned by model(): one row per fold, plus an
# 'overall' row (mean of the fold train scores; AUC of the out-of-fold
# validation predictions).
metrics

Unnamed: 0,fold,train,valid
0,0,0.8789,0.777641
1,1,0.885131,0.775308
2,2,0.887703,0.786041
3,3,0.869456,0.778308
4,4,0.86996,0.785511
5,overall,0.87823,0.780515
