In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# modeling 
import lightgbm as lgb

# utilities
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
plt.rcParams['font.size'] = 22
import seaborn as sns
# memory management
import gc

In [None]:
# Application training table (merged with the bureau features on SK_ID_CURR further down)
train_application = pd.read_csv('../input/home-credit-default-risk/application_train.csv')
# Bureau features for the training rows (presumably pre-processed in another notebook — confirm)
train_bureau = pd.read_csv('../input/homecreditprocessed/train_bureau2.csv')

# Disabled: the previous-application training features are not loaded in this run
#train_previous = pd.read_csv('../input/homecreditprocessed/train_previous_processed_2.csv')

# Test table for the bureau experiment; used below without any further merge
test_application_bureau = pd.read_csv('../input/homecreditprocessed/test_bureau.csv')
# Disabled: the previous-application test features are not loaded in this run
#test_application_previous = pd.read_csv('../input/homecreditprocessed/test_previous.csv')

In [None]:
#train_application_previous = train_application.merge(train_previous, on = 'SK_ID_CURR', how = 'left')

In [None]:
# The previous-application pipeline is currently disabled: the cells that load
# and merge `train_application_previous` are commented out, so the name only
# exists when they are re-enabled. Guard the split so that Restart & Run All
# does not stop here with a NameError.
if 'train_application_previous' in globals():
    # Separate the label from the features
    train_application_previous_label = train_application_previous['TARGET']
    train_application_previous = train_application_previous.drop(['TARGET'], axis=1)
else:
    print('Skipping previous_application label split: train_application_previous is not loaded.')

In [None]:
#bureau_columns = list(train_bureau.columns)
#application_columns = list(train_application.columns)
#original_features = list(set(bureau_columns) & set(application_columns))
#print(original_features)

# Remove Collinear Variables in previous_application

In [None]:
# Threshold for removing correlated variables
threshold = 0.9

# The previous-application frames only exist when their (currently commented-out)
# load/merge cells are enabled. Skip this step instead of raising NameError so
# the notebook still runs top to bottom.
if 'train_application_previous' in globals() and 'test_application_previous' in globals():

    # Absolute value correlation matrix.
    # Numeric/boolean columns are selected explicitly: older pandas dropped
    # non-numeric columns silently, pandas >= 2.0 raises on them.
    corr_matrix = train_application_previous.select_dtypes(include = ['number', 'bool']).corr().abs()

    # Upper triangle of correlations (k=1 excludes the diagonal).
    # Use the builtin `bool`: the `np.bool` alias was removed in NumPy 1.24.
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    # Select columns with correlations above threshold
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]

    print('There are %d columns to remove.' % (len(to_drop)))

    train_application_previous = train_application_previous.drop(columns = to_drop)
    test_application_previous = test_application_previous.drop(columns = to_drop)

    print('Training shape: ', train_application_previous.shape)
    print('Testing shape: ', test_application_previous.shape)

    # Persist the reduced frames
    train_application_previous.to_csv('train_application_previous_dropped_collinearity.csv' , index = False)
    test_application_previous.to_csv('test_application_previous.csv', index = False)

else:
    print('Skipping previous_application collinearity removal: previous-application data is not loaded.')

# Remove Collinear Variables in bureau

In [None]:
# Left-join the bureau features onto the application rows by client id,
# then set the label aside and keep only the features.
train_application_bureau = pd.merge(train_application, train_bureau, how = 'left', on = 'SK_ID_CURR')
train_application_bureau_label = train_application_bureau['TARGET']
train_application_bureau = train_application_bureau.drop(columns = ['TARGET'])

In [None]:
# Preview the merged training features (TARGET has already been split off)
train_application_bureau.head()

In [None]:
# Preview the test features as loaded from test_bureau.csv
test_application_bureau.head()

In [None]:
#train_application_bureau = pd.get_dummies(train_application_bureau)
#train_application_bureau.head()

In [None]:
# Threshold for removing correlated variables
threshold = 0.9

# Absolute value correlation matrix.
# The merged frame still contains object (categorical) columns. Older pandas
# dropped them silently inside .corr(); pandas >= 2.0 raises instead, so the
# numeric/boolean columns are selected explicitly (works on every version).
corr_matrix = train_application_bureau.select_dtypes(include = ['number', 'bool']).corr().abs()
corr_matrix.head()

In [None]:
# Upper triangle of correlations (k=1 excludes the diagonal) so that each
# pair of columns is only considered once.
# Use the builtin `bool`: the `np.bool` alias was deprecated in NumPy 1.20
# and removed in NumPy 1.24.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
upper.head()

In [None]:
# Keep the name of every column that has at least one upper-triangle
# correlation above the threshold (NaN cells compare as False).
to_drop = upper.columns[(upper > threshold).any(axis = 0)].tolist()

print('There are %d columns to remove.' % (len(to_drop)))

In [None]:
# Remove the collinear columns from both frames; the list was derived from the
# training data only and is applied unchanged to the test data.
train_application_bureau_dropped = train_application_bureau.drop(to_drop, axis = 'columns')
test_application_bureau_dropped = test_application_bureau.drop(to_drop, axis = 'columns')

In [None]:
# Preview the training features after removing collinear columns
train_application_bureau_dropped.head()

In [None]:
# Preview the test features after removing the same collinear columns
test_application_bureau_dropped.head()

In [None]:
# Persist the collinearity-reduced frames. This runs before the next cell drops
# SK_ID_CURR, so the saved files still carry the id column.
train_application_bureau_dropped.to_csv('train_application_bureau_dropped_collinear.csv',index=False)
test_application_bureau_dropped.to_csv('test_application_bureau_dropped_collinear.csv',index=False)

In [None]:
# The client id is an identifier, not a feature: remove it before encoding.
# Not idempotent: a second run raises KeyError because the column is already gone.
train_application_bureau_dropped = train_application_bureau_dropped.drop('SK_ID_CURR', axis = 'columns')
test_application_bureau_dropped = test_application_bureau_dropped.drop('SK_ID_CURR', axis = 'columns')

In [None]:
# Display the training features after SK_ID_CURR has been removed
train_application_bureau_dropped

In [None]:
# One-hot encode the categorical columns of each frame independently.
# The two results can end up with different columns; they are aligned in a later cell.
train = pd.get_dummies(train_application_bureau_dropped)
test = pd.get_dummies(test_application_bureau_dropped)

In [None]:
# Preview the one-hot encoded training features
train.head()

In [None]:
# Match the columns in the dataframes: the inner join on axis 1 keeps only the
# columns present in both train and test.
train, test = train.align(test, join = 'inner', axis = 1)
print('Training shape: ', train.shape)
print('Testing shape: ', test.shape)

## Feature importance

In [None]:
# Initialize an empty array to hold feature importances (one slot per column of `train`)
feature_importances = np.zeros(train.shape[1])

# Create the model with several hyperparameters.
# n_estimators is only an upper bound: the fit below stops early on validation AUC.
# NOTE(review): the name `model` is rebound by `def model(...)` further down, so
# re-running the fit cell after that definition has executed will fail.
model = lgb.LGBMClassifier(objective='binary', boosting_type = 'goss', n_estimators = 10000, class_weight = 'balanced')

In [None]:
import re
# Strip every character that is not an ASCII letter, digit or underscore from the
# column names (presumably because LightGBM rejects special characters in feature
# names — confirm). NOTE(review): two names that differ only in stripped characters
# would collide after this step.
train_re = train.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
test_re = test.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

In [None]:
# Display the test features with sanitized column names
test_re

In [None]:
# Fit the model on two different random train/validation splits (random_state = i)
# and accumulate the feature importances; the sum is averaged in the next cell so
# the ranking does not depend on a single split.
# NOTE(review): `early_stopping_rounds` and `verbose` are only accepted by `fit` in
# LightGBM < 4.0; newer releases expect
# callbacks=[lgb.early_stopping(100), lgb.log_evaluation(200)] — confirm the installed version.
for i in range(2):
    
    # Split into training and validation set
    train_features, valid_features, train_y, valid_y = train_test_split(train_re, train_application_bureau_label, test_size = 0.25, random_state = i)
    
    # Train using early stopping
    model.fit(train_features, train_y, early_stopping_rounds=100, eval_set = [(valid_features, valid_y)], 
              eval_metric = 'auc', verbose = 200)
    
    # Record the feature importances
    feature_importances += model.feature_importances_

In [None]:
# Make sure to average feature importances! The divisor 2 matches the two fits above.
# Not idempotent: `feature_importances` is rebound from an array to a DataFrame here,
# so the cell cannot be re-run without re-running the fit loop first.
feature_importances = feature_importances / 2
feature_importances = pd.DataFrame({'feature': list(train_re.columns), 'importance': feature_importances}).sort_values('importance', ascending = False)

In [None]:
# Display the averaged feature importances, most important first
feature_importances

In [None]:
# Names of all features whose averaged importance is exactly zero
zero_features = feature_importances.loc[feature_importances['importance'] == 0.0, 'feature'].tolist()
print('There are %d features with 0.0 importance' % len(zero_features))
feature_importances.tail()

In [None]:
# Remove the zero-importance features from both frames.
# Not idempotent: a second run raises KeyError because the columns are already gone.
train_re = train_re.drop(zero_features, axis = 'columns')
test_re = test_re.drop(zero_features, axis = 'columns')


In [None]:
# Report the frame sizes after removing the zero-importance features
print('Training shape: ', train_re.shape)
print('Testing shape: ', test_re.shape)

In [None]:
# Persist the reduced feature sets. At this point the frames contain neither
# SK_ID_CURR nor TARGET: both were removed earlier and are re-attached only in a later cell.
train_re.to_csv('train_application_bureau_drop_0importance.csv',index=False)
test_re.to_csv('test_application_bureau_drop_0importance.csv',index=False)

In [None]:
def plot_feature_importances(df, threshold = 0.9):
    """
    Plots the 15 most important features and the cumulative importance of features.
    Prints the number of features needed to exceed the threshold cumulative importance,
    or a notice when the cumulative importance never exceeds it.
    
    Side effect: sets the global matplotlib font size (plt.rcParams['font.size']) to 18.
    
    Parameters
    --------
    df : dataframe
        Dataframe of feature importances. Columns must be feature and importance
    threshold : float, default = 0.9
        Threshold for printing information about cumulative importances
        
    Return
    --------
    df : dataframe
        Dataframe ordered by feature importances with a normalized column (sums to 1)
        and a cumulative importance column
    
    """
    
    plt.rcParams['font.size'] = 18
    
    # Sort features according to importance; the fresh 0..n-1 index doubles as the rank
    df = df.sort_values('importance', ascending = False).reset_index()
    
    # Normalize the feature importances to add up to one
    df['importance_normalized'] = df['importance'] / df['importance'].sum()
    df['cumulative_importance'] = np.cumsum(df['importance_normalized'])

    # Make a horizontal bar chart of feature importances
    plt.figure(figsize = (10, 6))
    ax = plt.subplot()
    
    # Need to reverse the index to plot most important on top
    bar_positions = list(reversed(list(df.index[:15])))
    ax.barh(bar_positions, 
            df['importance_normalized'].head(15), 
            align = 'center', edgecolor = 'k')
    
    # Set the yticks and labels
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(df['feature'].head(15))
    
    # Plot labeling
    plt.xlabel('Normalized Importance'); plt.title('Feature Importances')
    plt.show()
    
    # Cumulative importance plot
    plt.figure(figsize = (8, 6))
    plt.plot(list(range(len(df))), df['cumulative_importance'], 'r-')
    plt.xlabel('Number of Features'); plt.ylabel('Cumulative Importance'); 
    plt.title('Cumulative Feature Importance');
    plt.show();
    
    # Positions whose cumulative importance exceeds the threshold. This can be empty
    # (e.g. threshold >= 1.0), in which case taking the minimum would raise ValueError.
    above_threshold = np.flatnonzero(np.asarray(df['cumulative_importance'] > threshold))
    if above_threshold.size > 0:
        importance_index = above_threshold[0]
        print('%d features required for %0.2f of cumulative importance' % (importance_index + 1, threshold))
    else:
        print('Cumulative importance never exceeds %0.2f (all %d features considered)' % (threshold, len(df)))
    
    return df

In [None]:
# Plot the importances averaged over the two fits above and keep the normalized table
norm_feature_importances = plot_feature_importances(feature_importances)

In [None]:
def model(features, test_features, encoding = 'ohe', n_folds = 5):
    
    """Train and test a light gradient boosting model using
    cross validation. 
    
    Parameters
    --------
        features (pd.DataFrame): 
            dataframe of training features to use 
            for training a model. Must include the SK_ID_CURR and TARGET columns.
        test_features (pd.DataFrame): 
            dataframe of testing features to use
            for making predictions with the model. Must include the SK_ID_CURR column.
        encoding (str, default = 'ohe'): 
            method for encoding categorical variables. Either 'ohe' for one-hot encoding or 'le' for integer label encoding
        n_folds (int, default = 5): 
            number of folds to use for cross validation
        
    Return
    --------
        submission (pd.DataFrame): 
            dataframe with `SK_ID_CURR` and `TARGET` probabilities
            predicted by the model.
        feature_importances (pd.DataFrame): 
            dataframe with the feature importances from the model.
        valid_metrics (pd.DataFrame): 
            dataframe with training and validation metrics (ROC AUC) for each fold and overall.
        
    Raises
    --------
        ValueError: if `encoding` is neither 'ohe' nor 'le'.
        
    """
    
    # Extract the test ids (needed for the submission frame)
    test_ids = test_features['SK_ID_CURR']
    
    # Extract the labels for training as a plain array: the fold indices produced
    # by KFold are positional, and indexing a Series with them would be label-based
    # (wrong rows or KeyError whenever the index is not 0..n-1).
    labels = np.array(features['TARGET'])
    
    # Remove the ids and target
    features = features.drop(columns = ['SK_ID_CURR', 'TARGET'])
    test_features = test_features.drop(columns = ['SK_ID_CURR'])
    
    
    # One Hot Encoding
    if encoding == 'ohe':
        features = pd.get_dummies(features)
        test_features = pd.get_dummies(test_features)
        
        # Align the dataframes by the columns
        features, test_features = features.align(test_features, join = 'inner', axis = 1)
        
        # No categorical indices to record
        cat_indices = 'auto'
    
    # Integer label encoding
    elif encoding == 'le':
        
        # Create a label encoder
        label_encoder = LabelEncoder()
        
        # List for storing categorical indices
        cat_indices = []
        
        # Iterate through each column
        for i, col in enumerate(features):
            if features[col].dtype == 'object':
                # Map the categorical features to integers
                features[col] = label_encoder.fit_transform(np.array(features[col].astype(str)).reshape((-1,)))
                test_features[col] = label_encoder.transform(np.array(test_features[col].astype(str)).reshape((-1,)))

                # Record the categorical indices
                cat_indices.append(i)
    
    # Catch error if label encoding scheme is not valid
    else:
        raise ValueError("Encoding must be either 'ohe' or 'le'")
        
    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)
    
    # Extract feature names
    feature_names = list(features.columns)
    
    # Convert to np arrays
    features = np.array(features)
    test_features = np.array(test_features)
    
    # Create the kfold object. The folds are not shuffled, so no random_state is
    # passed: scikit-learn >= 0.24 raises ValueError when random_state is combined
    # with shuffle = False (earlier versions simply ignored it).
    k_fold = KFold(n_splits = n_folds, shuffle = False)
    
    # Empty array for feature importances
    feature_importance_values = np.zeros(len(feature_names))
    
    # Empty array for test predictions
    test_predictions = np.zeros(test_features.shape[0])
    
    # Empty array for out of fold validation predictions
    out_of_fold = np.zeros(features.shape[0])
    
    # Lists for recording validation and training scores
    valid_scores = []
    train_scores = []
    
    # Iterate through each fold
    for train_indices, valid_indices in k_fold.split(features):
        
        # Training data for the fold
        train_features, train_labels = features[train_indices], labels[train_indices]
        # Validation data for the fold
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]
        
        # Create the model (named `clf` so it does not shadow this function's own name)
        clf = lgb.LGBMClassifier(n_estimators=10000, objective = 'binary', 
                                 class_weight = 'balanced', learning_rate = 0.05, 
                                 reg_alpha = 0.1, reg_lambda = 0.1, 
                                 subsample = 0.8, n_jobs = -1, random_state = 50)
        
        # Train the model
        # NOTE(review): `early_stopping_rounds` and `verbose` are only accepted by `fit`
        # in LightGBM < 4.0; newer releases expect
        # callbacks=[lgb.early_stopping(100), lgb.log_evaluation(200)] — confirm the installed version.
        clf.fit(train_features, train_labels, eval_metric = 'auc',
                eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
                eval_names = ['valid', 'train'], categorical_feature = cat_indices,
                early_stopping_rounds = 100, verbose = 200)
        
        # Record the best iteration
        best_iteration = clf.best_iteration_
        
        # Record the feature importances (averaged over the folds)
        feature_importance_values += clf.feature_importances_ / k_fold.n_splits
        
        # Make predictions (averaged over the folds)
        test_predictions += clf.predict_proba(test_features, num_iteration = best_iteration)[:, 1] / k_fold.n_splits
        
        # Record the out of fold predictions
        out_of_fold[valid_indices] = clf.predict_proba(valid_features, num_iteration = best_iteration)[:, 1]
        
        # Record the best score
        valid_score = clf.best_score_['valid']['auc']
        train_score = clf.best_score_['train']['auc']
        
        valid_scores.append(valid_score)
        train_scores.append(train_score)
        
        # Clean up memory
        gc.enable()
        del clf, train_features, valid_features
        gc.collect()
        
    # Make the submission dataframe
    submission = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': test_predictions})
    
    # Make the feature importance dataframe
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})
    
    # Overall validation score
    valid_auc = roc_auc_score(labels, out_of_fold)
    
    # Add the overall scores to the metrics
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))
    
    # Needed for creating dataframe of validation scores
    fold_names = list(range(n_folds))
    fold_names.append('overall')
    
    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores}) 
    
    return submission, feature_importances, metrics

In [None]:
# model() expects TARGET and SK_ID_CURR as columns, so put them back on the reduced
# frames. The assignments align on the row index, which has not been reset since the
# merge / the test CSV load. Note that this mutates train_re / test_re in place.
train_re['TARGET'] = train_application_bureau_label
train_re['SK_ID_CURR'] = train_application_bureau['SK_ID_CURR']
test_re['SK_ID_CURR'] = test_application_bureau['SK_ID_CURR']

In [None]:
# Cross-validated run on the zero-importance-reduced features.
# This rebinds `feature_importances` to the importances returned by model().
submission, feature_importances, metrics = model(train_re, test_re)

In [None]:
# Write the predictions of the reduced-feature run
submission.to_csv('application_bureau_submission.csv', index = False)
#public score: 0.75644

In [None]:
# Plot the fold-averaged importances returned by the cross-validated run above
norm_feature_importances = plot_feature_importances(feature_importances)

# Comparison

In [None]:
# Baseline for comparison: `train` / `test` are the aligned one-hot frames from before
# the column-name sanitising and the zero-importance drop. Re-attach the label and ids
# that model() expects (aligned on the row index; mutates the frames in place).
train['TARGET'] = train_application_bureau_label
train['SK_ID_CURR'] = train_application_bureau['SK_ID_CURR']
test['SK_ID_CURR'] = test_application_bureau['SK_ID_CURR']

In [None]:
# Cross-validated run on the full aligned feature set.
# NOTE(review): `metrics` is overwritten here, so the metrics of the previous run are lost.
submission2, feature_importances2, metrics = model(train, test)

In [None]:
# Write the predictions of the full-feature run
submission2.to_csv('application_bureau_submission2.csv', index = False)
# The number below is presumably the public score of this submission — confirm
#0.75709

In [None]:
# Display the fold-averaged importances of the full-feature run
feature_importances2

In [None]:
# Names of all features with exactly zero fold-averaged importance in the full-feature run
zero_features = feature_importances2.loc[feature_importances2['importance'] == 0.0, 'feature'].tolist()
print('There are %d features with 0.0 importance' % len(zero_features))
zero_features

In [None]:
# Build new frames without the zero-importance features; `train` and `test`
# themselves are left untouched.
train_2 = train.drop(zero_features, axis = 'columns')
test_2 = test.drop(zero_features, axis = 'columns')

In [None]:
# Cross-validated run on the full feature set minus its zero-importance features.
# NOTE(review): `metrics` is overwritten again here.
submission3, feature_importances3, metrics = model(train_2, test_2)

In [None]:
# Write the predictions of the third run
submission3.to_csv('application_bureau_submission3.csv', index = False)
# The number below is presumably the public score of this submission — confirm
#0.75717