## Home Credit Default Risk - Team 3 (Kahsai, Nichols, Pellerito)

### Import packages

In [None]:
# standard Python tools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# special tools for working in Kaggle
import joblib   # save and load ML models
import gc       # garbage collection
import os 
import sklearn

# preprocessing steps
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# machine learning models and tools
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier
from xgboost import plot_importance

# cross validation and metrics - remember this competition is scored as area under curve
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold

# surely there will be a lot more packages loaded by the time we are done!

In [None]:
# Don't do this!
# A blanket "ignore" filter silences every warning raised for the rest of the
# session, including deprecation notices from the libraries used below.
import warnings
warnings.filterwarnings(action="ignore")

# First look at training data set

### Read the training data

In [None]:
# Directory holding the competition files; list it to confirm what is available.
MainDir = "../input/../input/home-credit-default-risk"
print(os.listdir(MainDir))

# Main table (contains the TARGET and SK_ID_CURR columns used further down)
train_csv = f'{MainDir}/application_train.csv'
train = pd.read_csv(train_csv)

# Preview of the first five rows
train.head()

### data cleansing and feature engineering: create new features based on ratios, logs, etc.

In [None]:
# Data cleansing and feature engineering on the `train` frame.
#
# Adds ratio, log and flag features, fills missing EXT_SOURCE_x scores row-wise,
# and drops columns judged to have poor feature significance.

# DAYS_EMPLOYED carries the value 365,243 in a large number of rows (over 50,000
# according to the original analysis). Replace it with NaN and let the imputer
# deal with it. The result is assigned back instead of using a chained
# `inplace=True`, which does not reach the parent frame under pandas copy-on-write.
train['DAYS_EMPLOYED'] = train['DAYS_EMPLOYED'].replace({365243: np.nan})

# ratio features
train['CI_ratio'] = train['AMT_CREDIT'] / train['AMT_INCOME_TOTAL']        # credit-to-income ratio
train['AI_ratio'] = train['AMT_ANNUITY'] / train['AMT_INCOME_TOTAL']       # annuity-to-income ratio
train['AC_ratio'] = train['AMT_CREDIT'] / train['AMT_ANNUITY']             # credit to annuity - basically the term of the loan in years
train['CG_ratio'] = train['AMT_CREDIT'] / train['AMT_GOODS_PRICE']         # credit to goods price ratio - how much was financed?

# log features
for log_name, source_col in [('log_INCOME', 'AMT_INCOME_TOTAL'),
                             ('log_ANNUITY', 'AMT_ANNUITY'),
                             ('log_CREDIT', 'AMT_CREDIT'),
                             ('log_GOODS', 'AMT_GOODS_PRICE')]:
    train[log_name] = np.log(train[source_col])

# flag features
# The external scores are addressed by name rather than by position. The original
# used iloc[:, 41:44]; this assumes those positions are EXT_SOURCE_1..3, as the
# fill-in step below implies - TODO confirm against the CSV header.
EXT_COLS = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
train['MissingBureau'] = train[EXT_COLS].isnull().sum(axis=1).astype("category")   # number of bureaus with no score
train['FLAG_CG_ratio'] = train['AMT_CREDIT'] > train['AMT_GOODS_PRICE']            # FLAG if you borrowed more than the price of the item

# EXT_SOURCE_x variables are very important - do not leave missing values to the imputer.
# Missing scores are filled by row: each gap gets the average of the scores that row
# does have. Rows with no score at all get 0.2 for now.
# MissingBureau is computed above, before the gaps are filled, so it still counts them.
avg_ext = train[EXT_COLS].mean(axis=1).fillna(0.2)      # mean() skips NaN; all-NaN rows give NaN -> 0.2
for ext_col in EXT_COLS:
    train[ext_col] = train[ext_col].fillna(avg_ext)

# The row average is kept in a local Series and never added to `train`: it would be
# too highly correlated with its three components to be a feature. (The original
# added an AVG_EXT column and called drop() without keeping the result, so the
# column stayed in the frame.)

# drop these variables based on poor feature significance (< 0.0001)
low_value_columns = [
    'REG_REGION_NOT_LIVE_REGION', 'AMT_REQ_CREDIT_BUREAU_WEEK', 'HOUSETYPE_MODE', 'OCCUPATION_TYPE', 'FLAG_MOBIL', 'FLAG_CONT_MOBILE',
    'NAME_TYPE_SUITE', 'FLAG_DOCUMENT_4', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_16',
    'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'AMT_REQ_CREDIT_BUREAU_DAY',
    'AMT_REQ_CREDIT_BUREAU_HOUR', 'FLAG_DOCUMENT_21', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_2',
]
train = train.drop(columns=low_value_columns)

### split out our training data - start with about 10% or 30k out of 300k

In [None]:
# Labels as a NumPy array; the label and the row id are removed from the features.
y = train['TARGET'].values
X_train, X_valid, y_train, y_valid = train_test_split(
    train.drop(['TARGET', 'SK_ID_CURR'], axis=1),
    y,
    test_size=0.9,       # 90% is held out, so roughly 10% of the rows are used for training
    stratify=y,          # keep the class balance the same in both parts
    random_state=1,
)

# Report the size of each part
for part_name, part in (('X_train', X_train), ('y_train', y_train),
                        ('X_valid', X_valid), ('y_valid', y_valid)):
    print(f'Shape of {part_name}:', part.shape)

### make lists of cat and num features for pipeline, based on dtype

In [None]:
# Split the column names into "numeric" and "categorical" using the column dtype:
# anything whose dtype is not 'object' is treated as numeric.
types = X_train.dtypes.to_numpy()                    # one dtype object per column
all_columns = X_train.columns.values                 # all column names, same order as `types`
is_num = types != 'object'                           # boolean mask, True where the column is not object-typed

num_features = [name for name, numeric in zip(all_columns, is_num) if numeric]
cat_features = [name for name, numeric in zip(all_columns, is_num) if not numeric]

print(len(num_features), "numeric features")
print(len(cat_features), "categorical features")

### Build the preprocessing pipeline from the num_features and cat_features lists

In [None]:
# Build the preprocessing ColumnTransformer and apply it to both splits.
# After this cell X_train and X_valid hold the transformed matrices, not DataFrames.
features = num_features + cat_features

# Numeric columns: fill gaps with the column median, then standardise.
Pipe_num = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy = 'median')),        # tried median, mean, constant strategies
    ('scaler', StandardScaler())       ])

# Categorical columns: missing values become their own 'Unknown' level, then one-hot.
# handle_unknown='ignore' encodes a level that was not seen during fit as all zeros
# instead of raising; this is needed now that the encoder is fit on the training
# split only.
Pipe_cat = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy = 'constant', fill_value = 'Unknown')),
    ('onehot', OneHotEncoder(handle_unknown = 'ignore'))        ])

preprocessor = ColumnTransformer(
    transformers = [
        ('num', Pipe_num, num_features),
        ('cat', Pipe_cat, cat_features)])

# Fit on the training split only. Fitting on the full `train` frame would let the
# validation rows influence the medians, scaling statistics and category levels,
# which leaks information into the validation AUC.
preprocessor.fit(X_train[features])
X_train = preprocessor.transform(X_train[features])
X_valid = preprocessor.transform(X_valid[features])

print('Shape of X_train:',X_train.shape)
print('Shape of y_train:',y_train.shape)

### Build Model Scoreboard

In [None]:
# Model scoreboard: starts empty, and every model cell below adds one row to it.

# Never truncate cell contents when displaying frames - the hyperparameter
# dictionaries can be long and should be shown in full.
pd.set_option('display.max_colwidth', None)

scoreboard_columns = ['Model Type', 'AUC - 10xv', 'AUC - Valid', 'Hyperparameters']
results = pd.DataFrame(columns=scoreboard_columns)

# Models

### Logistic Regression

In [None]:
%%time
# Logistic regression tuned with a 10-fold grid search scored by ROC AUC.
# This model is running slow as a dog right now - might need to run it overnight
lr_clf = LogisticRegression(max_iter=2000, solver='saga', penalty = 'elasticnet')
lr_parameters = {'l1_ratio':[1], 'C': [1]}
# refit=True (a real boolean, not the string 'True') retrains the best setting on all of X_train
lr_grid = GridSearchCV(lr_clf, lr_parameters, cv=10, refit=True, n_jobs=-1, verbose=1, scoring='roc_auc')
lr_grid.fit(X_train, y_train)

lr_model = lr_grid.best_estimator_

# update model scoreboard
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add a row.
lr_row = {'Model Type' : 'Logistic Regression',
          'AUC - 10xv' : lr_grid.best_score_,
          'AUC - Valid' : roc_auc_score(y_valid, lr_model.predict_proba(X_valid)[:, 1]),
          'Hyperparameters' : lr_grid.best_params_}
results = pd.concat([results, pd.DataFrame([lr_row])], ignore_index=True)
results

### Random Forest

In [None]:
%%time
# Random forest tuned with a 10-fold grid search scored by ROC AUC.
rf_clf = RandomForestClassifier(random_state=1, n_estimators=100)
rf_parameters = {'max_depth': [28, 30, 32],  'min_samples_leaf': [30, 32, 34, 36]}
# refit=True (a real boolean, not the string 'True') retrains the best setting on all of X_train
rf_grid = GridSearchCV(rf_clf, rf_parameters, cv=10, refit=True, n_jobs=-1, verbose=1, scoring='roc_auc')
rf_grid.fit(X_train, y_train)
rf_model = rf_grid.best_estimator_

# update model scoreboard
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add a row.
rf_row = {'Model Type' : 'Random Forest',
          'AUC - 10xv' : rf_grid.best_score_,
          'AUC - Valid' : roc_auc_score(y_valid, rf_model.predict_proba(X_valid)[:, 1]),
          'Hyperparameters' : rf_grid.best_params_}
results = pd.concat([results, pd.DataFrame([rf_row])], ignore_index=True)
results

### Decision Tree

In [None]:
%%time
# Single decision tree tuned with a 10-fold grid search scored by ROC AUC.
dt_clf = DecisionTreeClassifier(random_state=1)
dt_parameters = {
    'max_depth': [4, 8, 12, 16, 20, 24],
    'min_samples_leaf': [2, 4, 6, 8]
}

# refit=True (a real boolean, not the string 'True') retrains the best setting on all of X_train
dt_grid = GridSearchCV(dt_clf, dt_parameters, cv=10, refit=True, n_jobs=-1, verbose=0, scoring='roc_auc')
dt_grid.fit(X_train, y_train)

dt_model = dt_grid.best_estimator_

# update model scoreboard
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add a row.
dt_row = {'Model Type' : 'Decision Tree',
          'AUC - 10xv' : dt_grid.best_score_,
          'AUC - Valid' : roc_auc_score(y_valid, dt_model.predict_proba(X_valid)[:, 1]),
          'Hyperparameters' : dt_grid.best_params_}
results = pd.concat([results, pd.DataFrame([dt_row])], ignore_index=True)
results

### XGBoost (Extreme Gradient Boosting)

In [None]:
%%time
# XGBoost classifier tuned with a 10-fold grid search scored by ROC AUC.
XGB_clf = XGBClassifier(objective='binary:logistic', use_label_encoder=False)
# The grids are written as explicit lists. The original range(1, 3, 5) and
# range(10, 50, 100) each produce a single value (1 and 10), because the third
# argument of range() is the step, not another candidate.
# The search now covers 3 x 3 x 6 = 54 settings, so it takes longer than before.
XGB_parameters = {
    'max_depth': [1, 3, 5],
    'n_estimators': [10, 50, 100],
    'learning_rate': [0.25, 0.5, 0.75, 1, 1.25, 1.5]    # NOTE(review): rates above 1 are unusual - confirm they are intended
}

XGB_grid = GridSearchCV(XGB_clf, XGB_parameters, cv=10, n_jobs=10, verbose=True, scoring= 'roc_auc')
XGB_grid.fit(X_train, y_train)

XGB_model = XGB_grid.best_estimator_

# update model scoreboard
# The row is labelled 'XGBoost' because this cell trains an XGBClassifier, not LightGBM.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add a row.
XGB_row = {'Model Type' : 'XGBoost',
           'AUC - 10xv' : XGB_grid.best_score_,
           'AUC - Valid' : roc_auc_score(y_valid, XGB_model.predict_proba(X_valid)[:, 1]),
           'Hyperparameters' : XGB_grid.best_params_}
results = pd.concat([results, pd.DataFrame([XGB_row])], ignore_index=True)
results

### Plotting the ROC curve (for AUC score on validation data)

In [None]:
# ROC curve of the tuned random forest on the validation split.
probabilities = rf_model.predict_proba(X_valid)[:, 1]      # predicted probability of class 1
fpr, tpr, thresholds = roc_curve(y_valid, probabilities)
auc = roc_auc_score(y_valid, probabilities)                # same quantity as 'AUC - Valid' in the scoreboard

fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(fpr, tpr)                                          # the model's ROC curve
ax.plot([0, 1], [0, 1])                                    # 45 degree reference line
ax.set_title('Receiver operating characteristic curve')
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.legend(["AUC = %.6f" % auc])
plt.show()

# hat tip:
# https://medium.com/@praveenkotha/home-credit-default-risk-end-to-end-machine-learning-project-1871f52e3ef2
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html

### Feature Importance (random forest)

In [None]:
# Feature importances of the tuned random forest, labelled with the names of the
# columns the model was actually trained on.
#
# rf_model was fit on the preprocessor's output, where every categorical column is
# expanded into one column per level. Pairing the importances with the raw `features`
# list would mislabel the categorical ones and silently truncate the rest, so the
# encoded column names are rebuilt here: numeric columns first, then the one-hot
# columns, matching the transformer order in `preprocessor`.
onehot_step = preprocessor.named_transformers_['cat'].named_steps['onehot']
if hasattr(onehot_step, 'get_feature_names_out'):
    onehot_names = onehot_step.get_feature_names_out(cat_features)
else:
    # older scikit-learn releases only provide get_feature_names
    onehot_names = onehot_step.get_feature_names(cat_features)
encoded_features = list(num_features) + list(onehot_names)

# Guard against a silent mismatch (e.g. a column dropped by a preprocessing step).
assert len(encoded_features) == len(rf_model.feature_importances_), \
    "encoded feature names do not line up with the model's input columns"

importance_DF = pd.DataFrame({'Value': rf_model.feature_importances_,
                              'Feature': encoded_features}).sort_values(by="Value", ascending=False)
importance_plot = importance_DF[importance_DF['Value'] > 0.01]
plt.figure(figsize=[10,6])
# x/y are passed by keyword: seaborn 0.12+ no longer accepts them positionally
sns.barplot(x=importance_plot['Value'], y=importance_plot['Feature'], orient = "h", color = "lightsteelblue")
plt.title("Most important features (min 1% of total)")
plt.show()

### Factors with little explanatory power - consider dropping these

In [None]:
# Candidates for removal: features whose importance is below 1e-4.
negligible = importance_DF['Value'] < 1E-4
drop_list = importance_DF.loc[negligible, 'Feature'].tolist()
drop_list

### Boxplot of 10-fold cross-validation AUC scores (training split)

In [None]:
# Spread of the tuned random forest's ROC AUC across 10 cross-validation folds
# of the training split.
from sklearn.model_selection import cross_val_score

# The RandomForestClassifier that used to be built here as `final_model` was never
# passed to cross_val_score (rf_model is what gets scored), so it has been removed.
scores = cross_val_score(rf_model, X_train, y_train, cv=10, scoring = 'roc_auc')
print(scores.round(2))
print(scores.mean())
print(scores.std(ddof=1))      # sample standard deviation across folds
plt.figure(figsize=[12,3])
# pass the data by keyword: seaborn 0.12+ treats the first positional argument as `data`
sns.boxplot(x=scores, orient = "h")
plt.show()

### Creating separation between classes

In [None]:
# Distribution of the logistic regression's predicted probability of class 1 on the
# validation split, shown separately for each true target value.
boxdata = pd.DataFrame({'prediction' : lr_model.predict_proba(X_valid)[:,1], 'target' : y_valid})
plt.figure(figsize=[12,3])
# x/y are passed by keyword: seaborn 0.12+ no longer accepts them positionally
sns.boxplot(x=boxdata.prediction, y=boxdata.target, orient = "h")
plt.show()

### Final Model Selection - save data

In [None]:
# Final choice: logistic regression with the same settings as the grid search cell
# (C = 1, l1_ratio = 1), fit on the preprocessed training split.
final_params = dict(max_iter=2000, solver='saga', penalty='elasticnet', C=1, l1_ratio=1)
final_model = LogisticRegression(**final_params)
final_model.fit(X_train, y_train)

# Save the fitted preprocessor and the model as separate joblib files.
joblib.dump(preprocessor, 'default_preprocessor_06.joblib')
joblib.dump(final_model, 'default_model_06.joblib')