In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, RepeatedStratifiedKFold, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, auc, roc_curve, precision_recall_curve
from sklearn.feature_extraction import DictVectorizer as DV
from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Read the two application tables from the Kaggle input directory; both paths
# are relative to the notebook's working directory.
train_csv_path = '../input/home-credit-default-risk/application_train.csv'
test_csv_path = '../input/home-credit-default-risk/application_test.csv'
application_train, application_test = (
    pd.read_csv(csv_path) for csv_path in (train_csv_path, test_csv_path)
)

In [None]:
# Donut chart of how often each TARGET value occurs in the training table.
target_counts = application_train['TARGET'].value_counts()
vals = target_counts.values
inds = target_counts.index

fig, ax = plt.subplots(figsize=(16, 8))
ax.pie(
    x=vals,
    autopct="%.2f%%",
    labels=inds,
    colors=['#c2c2f0', '#ffcc99'],
    pctdistance=0.85,
    startangle=180,
)
ax.set_title('Loan repayment', fontdict={'fontsize': 20})

# A white disc drawn over the centre turns the pie into a ring.
centre_circle = plt.Circle((0, 0), 0.7, fc='white')
ax.add_artist(centre_circle)
plt.show()

# Data Preprocessing and Feature Engineering

In [None]:
# Split the column names by storage dtype.
# `select_dtypes` replaces the old `application_train.dtypes[i]` lookups:
# indexing a label-indexed Series with a bare integer is deprecated in pandas
# and will eventually be interpreted as a label, not a position.
obj_cols = application_train.select_dtypes(include=['object']).columns.tolist()
numeric_cols = application_train.select_dtypes(include=['float64', 'int64']).columns.tolist()
print(numeric_cols)

In [None]:
# Emit every column name as a quoted, comma-terminated token so the output can
# be pasted straight into a Python list literal.
for column_name in application_train.columns:
    print(f'"{column_name}",')

In [None]:
# Summary statistics for the four *_MODE columns inspected here.
mode_columns = [
    "FONDKAPREMONT_MODE",
    "HOUSETYPE_MODE",
    "WALLSMATERIAL_MODE",
    "EMERGENCYSTATE_MODE",
]
application_train.loc[:, mode_columns].describe()

Categorical = "NAME_CONTRACT_TYPE", "CODE_GENDER", "FLAG_OWN_CAR", "CNT_CHILDREN", "NAME_TYPE_SUITE", "NAME_INCOME_TYPE", "NAME_EDUCATION_TYPE", "NAME_FAMILY_STATUS", "NAME_HOUSING_TYPE", "FLAG_MOBIL", "FLAG_EMP_PHONE", "FLAG_WORK_PHONE", "FLAG_CONT_MOBILE", "FLAG_PHONE", "FLAG_EMAIL", "OCCUPATION_TYPE", "CNT_FAM_MEMBERS", "REGION_RATING_CLIENT", "REGION_RATING_CLIENT_W_CITY", "WEEKDAY_APPR_PROCESS_START", "HOUR_APPR_PROCESS_START", "REG_REGION_NOT_LIVE_REGION", "REG_REGION_NOT_WORK_REGION", "LIVE_REGION_NOT_WORK_REGION", "REG_CITY_NOT_LIVE_CITY", "REG_CITY_NOT_WORK_CITY", "LIVE_CITY_NOT_WORK_CITY", "ORGANIZATION_TYPE", "FONDKAPREMONT_MODE", "HOUSETYPE_MODE", "WALLSMATERIAL_MODE", "EMERGENCYSTATE_MODE"

In [None]:
# Columns treated as categorical (one-hot encoded later). The list mixes text
# columns with integer flags/counts; the order is kept exactly as before.
categorical_cols = [
    "NAME_CONTRACT_TYPE", "CODE_GENDER", "FLAG_OWN_CAR", "FLAG_OWN_REALTY",
    "CNT_CHILDREN",
    "NAME_TYPE_SUITE", "NAME_INCOME_TYPE", "NAME_EDUCATION_TYPE",
    "NAME_FAMILY_STATUS", "NAME_HOUSING_TYPE",
    "FLAG_MOBIL", "FLAG_EMP_PHONE", "FLAG_WORK_PHONE", "FLAG_CONT_MOBILE",
    "FLAG_PHONE", "FLAG_EMAIL",
    "OCCUPATION_TYPE", "CNT_FAM_MEMBERS",
    "REGION_RATING_CLIENT", "REGION_RATING_CLIENT_W_CITY",
    "WEEKDAY_APPR_PROCESS_START", "HOUR_APPR_PROCESS_START",
    "REG_REGION_NOT_LIVE_REGION", "REG_REGION_NOT_WORK_REGION",
    "LIVE_REGION_NOT_WORK_REGION",
    "REG_CITY_NOT_LIVE_CITY", "REG_CITY_NOT_WORK_CITY",
    "LIVE_CITY_NOT_WORK_CITY",
    "ORGANIZATION_TYPE",
    "FONDKAPREMONT_MODE", "HOUSETYPE_MODE", "WALLSMATERIAL_MODE",
    "EMERGENCYSTATE_MODE",
]
# Displayed as the cell output: number of categorical columns.
len(categorical_cols)

In [None]:
# Feature matrix (identifier and label removed) and the label vector.
X = application_train.drop(['SK_ID_CURR', 'TARGET'], axis = 1)
y = application_train['TARGET']

# Every feature that is not declared categorical is treated as numeric.
# Built with an ordered comprehension rather than `list(set(...) - set(...))`:
# set iteration order for strings varies between interpreter runs, which made
# the numeric column order (and so the feature matrix layout) differ from one
# kernel restart to the next.
categorical_lookup = set(categorical_cols)
numeric_cols = [col for col in X.columns if col not in categorical_lookup]

In [None]:
# Two imputed copies of the numeric features: missing values replaced by 0.0
# and by the per-column mean respectively.
numeric_features = X[numeric_cols]
X_real_zeros = numeric_features.fillna(0.0)
X_real_mean = numeric_features.fillna(numeric_features.mean())

# Categorical features: missing values become the literal label 'NA', then
# every value is cast to str so integer-coded columns are handled as labels.
X_cat = X[categorical_cols].fillna('NA').astype('str')
X_cat.head()

In [None]:
# One-hot encode the categorical frame with DictVectorizer (dense output).
# `to_dict('records')` yields the same per-row {column: value} mappings as
# the previous `X_cat.T.to_dict().values()`, without transposing the frame.
encoder = DV(sparse = False)
X_cat_oh = encoder.fit_transform(X_cat.to_dict('records'))

In [None]:
# Sanity check: dimensions of the one-hot encoded categorical matrix.
X_cat_oh.shape

In [None]:
# Hold out 30% of the rows for evaluation. All aligned containers are passed to
# a single call so they share one shuffle (same random_state and test_size as
# before, therefore the same row assignment).
(X_train_real_zeros, X_test_real_zeros,
 X_train_real_mean, X_test_real_mean,
 X_train_cat_oh, X_test_cat_oh,
 y_train, y_test) = train_test_split(
    X_real_zeros, X_real_mean, X_cat_oh, y,
    test_size=0.3,
    random_state=0,
)

# Scaling

In [None]:
# Standardise the numeric features. Each imputation variant gets its own
# scaler, fitted on the training split only, so both sets of fitted statistics
# stay available (previously one scaler was refit and the zero-imputation fit
# was lost).
scaler_zeros = StandardScaler()
X_train_real_scaled = scaler_zeros.fit_transform(X_train_real_zeros)
X_test_real_scaled = scaler_zeros.transform(X_test_real_zeros)

scaler_mean = StandardScaler()
X_train_mean_scaled = scaler_mean.fit_transform(X_train_real_mean)
X_test_mean_scaled = scaler_mean.transform(X_test_real_mean)

# Backward-compatible alias: after this cell `scaler` has always ended up
# fitted on the mean-imputed training split.
scaler = scaler_mean

# Final design matrices: scaled numeric block followed by the one-hot block.
X_train_zeros_cat = np.concatenate((X_train_real_scaled, X_train_cat_oh), axis = 1)
X_test_zeros_cat = np.concatenate((X_test_real_scaled, X_test_cat_oh), axis = 1)

X_train_mean_cat = np.concatenate((X_train_mean_scaled, X_train_cat_oh), axis = 1)
X_test_mean_cat = np.concatenate((X_test_mean_scaled, X_test_cat_oh), axis = 1)

# Model Selection

In [None]:
%%time
estimator_2_1 = XGBClassifier()
#print('Cross val score =', cross_val_score(estimator_2_1, X_train_zeros_cat, y_train, cv = 5).mean())

#cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=1)
#scores = cross_val_score(estimator_2_1, X_train_zeros_cat, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
#print('Mean ROC AUC: %.5f' % np.mean(scores))

estimator_2_1.fit(X_train_zeros_cat, y_train)

predicted_2_1 = estimator_2_1.predict(X_test_zeros_cat)

accur_2_1 = accuracy_score(y_test, predicted_2_1.round())

print("Accuracy percent =", "%.2f%%" % (accur_2_1*100))
print('ROC AUC =', roc_auc_score(predicted_2_1, y_test))

predicted_2_1_2 = estimator_2_1.predict_proba(X_test_zeros_cat)[:, 1]
fp_rate, tp_rate, thresholds = roc_curve(y_test, predicted_2_1_2)
print('ROC-AUC Score for probabilities =', auc(fp_rate, tp_rate))

print(confusion_matrix(y_test, predicted_2_1.round()))
print(classification_report(y_test, predicted_2_1.round()))

## With missing values replaced by the column mean

In [None]:
%%time
estimator_2_1_ = XGBClassifier()
#print('Cross val score =', cross_val_score(estimator_2_1, X_train_zeros_cat, y_train, cv = 5).mean())

#cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=1)
#scores = cross_val_score(estimator_2_1, X_train_zeros_cat, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
#print('Mean ROC AUC: %.5f' % np.mean(scores))

estimator_2_1_.fit(X_train_mean_cat, y_train)


predicted_2_1_ = estimator_2_1_.predict(X_test_mean_cat)

accur_2_1_ = accuracy_score(y_test, predicted_2_1_.round())

print("Accuracy percent =", "%.2f%%" % (accur_2_1_*100))
print('ROC AUC =', roc_auc_score(predicted_2_1_, y_test))

predicted_2_1_2_ = estimator_2_1_.predict_proba(X_test_mean_cat)[:, 1]
fp_rate, tp_rate, thresholds = roc_curve(y_test, predicted_2_1_2_)
print('ROC-AUC Score for probabilities =', auc(fp_rate, tp_rate))

print(confusion_matrix(y_test, predicted_2_1_.round()))
print(classification_report(y_test, predicted_2_1_.round()))

## The f1-score is 0.05 and recall is 0.03 for class 1, so the model predicts that almost all loans will be repaid

In [None]:
%%time
estimator_3 = RandomForestClassifier()
#print('Cross val score =', cross_val_score(estimator_2_1, X_train_zeros_cat, y_train, cv = 5).mean())

#cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=1)
#scores = cross_val_score(estimator_2_1, X_train_zeros_cat, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
#print('Mean ROC AUC: %.5f' % np.mean(scores))

estimator_3.fit(X_train_zeros_cat, y_train)


predicted_3 = estimator_3.predict(X_test_zeros_cat)

accur_3 = accuracy_score(y_test, predicted_3.round())

print("Accuracy percent =", "%.2f%%" % (accur_3*100))
print('ROC AUC =', roc_auc_score(predicted_3, y_test))

predicted_3_2 = estimator_3.predict_proba(X_test_mean_cat)[:, 1]
fp_rate, tp_rate, thresholds = roc_curve(y_test, predicted_3_2)
print('ROC-AUC Score for probabilities =', auc(fp_rate, tp_rate))

print(confusion_matrix(y_test, predicted_3.round()))
print(classification_report(y_test, predicted_3.round()))

## With missing values replaced by the column mean

In [None]:
%%time
estimator_3_ = RandomForestClassifier()
#print('Cross val score =', cross_val_score(estimator_2_1, X_train_zeros_cat, y_train, cv = 5).mean())

#cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=1)
#scores = cross_val_score(estimator_2_1, X_train_zeros_cat, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
#print('Mean ROC AUC: %.5f' % np.mean(scores))

estimator_3_.fit(X_train_mean_cat, y_train)


predicted_3_ = estimator_3_.predict(X_test_mean_cat)

accur_3_ = accuracy_score(y_test, predicted_3_.round())

print("Accuracy percent =", "%.2f%%" % (accur_3_*100))
print('ROC AUC =', roc_auc_score(predicted_3_, y_test))

predicted_3_2_ = estimator_3_.predict_proba(X_test_mean_cat)[:, 1]
fp_rate, tp_rate, thresholds = roc_curve(y_test, predicted_3_2_)
print('ROC-AUC Score for probabilities =', auc(fp_rate, tp_rate))

print(confusion_matrix(y_test, predicted_3_.round()))
print(classification_report(y_test, predicted_3_.round()))

## The f1-score is 0.00 and recall is 0.00 for class 1, so the model predicts that ALL loans will be repaid. There is no significant difference between replacing the missing values with zero and with the mean. Hence, replacement with zero is selected

In [None]:
%%time
estimator_4 = LGBMClassifier()
#print('Cross val score =', cross_val_score(estimator_2_1, X_train_zeros_cat, y_train, cv = 5).mean())

#cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=1)
#scores = cross_val_score(estimator_2_1, X_train_zeros_cat, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
#print('Mean ROC AUC: %.5f' % np.mean(scores))

estimator_4.fit(X_train_zeros_cat, y_train)

predicted_4 = estimator_4.predict(X_test_zeros_cat)

accur_4 = accuracy_score(y_test, predicted_4.round())

print("Accuracy percent =", "%.2f%%" % (accur_4*100))
print('ROC AUC =', roc_auc_score(predicted_4, y_test))

predicted_4_2 = estimator_4.predict_proba(X_test_mean_cat)[:, 1]
fp_rate, tp_rate, thresholds = roc_curve(y_test, predicted_4_2)
print('ROC-AUC Score for probabilities =', auc(fp_rate, tp_rate))

print(confusion_matrix(y_test, predicted_4.round()))
print(classification_report(y_test, predicted_4.round()))

# Applying oversampling and undersampling as the target class is unbalanced. The fractions for oversampling and undersampling were selected by trial and error

In [None]:
# Base estimators wrapped by the resampling pipelines below.
model_lgbm = LGBMClassifier()
model_xgboost = XGBClassifier()
# Seeded so the forest is reproducible between runs.
model_random_forest = RandomForestClassifier(random_state=0)

# Resampling: SMOTE first raises the minority/majority ratio to 0.1, then
# random under-sampling of the majority class brings the ratio to 0.5.
# Both samplers draw random numbers, so they are seeded; without a
# random_state every run trained on a different resampled set.
over = SMOTE(sampling_strategy=0.1, random_state=0)
under = RandomUnderSampler(sampling_strategy=0.5, random_state=0)

# imblearn's Pipeline applies the samplers during fit only; predict and
# predict_proba pass data straight to the final model.
steps = [('over', over), ('under', under), ('model', model_lgbm)]
pipeline_lgbm = Pipeline(steps=steps)

steps_xgboost = [('over', over), ('under', under), ('model', model_xgboost)]
pipeline_xgboost = Pipeline(steps=steps_xgboost)

steps_random_forest = [('over', over), ('under', under), ('model', model_random_forest)]
pipeline_random_forest = Pipeline(steps=steps_random_forest)

In [None]:
%%time
pipeline_lgbm.fit(X_train_zeros_cat, y_train)

predicted_5_2 = pipeline_lgbm.predict(X_test_zeros_cat)

accur_5_2 = accuracy_score(y_test, predicted_5_2.round())

print("Accuracy percent =", "%.2f%%" % (accur_5_2*100))
print('ROC AUC =', roc_auc_score(predicted_5_2, y_test))

predicted_5_2_ = pipeline_lgbm.predict_proba(X_test_mean_cat)[:, 1]
fp_rate, tp_rate, thresholds = roc_curve(y_test, predicted_5_2_)
print('ROC-AUC Score for probabilities =', auc(fp_rate, tp_rate))

print(confusion_matrix(y_test, predicted_5_2.round()))
print(classification_report(y_test, predicted_5_2.round()))

In [None]:
from scipy.stats import randint as sp_randint, uniform as sp_uniform

# Search space for the randomised LightGBM hyper-parameter search.
# Frozen scipy distributions are sampled; plain lists are chosen from.
# sp_randint(low, high) draws integers in [low, high); sp_uniform(loc, scale)
# draws floats in [loc, loc + scale].
# NOTE(review): LightGBM documents that row subsampling only takes effect when
# the bagging frequency is non-zero -- confirm `subsample` is doing anything here.
param_test = dict(
    num_leaves=sp_randint(6, 50),
    min_child_samples=sp_randint(100, 500),
    min_child_weight=[1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
    subsample=sp_uniform(loc=0.2, scale=0.8),
    colsample_bytree=sp_uniform(loc=0.4, scale=0.6),
    reg_alpha=[0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
    reg_lambda=[0, 1e-1, 1, 5, 10, 20, 50, 100],
)

#### Reference for Hyperparameter tuning: https://www.kaggle.com/mlisovyi/lightgbm-hyperparameter-optimisation-lb-0-761

In [None]:
%%time
pipeline_lgbm_s = RandomizedSearchCV(
    estimator = model_lgbm, param_distributions = param_test, 
    n_iter = 100,
    scoring = 'roc_auc',
    cv = 3,
    refit = True,
    random_state = 123,
    verbose = True)

pipeline_lgbm_s.fit(X_train_zeros_cat, y_train)

predicted_5_2 = pipeline_lgbm_s.predict(X_test_zeros_cat)

accur_5_2 = accuracy_score(y_test, predicted_5_2.round())

print("Accuracy percent =", "%.2f%%" % (accur_5_2*100))
print('ROC AUC =', roc_auc_score(predicted_5_2, y_test))

predicted_5_2_ = pipeline_lgbm_s.predict_proba(X_test_mean_cat)[:, 1]
fp_rate, tp_rate, thresholds = roc_curve(y_test, predicted_5_2_)
print('ROC-AUC Score for probabilities =', auc(fp_rate, tp_rate))

print(confusion_matrix(y_test, predicted_5_2.round()))
print(classification_report(y_test, predicted_5_2.round()))

## One of the best performers is LightGBM with hyperparameter tuning and without over/under-sampling, with a 0.70845 ROC-AUC score on the leaderboard

In [None]:
%%time
pipeline_xgboost.fit(X_train_zeros_cat, y_train)

predicted_6 = pipeline_xgboost.predict(X_test_zeros_cat)

accur_6 = accuracy_score(y_test, predicted_6.round())

print("Accuracy percent =", "%.2f%%" % (accur_6*100))
print('ROC AUC =', roc_auc_score(predicted_6, y_test))

predicted_6_ = pipeline_xgboost.predict_proba(X_test_mean_cat)[:, 1]
fp_rate, tp_rate, thresholds = roc_curve(y_test, predicted_6_)
print('ROC-AUC Score for probabilities =', auc(fp_rate, tp_rate))

print(confusion_matrix(y_test, predicted_6.round()))
print(classification_report(y_test, predicted_6.round()))

In [None]:
%%time
pipeline_random_forest.fit(X_train_zeros_cat, y_train)

predicted_7 = pipeline_random_forest.predict(X_test_zeros_cat)

accur_7 = accuracy_score(y_test, predicted_7.round())

print("Accuracy percent =", "%.2f%%" % (accur_7*100))
print('ROC AUC =', roc_auc_score(predicted_7, y_test))

predicted_7_ = pipeline_random_forest.predict_proba(X_test_mean_cat)[:, 1]
fp_rate, tp_rate, thresholds = roc_curve(y_test, predicted_7_)
print('ROC-AUC Score for probabilities =', auc(fp_rate, tp_rate))

print(confusion_matrix(y_test, predicted_7.round()))
print(classification_report(y_test, predicted_7.round()))

In [None]:
# Build the competition test-set feature matrices with the same preprocessing
# that was applied to the training data.
X_app_test = application_test.drop(['SK_ID_CURR'], axis = 1)

numeric_app_test = X_app_test[numeric_cols]
X_real_zeros_app_test = numeric_app_test.fillna(0.0)
# Impute with the training-table column means (the values used to build
# X_real_mean) instead of statistics recomputed from the test table.
X_real_mean_app_test = numeric_app_test.fillna(X[numeric_cols].mean())

# Same categorical handling as in training: 'NA' for missing, everything as str.
X_cat_app_test = X_app_test[categorical_cols].fillna('NA').astype('str')

# Reuse the encoder fitted on the training categories; `to_dict('records')`
# gives the per-row mappings without transposing the frame.
X_cat_oh_app_test = encoder.transform(X_cat_app_test.to_dict('records'))

# The shared `scaler` object was last fitted on the mean-imputed training
# split, so applying it to zero-imputed features used the wrong statistics.
# Fit one scaler per imputation variant on the matching training split.
scaler_zeros_app_test = StandardScaler().fit(X_train_real_zeros)
scaler_mean_app_test = StandardScaler().fit(X_train_real_mean)

X_test_real_scaled_app_test = scaler_zeros_app_test.transform(X_real_zeros_app_test)

X_test_mean_scaled_app_test = scaler_mean_app_test.transform(X_real_mean_app_test)

# Scaled numeric block followed by the one-hot block, as in training.
X_test_zeros_cat_app_test = np.concatenate((X_test_real_scaled_app_test, X_cat_oh_app_test), axis = 1)

X_test_mean_cat_app_test = np.concatenate((X_test_mean_scaled_app_test, X_cat_oh_app_test), axis = 1)

In [None]:
# Preview the applicant identifier column of the competition test table.
application_test['SK_ID_CURR']

In [None]:
# Submission from the resampling LightGBM pipeline.
# `pipeline_lgbm_2` is not defined anywhere in this notebook, so the cell raised
# NameError on a fresh run; `pipeline_lgbm` is the fitted resampling pipeline.
# NOTE(review): assumed to be the intended model -- confirm.
predicted_app_test_lgbm_2 = pipeline_lgbm.predict_proba(X_test_zeros_cat_app_test)[:, 1]
submission_lgbm_2 = pd.DataFrame({
    'SK_ID_CURR': application_test['SK_ID_CURR'],
    'TARGET': predicted_app_test_lgbm_2.astype('float64'),
}).set_index('SK_ID_CURR')
# Written to its own file: the next cell also writes 'submission.csv' and
# would otherwise overwrite this output immediately.
submission_lgbm_2.to_csv('submission_lgbm_resampled.csv')

In [None]:
# Submission from the tuned LightGBM search: positive-class probability per
# applicant, keyed by SK_ID_CURR.
predicted_app_test_pipeline_lgbm_s = pipeline_lgbm_s.predict_proba(X_test_zeros_cat_app_test)[:, 1]
submission_pipeline_lgbm_s = pd.DataFrame(
    {
        'SK_ID_CURR': application_test['SK_ID_CURR'],
        'TARGET': predicted_app_test_pipeline_lgbm_s.astype('float64'),
    }
).set_index('SK_ID_CURR')
submission_pipeline_lgbm_s.to_csv('submission.csv')