In [None]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score

In [None]:
# Load the training data from the Kaggle input directory
train_path = '../input/loan-default-prediction/train_v2.csv.zip'
df = pd.read_csv(train_path)
df.head()

In [None]:
# (rows, columns) of the raw training frame
df.shape

In [None]:
# Shape of the sub-frame made of fully duplicated rows (0 rows means no duplicates)
is_duplicate = df.duplicated()
df.loc[is_duplicate].shape

In [None]:
# Number of columns per dtype in the dataframe
df.dtypes.value_counts()

In [None]:
# Distribution of the target 'loss' (y-axis clipped so the non-zero bins stay visible)
fig, ax = plt.subplots()
ax.hist(df['loss'], bins=20, range=(0, 100))
ax.set_ylim(0, 3000)
plt.show()

**Handling Missing Values**

In [None]:
# Fraction of missing values in each row
df['num_missing'] = df.isnull().sum(axis = 1)/df.shape[1]

# Drop rows where more than 20% of the values are missing.
# `missing_row` holds index *labels*, so drop by label directly; using
# df.index[missing_row] would treat the labels as positions, which is only
# correct while the index is still an untouched RangeIndex.
missing_row = df[df['num_missing'] > 0.20].index
df.drop(index = missing_row, inplace = True)
df.shape

In [None]:
# Remove the identifier and the helper 'num_missing' column; neither is a feature
helper_cols = ['id','num_missing']
df = df.drop(columns = helper_cols)

In [None]:
# Percentage of missing values per column; only columns with at least one NaN are kept
missing_pct = df.isnull().mean() * 100
col_pct_miss = [[col, pct] for col, pct in missing_pct.items() if pct > 0]

col_pct_miss_df = (
    pd.DataFrame(col_pct_miss, columns = ['column_name','% of Missing'])
    .sort_values(by = '% of Missing', ascending = False)
)
col_pct_miss_df

In [None]:
# Fill NaN in numeric columns with the column median
numeric_cols = df.select_dtypes(include=['number']).columns.values

# Columns known to contain NaN (computed once instead of per iteration)
cols_with_missing = set(col_pct_miss_df.column_name)

for col in numeric_cols:
    if col not in cols_with_missing:
        continue
    med = df[col].median()
    df[col] = df[col].fillna(med)

In [None]:
# Fill NaN in non-numeric columns with the most frequent value (mode)
not_numeric_cols = df.select_dtypes(exclude=['number']).columns.values

# Columns known to contain NaN (computed once instead of per iteration)
cols_with_missing = set(col_pct_miss_df.column_name)

for col in not_numeric_cols:
    if col not in cols_with_missing:
        continue
    mode = df[col].mode()[0]
    df[col] = df[col].fillna(mode)

In [None]:
# Sanity check: distribution of per-column NaN counts (a single entry "0" means nothing is missing)
df.isnull().sum().value_counts()

**Drop Unnecessary Columns**

In [None]:
# Drop Highly Correlated Columns

# Correlation is only defined for numeric columns; select them explicitly so the
# object-typed categorical columns cannot make df.corr() raise on newer pandas.
corr_matrix = df.select_dtypes(include=['number']).corr().abs()

# Select upper triangle of correlation matrix
# (np.bool was removed in NumPy 1.24; the builtin bool is the supported dtype)
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Columns whose correlation with an earlier column exceeds 0.95.
# The target 'loss' is never dropped: later cells require it.
to_drop = [
    column for column in upper.columns
    if column != 'loss' and (upper[column] > 0.95).any()
]

df.drop(columns = to_drop, inplace = True)

In [None]:
# Report repetitive columns: features whose most frequent value covers more than 80% of rows.
# NOTE(review): despite the original "Drop" heading, this cell only builds the report;
# no column is removed from df here.
num_rows = df.shape[0]
rep_cols = []

feature_cols = [name for name in df.columns if name != 'loss']
for col in feature_cols:
    # Share of rows taken by the single most common value
    top_pct = df[col].value_counts().iloc[0] / num_rows
    if top_pct > 0.80:
        rep_cols.append([col, top_pct])

rep_col_df = (
    pd.DataFrame(rep_cols, columns = ['column_name','% top repetitve value'])
    .sort_values(by = '% top repetitve value', ascending = False)
    .reset_index(drop=True)
)
rep_col_df

df.shape

**Encoding Category Columns**

As you can see, there are many attributes containing more than 20,000 categories.
So, I decided to drop them and convert the remaining categorical attributes to binary-encoded form.

In [None]:
# Split the non-numeric columns by cardinality: more than 20,000 distinct values -> drop list,
# otherwise -> keep list (these are binary-encoded in the next cell)
cat_cols = df.select_dtypes(exclude=['number']).columns.values

drop_cols = []
keep_cols = []
for col in cat_cols:
    n_categories = df[col].nunique()
    if n_categories > 20000:
        print('column {} has {} categories > drop'.format(col, n_categories))
        drop_cols.append(col)
        continue
    print('column {} has {} categories > keep'.format(col, n_categories))
    keep_cols.append(col)

In [None]:
# Binary Encoding
# Fit a BinaryEncoder on the low-cardinality categorical columns (keep_cols) of the training frame.
import category_encoders as ce
encoder = ce.BinaryEncoder(cols = keep_cols)
bi_enc_df = encoder.fit_transform(df[keep_cols])
# Names of the generated binary columns; later cells use them to separate encoded from numeric features
bi_col_name = bi_enc_df.columns
bi_enc_df.head()

# Append the binary-encoded columns to the dataframe.
# NOTE: the original categorical columns are NOT dropped here; df keeps both.
df = pd.concat([df,bi_enc_df],axis = 1)
df.head()

In [None]:
# Binary target: loan_status is 1 when the loan defaulted (loss > 0) and 0 otherwise
is_default = df['loss'] > 0
df['loan_status'] = is_default.astype(int)
df.head()

In [None]:
# Visualize the class balance of loan_status: the data is imbalanced
# (far more non-default than default loans).

ax = sns.countplot(data=df, x='loan_status')
plt.show()

df['loan_status'].value_counts()

**Undersampling**

To balance the loan_status classes, I apply undersampling to address the imbalanced-dataset problem.

In [None]:
#Resampling Data
from sklearn.utils import resample

# Separate each target class into its own dataframe
not_default = df[df['loan_status'] == 0]
default = df[df['loan_status'] == 1]

# Undersample both classes to the size of the smaller one. Deriving the size
# from the data (instead of a hard-coded count) keeps the cell valid when the
# upstream row filtering changes; with replace=False a too-large n_samples
# would raise.
n_per_class = min(len(default), len(not_default))

resample_df_d = resample(default,
                       replace = False,
                       n_samples = n_per_class,
                       random_state = 1234)

resample_df_n = resample(not_default,
                       replace = False,
                       n_samples = n_per_class,
                       random_state = 1234)

# Balanced frame: non-default rows followed by default rows
resample_df = pd.concat([resample_df_n, resample_df_d])

**Split Train Set & Test Set**

In [None]:
from sklearn.model_selection import train_test_split

# Features: everything except the two target-derived columns
X = resample_df.drop(columns = ['loss','loan_status'])
# NOTE(review): the target is the raw 'loss' value, not the binary 'loan_status',
# although the models below are classifiers and the write-up talks about predicting
# default. Each distinct loss value is therefore treated as its own class — confirm
# that this is intended.
Y = resample_df['loss']

# 80/20 split, stratified on the binary default flag so both sets stay balanced
X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size = 0.2, random_state = 1234, stratify = resample_df['loan_status'])
print('training set = {} records, test set= {} records'.format(X_train.shape[0],X_test.shape[0]))

**Feature Selection - Filter Method & RFE Method**

To increase model accuracy, I performed feature selection using two techniques.
First, I filtered out the less important numerical and non-numerical features with a filter method. Then I selected the top 150 numerical columns using recursive feature elimination (RFE).

In [None]:
from sklearn.feature_selection import SelectPercentile , SelectKBest, f_regression , f_classif

In [None]:
#Select top 170 important numerical columns with filter method
# Remove the binary-encoded columns AND restrict to numeric dtypes: X_train still
# carries the raw object-typed categorical columns, which are not numerical
# features and are dropped again later by filter_x_df (selecting one of them here
# would make that function fail with a KeyError).
X_train_num = X_train.drop(columns = bi_col_name).select_dtypes(include = ['number'])

selector = SelectKBest(score_func = f_regression, k = 170)
selector.fit(X_train_num,Y_train)

# Positional indices of the selected columns
select_cols = selector.get_support(indices = True)
select_num_cols = X_train_num.iloc[:,select_cols]

select_num_col_name = select_num_cols.columns
select_num_cols.head()

In [None]:
# Recursive feature elimination: keep the 150 strongest of the pre-filtered numeric columns
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# One feature is eliminated per iteration (step=1); verbose=2 logs the progress
selector = RFE(
    LogisticRegression(),
    n_features_to_select=150,
    step=1,
    verbose = 2,
).fit(select_num_cols, Y_train)

select_cols = selector.get_support(indices = True)
select_cols_df = select_num_cols.iloc[:, select_cols]

# Final list of numeric feature names used by the models
best_X_col_name = select_cols_df.columns
select_cols_df.head()

In [None]:
# Pick the 5 most informative categorical columns with a univariate filter (ANOVA F-test)

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder

# Work on a copy of the non-numeric training columns
X_train_cat = X_train.select_dtypes(exclude = 'number').copy()

# Integer-code every column (values are cast to str first)
le = LabelEncoder()
X_train_cat = X_train_cat.apply(lambda col: le.fit_transform(col.astype(str)), axis=0, result_type='expand')

# Ordinal-encode the integer codes to obtain the numeric input matrix
oe = OrdinalEncoder()
X_train_cat_enc = oe.fit(X_train_cat).transform(X_train_cat)

selector = SelectKBest(score_func = f_classif , k=5).fit(X_train_cat_enc, Y_train)

# Map the selected positions back to column names
select_cols = selector.get_support(indices = True)
select_cat_cols = X_train_cat.iloc[:, select_cols]

select_cat_col_name = select_cat_cols.columns
select_cat_cols.head()

In [None]:
#Combine categorical and non-categorical dataframe together
def filter_x_df(x, num_cols=None, selected_cats=None, raw_cat_cols=None):
    """Build the model input frame from a feature dataframe.

    The result contains the selected numeric columns followed by the
    binary-encoded columns (``<name>_<digit(s)>``) of the selected categorical
    attributes. The raw categorical columns are removed.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature frame (e.g. X_train or X_test); it is not modified.
    num_cols : iterable of str, optional
        Numeric columns to keep. Defaults to the notebook-level ``best_X_col_name``.
    selected_cats : iterable of str, optional
        Names of the selected categorical attributes. Defaults to the entries of
        the notebook-level ``select_cat_col_name`` that are in ``keep_cols``.
    raw_cat_cols : iterable of str, optional
        Raw categorical columns to drop. Defaults to the notebook-level ``cat_cols``.

    Returns
    -------
    pandas.DataFrame
        ``x`` restricted to ``num_cols`` plus the matching encoded columns.
    """
    import re

    if num_cols is None:
        num_cols = best_X_col_name
    if selected_cats is None:
        selected_cats = select_cat_col_name[select_cat_col_name.isin(keep_cols)]
    if raw_cat_cols is None:
        raw_cat_cols = cat_cols

    # drop() returns a new frame, so the caller's dataframe stays untouched
    frame = x.drop(columns = list(raw_cat_cols))

    all_filter_col = []
    for keep in selected_cats:
        # Match exactly "<keep>_<digits>". A bare startswith(keep) would also pick
        # up unrelated columns sharing the prefix (e.g. 'f1' matching 'f12').
        pattern = re.compile(r'{}_\d+'.format(re.escape(str(keep))))
        all_filter_col.extend(col for col in frame.columns if pattern.fullmatch(str(col)))

    return pd.concat([frame[list(num_cols)], frame[all_filter_col]], axis = 1)

In [None]:
# Reduce both splits to the selected feature columns with filter_x_df
filter_X_train, filter_X_test = (filter_x_df(split) for split in (X_train, X_test))

In [None]:
# Preview the filtered training features
filter_X_train.head()

**Data Standardization**

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply the SAME mean/std
# to the test features. Calling fit_transform on the test set would re-fit the
# scaler on test data, so train and test would be scaled inconsistently.
scaler = StandardScaler()

X_train_scal = scaler.fit_transform(filter_X_train)
X_test_scal = scaler.transform(filter_X_test)

**K-nearest neighbors**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# K-nearest neighbours with default hyper-parameters, trained on the standardized features
neigh = KNeighborsClassifier()
neigh.fit(X_train_scal, Y_train)

# Predictions on both splits
Knn_y_pred_train, Knn_y_pred_test = neigh.predict(X_train_scal), neigh.predict(X_test_scal)

# 5-fold cross-validation on the training split
scores_kn = cross_val_score(neigh, X_train_scal, Y_train, cv=5)
print('Cross Validation Score:', np.mean(scores_kn))

**Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default hyper-parameters on the standardized features
logisticRegr = LogisticRegression().fit(X_train_scal, Y_train)

# Predictions on both splits
Lr_y_pred_train, Lr_y_pred_test = logisticRegr.predict(X_train_scal), logisticRegr.predict(X_test_scal)

# 5-fold cross-validation on the training split
scores_lr = cross_val_score(logisticRegr, X_train_scal, Y_train, cv=5)
print('Cross Validation Score:', np.mean(scores_lr))

**Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the unscaled features.
# random_state is fixed (same seed as the resampling and split cells) so the fitted
# model, its feature importances and the exported submission are reproducible.
rf = RandomForestClassifier(max_depth = 70, random_state = 1234)
rf.fit(filter_X_train, Y_train)

# Predictions on both splits
rf_y_pred_train =  rf.predict(filter_X_train)
rf_y_pred_test =  rf.predict(filter_X_test)

# 5-fold cross-validation on the training split
scores_rf = cross_val_score(estimator = rf, y = Y_train, X = filter_X_train, cv=5)
print('Cross Validation Score:', np.mean(scores_rf))

**XGboost**

In [None]:
from  xgboost import XGBClassifier

# Gradient-boosted trees on the unscaled features
xgb = XGBClassifier(gamma=0, learning_rate=0.1, max_depth=100, n_estimators=100)
xgb.fit(filter_X_train,Y_train)

# Predictions on both splits
xgb_y_pred_train = xgb.predict(filter_X_train)
xgb_y_pred_test = xgb.predict(filter_X_test)

# Cross-validate the XGBoost estimator itself (the original passed `rf` here,
# so the reported score was the random forest's, not XGBoost's)
scores_xg = cross_val_score(estimator = xgb, y = Y_train, X = filter_X_train, cv=5)
print('Cross Validation Score:', np.mean(scores_xg))

# Summary

After evaluating my models with k-fold cross-validation, I found that Random Forest gives the highest score (0.5000) in predicting loan default, but it is not significantly different from the XGBoost and Logistic Regression scores.

To improve model performance, I suggest training the Random Forest model on an oversampled dataset using the Synthetic Minority Oversampling Technique (SMOTE), an oversampling technique in which synthetic samples are generated for the minority class. I have done that with a Logistic Regression model before; it gave a better accuracy score but consumed a lot of memory during processing.



In [None]:
# Importing SMOTE
from imblearn.over_sampling import SMOTE

# Oversample the minority class on the numeric/encoded columns only.
# NOTE(review): 'loss' is still part of the feature matrix passed to SMOTE, so
# synthetic rows get a synthetic 'loss' value that is later used as the target.
# NOTE(review): oversampling happens before the train/test split, so the later
# test set also contains synthetic rows — confirm this is acceptable.
df_sm = df.drop(columns = cat_cols)
smote = SMOTE(random_state = 1234)

features_sm = df_sm.drop(columns = 'loan_status')
labels_sm = df_sm['loan_status']
X_sm, Y_sm = smote.fit_resample(features_sm, labels_sm)

for message, label in (("After OverSampling, counts of label '1': {}", 1),
                       ("After OverSampling, counts of label '0': {}", 0)):
    print(message.format(sum(Y_sm == label)))

In [None]:
# Split the oversampled data into train and test sets
from sklearn.model_selection import train_test_split

# 'loss' travels inside X_sm; separate it out as the target
X = X_sm.drop(columns = 'loss')
Y = X_sm['loss']

# 80/20 split, stratified on the (balanced) default flag
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    test_size = 0.2,
    random_state = 1234,
    stratify = Y_sm,
)
print('training set = {} records, test set= {} records'.format(X_train.shape[0],X_test.shape[0]))

In [None]:
# Univariate filter on the oversampled training split: keep the 170 best numeric columns
X_train_num = X_train.drop(columns = bi_col_name)

selector = SelectKBest(score_func = f_regression, k = 170).fit(X_train_num, Y_train)

# Positional indices of the selected columns
select_cols_sm = selector.get_support(indices = True)
select_num_cols_sm = X_train_num.iloc[:, select_cols_sm]

select_num_col_name = select_num_cols_sm.columns
select_num_cols_sm.head()

In [None]:
# Recursive feature elimination on the oversampled split: keep 150 of the pre-filtered columns
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# One feature is eliminated per iteration (step=1); verbose=2 logs the progress
selector = RFE(
    LogisticRegression(),
    n_features_to_select=150,
    step=1,
    verbose = 2,
).fit(select_num_cols_sm, Y_train)

select_cols_sm = selector.get_support(indices = True)
select_cols_df_sm = select_num_cols_sm.iloc[:, select_cols_sm]

# Final numeric feature names for the SMOTE experiment
best_X_col_name_sm = select_cols_df_sm.columns
select_cols_df_sm.head()

In [None]:
# Combine categorical and non-categorical dataframe together
def filter_x_df_sm(x, num_cols=None, selected_cats=None):
    """Build the model input frame for the SMOTE experiment.

    The result contains the selected numeric columns followed by the
    binary-encoded columns (``<name>_<digit(s)>``) of the selected categorical
    attributes.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature frame (e.g. the oversampled X_train or X_test); it is not modified.
    num_cols : iterable of str, optional
        Numeric columns to keep. Defaults to the notebook-level ``best_X_col_name_sm``.
    selected_cats : iterable of str, optional
        Names of the selected categorical attributes. Defaults to the entries of
        the notebook-level ``select_cat_col_name`` that are in ``keep_cols``.

    Returns
    -------
    pandas.DataFrame
        ``x`` restricted to ``num_cols`` plus the matching encoded columns.
    """
    import re

    if num_cols is None:
        num_cols = best_X_col_name_sm
    if selected_cats is None:
        selected_cats = select_cat_col_name[select_cat_col_name.isin(keep_cols)]

    # The raw categorical names themselves must never be selected
    raw_names = {str(keep) for keep in selected_cats}

    all_filter_col = []
    for keep in selected_cats:
        # Match exactly "<keep>_<digits>". A bare startswith(keep) would also pick
        # up unrelated columns sharing the prefix (e.g. 'f1' matching 'f12').
        pattern = re.compile(r'{}_\d+'.format(re.escape(str(keep))))
        all_filter_col.extend(
            col for col in x.columns
            if pattern.fullmatch(str(col)) and str(col) not in raw_names
        )

    return pd.concat([x[list(num_cols)], x[all_filter_col]], axis = 1)

In [None]:
# Reduce both oversampled splits to the selected feature columns with filter_x_df_sm
filter_X_train_sm, filter_X_test_sm = (filter_x_df_sm(split) for split in (X_train, X_test))

In [None]:
#Standardize
# Fit the scaler on the training features only and reuse it for the test
# features; fit_transform on the test set would re-fit the scaler on test data
# and scale the two splits inconsistently.
scaler = StandardScaler()

X_train_scal_sm = scaler.fit_transform(filter_X_train_sm)
X_test_scal_sm = scaler.transform(filter_X_test_sm)

In [None]:
# Logistic regression on the standardized, SMOTE-oversampled features
logisticRegr_sm = LogisticRegression().fit(X_train_scal_sm, Y_train)

# Predictions on both splits
Lr_y_pred_train_sm, Lr_y_pred_test_sm = (
    logisticRegr_sm.predict(X_train_scal_sm),
    logisticRegr_sm.predict(X_test_scal_sm),
)

# 5-fold cross-validation on the training split
scores_lr_sm = cross_val_score(logisticRegr_sm, X_train_scal_sm, Y_train, cv=5)
print('Cross Validation Score:', np.mean(scores_lr_sm))

**Handling too many categories**

As we can see, there are many attributes containing more than 20,000 categories.
Although we applied filter-based feature selection, some of the categories in the selected attributes might not be significantly different from one another and could cause prediction errors. To reduce the number of categories, the bank should redesign its data collection format.


In [None]:
# List the non-numeric columns that have more than 20,000 distinct values
cat_cols = df.select_dtypes(exclude=['number']).columns.values

for col in cat_cols:
    n_categories = df[col].nunique()
    if n_categories <= 20000:
        continue
    print('Column {} has {} categories'.format(col, n_categories))

**Handling irrelevant columns**

The last way to improve model performance is to drop outliers and less relevant columns from the training dataset based on their importance scores. However, dropping all outliers from a large dataset might not be a good idea; you can instead drop only the outliers in the important features.


In [None]:
# Rank the random-forest feature importances and print the 50 highest
featurename = filter_X_train.columns
importances = list(rf.feature_importances_)

# (feature, importance rounded to 3 decimals) pairs, highest importance first
feature_importances = [
    (feature, round(importance, 3))
    for feature, importance in zip(featurename, importances)
]
feature_importances.sort(key = lambda pair: pair[1], reverse = True)

print('Top 50 Importance Features\n')
for pair in feature_importances[:50]:
    print('Variable: {} Importance Score: {}'.format(*pair))

**Business Suggestion**

Once the model performance reaches the desired level, the bank could offer a personalized lending rate calculated from the model's predicted score.

According to the research referenced below, a lack of financial literacy is a cause of loan default. After exploring the dataset, I found that more than 90% of the customers' loans are not in default. Therefore, I would suggest that the bank encourage its borrowers to make more use of its financial advisory service.
Besides reducing the loan default rate, the bank would gain more customer insight that it can use in model training.


Reference : https://www.bot.or.th/Thai/MonetaryPolicy/MonetPolicyComittee/MPR/BOX_MPR/BOX_2_3_MPR_TH_Mar19.pdf

**Sample Submission**

In [None]:
# Load the competition test set from the Kaggle input directory
test_df = pd.read_csv('../input/loan-default-prediction/test_v2.csv.zip')
test_df.head()

In [None]:
# Binary Encoding
# Reuse the encoder that was fitted on the training data. Fitting a new encoder
# on the test set would assign different binary codes to the same category (and
# possibly a different number of columns), so the encoded test features would
# not match what the models were trained on.
bi_enc_df = encoder.transform(test_df[keep_cols])
bi_col_name = bi_enc_df.columns

# Append the encoded columns to the test frame
test_df = pd.concat([test_df,bi_enc_df],axis = 1)
test_df.head()

In [None]:
# Keep only 'id' plus the feature columns the models were trained on (same order as filter_X_train)
submission_cols = ['id', *filter_X_train.columns]
select_test_df = test_df.loc[:, submission_cols].copy()
select_test_df.head()

In [None]:
# Distribution of per-column NaN counts in the selected test features (before imputation)
select_test_df.isnull().sum().value_counts()

In [None]:
# Impute missing values in the test features with statistics from the training frame `df`
numeric_cols = select_test_df.select_dtypes(include=['number']).columns.values

for col in numeric_cols:
    # Impute every numeric column that exists in the training frame, not only the
    # ones that happened to contain NaN in training; otherwise test-only gaps
    # would fall through to the zero-fill further down. 'id' is skipped because
    # it was removed from `df`.
    if col in df.columns:
        med = df[col].median()
        select_test_df[col] = select_test_df[col].fillna(med)

not_numeric_cols = select_test_df.select_dtypes(exclude=['number']).columns.values

for col in not_numeric_cols:
    if col in df.columns:
        mode = df[col].mode()
        # mode() is empty when the training column holds no values at all
        if not mode.empty:
            select_test_df[col] = select_test_df[col].fillna(mode[0])

In [None]:
# Distribution of per-column NaN counts in the selected test features (after imputation)
select_test_df.isnull().sum().value_counts()

In [None]:
# Names of the test columns that still contain at least one NaN
nan_columns = select_test_df.isna().any()
columns_with_nan = list(select_test_df.columns[nan_columns])
columns_with_nan

In [None]:
# Fallback: replace any NaN still left in those columns with zero
select_test_df[columns_with_nan] = select_test_df[columns_with_nan].fillna(0)

In [None]:
# Preview the fully imputed test features
select_test_df.head()

In [None]:
# Predict 'loss' for the test set with the random forest ('id' is not a feature)
test_features = select_test_df.drop(columns = 'id')

test_df_rf = test_df.copy()
test_df_rf['loss'] = rf.predict(test_features)
test_df_rf.head()

In [None]:
# Write the random-forest predictions as a two-column submission file (id, loss)
sample_submission = test_df_rf.loc[:, ['id','loss']]
sample_submission.to_csv('sample_submission_rf.csv', index = False)