In [None]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import optuna

## Loading the Data

In [None]:
# Read the training CSV and discard the "Id" column in one chained expression.
df = (
    pd.read_csv("/kaggle/input/loan-prediction-based-on-customer-behavior/Training Data.csv")
    .drop(columns="Id")
)
df.head()

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Summary statistics of the loaded frame.
df.describe()

In [None]:
# Split before any preprocessing so that nothing fitted later can see validation/test rows.
from sklearn.model_selection import train_test_split as tts

# 80% goes to train; the held-back 20% is then halved into validation and test.
train, valid_test = tts(df.copy(), random_state=42, test_size=0.2)
valid, test = tts(valid_test.copy(), random_state=42, test_size=0.5)

In [None]:
# Column names grouped by dtype: int64 columns are treated as numerical,
# object columns as categorical.

num_cols = train.select_dtypes(include="int64").columns.tolist()
cat_cols = train.select_dtypes(include="object").columns.tolist()

print(f"Numerical Columns : {num_cols}")
print(f"Categorical Columns : {cat_cols}")

## Exploratory Data Analysis

### Numerical Features

In [None]:
# visualizing the distribution of all numerical features and the label
# (DataFrame.hist draws one histogram per numeric column of the training split)

train.hist(figsize=(12, 9))
plt.show()

In [None]:
# plotting the correlation matrix using seaborn to check for correlation between numerical features

# Correlation is only defined for numeric columns. Selecting them explicitly keeps this cell
# working on pandas versions where DataFrame.corr() raises on string columns instead of
# silently skipping them.
corr_mat = train.select_dtypes(include="number").corr()
sns.heatmap(corr_mat, vmin=-1, vmax=1, center=0, annot=True)
# plt.show() renders the figure; a bare plt.plot() draws nothing and leaves "[]" as cell output
plt.show()

In [None]:
# checking for outliers
# replace 'Income' with any other numerical feature that you need to inspect and re-run the cell

plt.figure(figsize=(10,2))
sns.boxplot(data=train['Income'].values, orient='h')
# plt.show() renders the figure; a bare plt.plot() draws nothing and leaves "[]" as cell output
plt.show()

# no feature shows any outliers in the boxplot

### Categorical Features

In [None]:
def print_count_cats(df, columns):
    """Print one "<column> : <count>" line per requested column.

    The count is the number of distinct non-missing values in that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to inspect.
    columns : iterable
        Labels of the columns to report on, in print order.
    """
    for name in columns:
        n_categories = df[name].nunique()
        print(f"{name} : {n_categories}")

# Report how many distinct categories each categorical column of the training split has.
print_count_cats(train, cat_cols)

## Feature Engineering - Modifying our features

### Encoding

In [None]:
# CITY has too many categories: one hot encoding it will add 317 more features and may lead to the curse of dimensionality.
# The drop below is commented out, so CITY is currently kept and one hot encoded later on.
# train = train.drop(["CITY"], axis=1)
# valid = valid.drop(["CITY"], axis=1)
# test = test.drop(["CITY"], axis=1)

In [None]:
# Ordinal-encode 'Married/Single' (single -> 0, married -> 1).
from sklearn.preprocessing import OrdinalEncoder

# Fit on the training split only, then apply the same mapping to all three splits.
marital_enc = OrdinalEncoder(categories=[['single', 'married']])
marital_enc.fit(train['Married/Single'].to_numpy().reshape(-1, 1))
for frame in (train, valid, test):
    frame['Married/Single'] = marital_enc.transform(frame['Married/Single'].to_numpy().reshape(-1, 1))

In [None]:
# Ordinal-encode 'House_Ownership'.
# The categories may be perceived as ordered: norent_noown < rented < owned.
house_enc = OrdinalEncoder(categories=[['norent_noown', 'rented', 'owned']])
# Fit on the training split only, then apply the same mapping to all three splits.
house_enc.fit(train['House_Ownership'].to_numpy().reshape(-1, 1))
for frame in (train, valid, test):
    frame['House_Ownership'] = house_enc.transform(frame['House_Ownership'].to_numpy().reshape(-1, 1))

In [None]:
# Ordinal-encode 'Car_Ownership' (no -> 0, yes -> 1).
car_enc = OrdinalEncoder(categories=[['no', 'yes']])
# Fit on the training split only, then apply the same mapping to all three splits.
car_enc.fit(train['Car_Ownership'].to_numpy().reshape(-1, 1))
for frame in (train, valid, test):
    frame['Car_Ownership'] = car_enc.transform(frame['Car_Ownership'].to_numpy().reshape(-1, 1))

In [None]:
# prof_enc = OrdinalEncoder()
# train['Profession'] = prof_enc.fit_transform(train['Profession'].values.reshape(-1, 1))
# test['Profession'] = prof_enc.transform(test['Profession'].values.reshape(-1, 1))

In [None]:
# state_enc = OrdinalEncoder()
# train['STATE'] = prof_enc.fit_transform(train['STATE'].values.reshape(-1, 1))
# test['STATE'] = prof_enc.transform(test['STATE'].values.reshape(-1, 1))

In [None]:
# Shapes of the three splits before one hot encoding, for comparison afterwards.
for split in (train, valid, test):
    print(split.shape)

In [None]:
# one hot encoding of 'Profession', 'STATE' and 'CITY' using pandas
# The three splits are stacked so that they all receive the same dummy columns, then cut
# back apart by row position.

# Row counts are captured up front so the slice bounds never depend on a frame that has
# already been reassigned.
n_train, n_valid = len(train), len(valid)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# dtype is pinned so the dummy columns are numeric (uint8) on every pandas version.
df_combined = pd.get_dummies(pd.concat([train, valid, test]),
                             columns=['Profession', 'STATE', 'CITY'],
                             drop_first=True, dtype=np.uint8)

# Each split is an independent copy, so later column edits never touch df_combined.
train = df_combined.iloc[:n_train, :].copy()
valid = df_combined.iloc[n_train:n_train + n_valid, :].copy()
test = df_combined.iloc[n_train + n_valid:, :].copy()

In [None]:
# printing shapes of dataframes after one hot encoding to ensure spilts were made correctly
print(train.shape)
print(valid.shape)
print(test.shape)

In [None]:
# Separating the features and the label.
# Each right-hand side is evaluated in full before assignment, so the label is copied
# out of the frame before the frame is rebound to its label-free version.

train_y, train = train['Risk_Flag'].copy(), train.drop(columns='Risk_Flag')
valid_y, valid = valid['Risk_Flag'].copy(), valid.drop(columns='Risk_Flag')
test_y, test = test['Risk_Flag'].copy(), test.drop(columns='Risk_Flag')

### Handling Imbalanced Dataset

In [None]:
# oversampling

from collections import Counter
from imblearn.over_sampling import RandomOverSampler

# Randomly duplicate minority-class rows until minority / majority = 0.25.
# random_state is fixed so that a fresh "Run All" resamples the same rows every time.
sampler = RandomOverSampler(sampling_strategy=0.25, random_state=42)

print("Before sampling: " + str(Counter(train_y)))
train, train_y = sampler.fit_resample(train, train_y)
print("After sampling: " + str(Counter(train_y)))

In [None]:
# undersampling

from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

# Randomly discard majority-class rows until both classes have the same size (ratio 1.0).
# random_state is fixed so that a fresh "Run All" keeps the same rows every time.
sampler = RandomUnderSampler(sampling_strategy=1.0, random_state=42)

print("Before sampling: " + str(Counter(train_y)))
train, train_y = sampler.fit_resample(train, train_y)
print("After sampling: " + str(Counter(train_y)))

In [None]:
# # SMOTE

# from collections import Counter
# from imblearn.over_sampling import SMOTE

# smote = SMOTE()

# print("Before sampling: " + str(Counter(train_y)))
# train, train_y = smote.fit_resample(train, train_y)
# print("After sampling: " + str(Counter(train_y)))

### Standardization

In [None]:
# Standardize every feature column.
from sklearn.preprocessing import StandardScaler

# The scaler learns its statistics from the training split only and is then applied
# unchanged to all three splits.
scaler = StandardScaler().fit(train)
train_X, valid_X, test_X = (scaler.transform(split) for split in (train, valid, test))

## Creating Models

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, roc_auc_score, recall_score

# hyperparameter tuning of xgboost using optuna 

def objective(trial):
    """Optuna objective: fit an XGBoost classifier and score it on the validation split.

    Hyperparameters are sampled from `trial`, the model is trained on the notebook-level
    `train_X` / `train_y`, and the ROC AUC on `valid_X` / `valid_y` is returned.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on the validation split.
        Scores from runs that ranked hard 0/1 predictions instead are not comparable.
    """
    # setting search space
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
    reg_lambda = trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True)
    reg_alpha = trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 7)
    n_estimators = trial.suggest_int("n_estimators", 1000, 5000)

    # defining the model
    clf = XGBClassifier(learning_rate=learning_rate,
                        reg_lambda=reg_lambda,
                        reg_alpha=reg_alpha,  # was sampled but never handed to the model
                        subsample=subsample,
                        colsample_bytree=colsample_bytree,
                        tree_method='gpu_hist', predictor="gpu_predictor", # using gpu to speed up the process
                        max_depth=max_depth,
                        n_estimators=n_estimators,
                        use_label_encoder=False,
                        # set on the estimator: newer xgboost releases reject eval_metric in fit()
                        eval_metric='logloss',
                        random_state=42)

    clf.fit(train_X, train_y)

    # ROC AUC ranks samples by score, so it needs the positive-class probability;
    # hard 0/1 predictions collapse the curve to a single operating point.
    valid_scores = clf.predict_proba(valid_X)[:, 1]
    score = roc_auc_score(valid_y, valid_scores)

    return score

In [None]:
# # This cell is run only when hypertuning using OPTUNA

# # creating study object
# study = optuna.create_study(direction="maximize")

# # optimising the study object
# study.optimize(objective, n_trials=100)

# # Print the result
# best_params = study.best_params
# best_score = study.best_value
# print(f"Best score: {best_score}\n")
# print(f"Optimized parameters: {best_params}\n")

In [None]:
# Optuna results

# Best score: 0.7530063367504715
# scored on roc auc
# Best hyperparameters reported by the Optuna study, passed to XGBClassifier as keyword arguments.
params = dict(
    learning_rate=0.22657857685769822,
    reg_lambda=5.8263201980534444e-08,
    reg_alpha=6.30631361255538e-05,
    subsample=0.3678211180860871,
    colsample_bytree=0.8672653738124343,
    max_depth=7,
    n_estimators=4973,
)


# performance on test set
# [[20760  1362]
#  [ 1405  1673]]
# recall: 0.5435347628330085
# precision: 0.5512355848434926
# f1_score: 0.5473580893178474
# accuracy_score: 0.8901984126984127
# ROC AUC score: 0.74098354632022

In [None]:
# measuring performance of our model on the test set
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score, roc_auc_score


# eval_metric is set on the estimator: newer xgboost releases reject it as a fit() argument
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss',
                    tree_method='gpu_hist', **params)

xgb.fit(train_X, train_y)

# The validation split steered the hyperparameter search, so the report is computed on the
# test split, which no earlier step has used for fitting or tuning.
y_pred = xgb.predict(test_X)
# ROC AUC ranks samples by score, so it gets the positive-class probability, not 0/1 labels
y_score = xgb.predict_proba(test_X)[:, 1]

cm = confusion_matrix(test_y, y_pred)
print(cm)
print("recall: " + str(recall_score(test_y, y_pred)))
print("precision: " + str(precision_score(test_y, y_pred)))
print("f1_score: " + str(f1_score(test_y, y_pred)))
print("accuracy_score: " + str(accuracy_score(test_y, y_pred)))
print("ROC AUC score: " + str(roc_auc_score(test_y, y_score)))

In [None]:
# using optuna for lightgbm hypertuning

In [None]:
# using lightGBM

# from lightgbm import LGBMClassifier

# lgbm_clf = LGBMClassifier()
# lgbm_clf.fit(train_X, train_y)
# y_pred = lgbm_clf.predict(test_X)

# cm = confusion_matrix(test_y, y_pred)
# print(cm)
# print("recall: " + str(recall_score(test_y, y_pred)))
# print("precision: " + str(precision_score(test_y, y_pred)))
# print("f1_score: " + str(f1_score(test_y, y_pred)))
# print("accuracy_score: " + str(accuracy_score(test_y, y_pred)))
# print("ROC AUC score: " + str(roc_auc_score(test_y, y_pred)))

In [None]:
# Serialize the fitted classifier into the working directory.

import pickle

filename = 'xgb_classifier.pkl'
with open(filename, mode='wb') as pkl_file:
    pickle.dump(xgb, pkl_file)

## Creating the final pipeline

### Creating fresh datasets to train and test the pipeline

In [None]:
# Creating new dataset splits
final_train, final_test = tts(df.copy(), test_size=0.1, random_state=42)

# separating the features and the label
final_train_y = final_train['Risk_Flag'].copy()
final_train_X = final_train.drop('Risk_Flag', axis=1)
final_test_y = final_test['Risk_Flag'].copy()
final_test_X = final_test.drop('Risk_Flag', axis=1)

# Sampling the final training set only; the test set keeps its original class balance.
# Both samplers are seeded so that a fresh "Run All" produces the same training rows.
oversampler = RandomOverSampler(sampling_strategy=0.25, random_state=42)
print("Before over sampling: " + str(Counter(final_train_y)))
final_train_X, final_train_y = oversampler.fit_resample(final_train_X, final_train_y)
print("After over sampling: " + str(Counter(final_train_y)))

undersampler = RandomUnderSampler(sampling_strategy=1.0, random_state=42)
print("Before under sampling: " + str(Counter(final_train_y)))
final_train_X, final_train_y = undersampler.fit_resample(final_train_X, final_train_y)
print("After under sampling: " + str(Counter(final_train_y)))

In [None]:
# Shape of the resampled training features for the final pipeline.
final_train_X.shape

In [None]:
# Shape of the test features for the final pipeline (not resampled).
final_test_X.shape

In [None]:
from sklearn.base import clone
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

marital_enc = OrdinalEncoder(categories=[['single', 'married']])

# order between the categories may be perceived as norent_noown < rented < owned
house_enc = OrdinalEncoder(categories=[['norent_noown', 'rented', 'owned']])

car_enc = OrdinalEncoder(categories=[['no', 'yes']])

# The `sparse=` keyword is deliberately not passed: scikit-learn renamed it to
# `sparse_output` in 1.2 and removed the old name in 1.4. Dense output is instead forced
# once, on the ColumnTransformer below, which works on old and new releases alike.
# handle_unknown='error' means transform raises on a category that was not seen during fit.
enc1hot_profession = OneHotEncoder(handle_unknown='error', drop='first')
enc1hot_state = OneHotEncoder(handle_unknown='error', drop='first')
enc1hot_city = OneHotEncoder(handle_unknown='error', drop='first')


scaler = StandardScaler()

transformers = [
    ('marital_enc', marital_enc, ['Married/Single']),
    ('house_enc', house_enc, ['House_Ownership']),
    ('car_enc', car_enc, ['Car_Ownership']),
    ('enc1hot_profession', enc1hot_profession, ['Profession']),
    ('enc1hot_state', enc1hot_state, ['STATE']),
    ('enc1hot_city', enc1hot_city, ['CITY']),
]

# sparse_threshold=0 always returns a dense array, which StandardScaler needs in order to
# centre the features. Columns not listed above are passed through unchanged.
encoding_transformer = ColumnTransformer(transformers=transformers,
                                        remainder='passthrough',
                                        sparse_threshold=0, n_jobs=-1)

steps = [
    ('encoding_transformer', encoding_transformer),
    ('scaler', scaler),
    # clone() gives the pipeline its own unfitted estimator with the same hyperparameters,
    # so fitting the pipeline does not refit the standalone `xgb` model in place
    ('xgb_clf', clone(xgb))
]

full_pipeline = Pipeline(steps=steps, verbose=True)

In [None]:
# training using the full pipeline
# (encoders, scaler and classifier are all fitted on the resampled final training set)

full_pipeline.fit(final_train_X, final_train_y)

In [None]:
# testing the performance of our pipeline

final_y_pred = full_pipeline.predict(final_test_X)
# ROC AUC ranks samples by score, so it gets the positive-class probability, not 0/1 labels
final_y_score = full_pipeline.predict_proba(final_test_X)[:, 1]

cm = confusion_matrix(final_test_y, final_y_pred)
print(cm)
print("Recall: " + str(recall_score(final_test_y, final_y_pred)))
print("Precision: " + str(precision_score(final_test_y, final_y_pred)))
print("F1 Score: " + str(f1_score(final_test_y, final_y_pred)))
print("Accuracy Score: " + str(accuracy_score(final_test_y, final_y_pred)))
print("ROC AUC score: " + str(roc_auc_score(final_test_y, final_y_score)))

In [None]:
# Serialize the fitted pipeline (preprocessing + classifier) into the working directory.

filename = 'full_pipeline.pkl'
with open(filename, mode='wb') as pkl_file:
    pickle.dump(full_pipeline, pkl_file)