In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# %pip (unlike !pip) always targets the environment of the running kernel
%pip install --upgrade pip

In [None]:
# %pip (unlike !pip) always targets the environment of the running kernel
%pip install sweetviz

In [None]:
# %pip (unlike !pip) always targets the environment of the running kernel
%pip install pandas_profiling

In [None]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)

# Data Viz
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
#%config InlineBackend.figure_format = 'svg'
import plotly.express as px
import plotly.offline as pyo
import plotly.graph_objs as go

# Stats & ML
from scipy import stats
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.metrics import balanced_accuracy_score
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.preprocessing import RobustScaler
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Auto ML
import sweetviz as sv
from pandas_profiling import ProfileReport
#import scorecardpy as sc

# Supress Warnings
import warnings
warnings.filterwarnings('ignore')


In [None]:
# read/load dataset
# The dataset directory is defined once so both reads stay in sync if it moves.
DATA_DIR = '../input/credit-card-approval-prediction'
application = pd.read_csv(f'{DATA_DIR}/application_record.csv')
credit = pd.read_csv(f'{DATA_DIR}/credit_record.csv')

In [None]:
application.head()

In [None]:
credit.head()

In [None]:
# Helper Function - Missing data check
def missing_data(data):
    """Summarise null counts for every column of ``data``.

    Returns a DataFrame indexed by column name with four columns:
    'Missing' (null count), 'Available' (non-null count), 'Total' (their sum)
    and 'Percent' (nulls as a percentage of all rows, rounded to 4 decimals).
    Rows are ordered by 'Missing', highest first.
    """
    null_mask = data.isnull()
    n_missing = null_mask.sum()
    n_present = data.count()
    # null_mask holds no NaN itself, so its count() is the row count per column
    pct_missing = (n_missing / null_mask.count() * 100).round(4)

    parts = [n_missing, n_present, n_missing + n_present, pct_missing]
    labels = ['Missing', 'Available', 'Total', 'Percent']
    summary = pd.concat(parts, axis=1, keys=labels)
    return summary.sort_values(by='Missing', ascending=False)

In [None]:
missing_data(application)

In [None]:
missing_data(credit)

**Observation:-**

* Duplicate entries are observed with the same client number but a different gender and other details. This is unlikely to be genuine and is probably a re-entry with corrected details.
* Hence the earlier entries are deleted and only the latest entry is kept for each client.

In [None]:
# Remove repeated client IDs, keeping only the last-listed row for each ID
is_superseded = application.duplicated(subset=['ID'], keep='last')
application_clean = application[~is_superseded]
print("Original Data= ",application.shape)
print("Cleaned Data = ",application_clean.shape)

In [None]:
credit['STATUS'].value_counts()

In [None]:
def percent(column,decimals=2):
    """Format fractional values as percentage strings (0.5 -> '50.0%').

    ``column`` is multiplied by 100, rounded to ``decimals`` places,
    converted to text and suffixed with '%'. ``decimals`` must not be negative.
    """
    assert decimals >= 0
    scaled = round(column * 100, decimals)
    as_text = scaled.astype(str)
    return as_text + "%"

In [None]:
# Tenure of each entry is MONTHS_BALANCE with its sign flipped
credit['Tenure'] = -credit['MONTHS_BALANCE']
credit.shape

In [None]:
credit = credit.sort_values(by=['ID','Tenure'],ascending=True)
credit.head(10)

In [None]:
# Collapse the monthly history to one row per client: the maximum MONTHS_BALANCE,
# the maximum Tenure and the worst (highest) STATUS level.
# STATUS is text ('X', 'C', '0'-'5'). A plain max compares strings, where 'X' and 'C'
# sort above every digit, so any client with a single 'X' or 'C' month would lose
# their overdue months. 'X' and 'C' are mapped to level 0 before aggregating,
# matching how they are treated when the target is built.
status_level = credit['STATUS'].replace({'X': '0', 'C': '0'}).astype(int)
credit_new = (
    credit.assign(STATUS=status_level)
          .groupby('ID')[['MONTHS_BALANCE', 'Tenure', 'STATUS']]
          .max()
          .reset_index()
)
credit_new.head()

In [None]:
# merge datasets application details with credit approval record
# Both frames hold one row per ID at this point; validate= makes pandas raise
# instead of silently multiplying rows if that ever stops being true.
df = pd.merge(application_clean, credit_new, how='inner', on=['ID'], validate='one_to_one')
df.head()

**Data Dictionary:-**
ID: Unique Id of the row in application record.
MONTHS_BALANCE: The number of months from record time.
STATUS: Credit status for this month.
X: No loan for the month
C: paid off that month
0: 1-29 days past due
1: 30-59 days past due
2: 60-89 days overdue
3: 90-119 days overdue
4: 120-149 days overdue
5: Overdue or bad debts, write-offs for more than 150 days

In [None]:
# replacing X & C by 0 in credit status
# The result is assigned back to the column: calling replace(..., inplace=True) on
# df['STATUS'] acts on a temporary object under pandas Copy-on-Write and would
# leave df unchanged.
df['STATUS'] = df['STATUS'].replace({'X': '0', 'C': '0'}).astype('int')

percent(df['STATUS'].value_counts(normalize=True, sort=False),decimals=2)

**Reasons for Credit Card Rejection/Disapproval**
The reasons for credit card rejection in India listed below are based on the rejection experiences that applicants have faced; these reasons have been found to be the most common across all major banks in the country.

* Low credit score
* Unstable employment
* Insufficient income
* Working in a delisted private company
* Not falling into the age limit
* Living in an address which is present in the defaulter list

**Reference:-**
[creditmantri](https://www.creditmantri.com/credit-card-rejection/)
[Balance](https://www.thebalance.com/denied-credit-card-application-960247)
[Late Payment](https://www.thebalance.com/when-does-a-late-payment-go-on-my-credit-report-960434)

Based on the above reference information, applicants whose past record shows a credit payment delayed by more than 59 days (status 2 or higher) will have their applications rejected.

In [None]:
# Binary target from credit status: 1 for level 2 or higher (60+ days overdue), else 0
df['STATUS'] = (df['STATUS'] >= 2).astype('int64')
percent(df['STATUS'].value_counts(normalize=True, sort=False),decimals=2)

In [None]:
# drop months_balance column as it is redundant
df = df.drop(['MONTHS_BALANCE'], axis=1)
df.head()

In [None]:
# Replace the raw dataset headers with shorter, readable column names
df = df.rename(columns={
    'ID': 'User_id',
    'CODE_GENDER': 'Gender',
    'FLAG_OWN_CAR': 'Car',
    'FLAG_OWN_REALTY': 'Realty_owned',
    'CNT_CHILDREN': 'Children_count',
    'AMT_INCOME_TOTAL': 'Income_amount',
    'NAME_INCOME_TYPE': 'Income_type',
    'NAME_EDUCATION_TYPE': 'Education',
    'NAME_FAMILY_STATUS': 'Family_status',
    'NAME_HOUSING_TYPE': 'Housing_type',
    'DAYS_BIRTH': 'Days_birth',
    'DAYS_EMPLOYED': 'Days_employed',
    'FLAG_MOBIL': 'Mobile',
    'FLAG_WORK_PHONE': 'Work_phone',
    'FLAG_PHONE': 'Phone',
    'FLAG_EMAIL': 'Email',
    'OCCUPATION_TYPE': 'Occupation_type',
    'CNT_FAM_MEMBERS': 'Family_members',
    'Tenure': 'Tenure',
    'STATUS': 'Reject_Status',
})
df.head(2)

In [None]:
# dropping feature "mobile" as all applicants own a mobile phone and hence the column "Mobile" seem to be of no importance. Hence it can be dropped
df = df.drop(['Mobile'], axis=1)
df.shape

In [None]:
# Fill missing values of feature occupation type with unknown
# The result is assigned back to the column: fillna(..., inplace=True) on
# df["Occupation_type"] acts on a temporary object under pandas Copy-on-Write
# and would leave the missing values in df.
df["Occupation_type"] = df["Occupation_type"].fillna("Unknown")
df["Occupation_type"].value_counts()

In [None]:
def to_category(df, max_unique_ratio=0.05):
    """Convert low-cardinality object columns of ``df`` to the 'category' dtype.

    A column is converted when (distinct non-null values) / (number of rows)
    is below ``max_unique_ratio``. ``df`` is modified in place and also
    returned, so the function can be used with ``DataFrame.pipe``.

    A frame with zero rows is returned unchanged, because the ratio is
    undefined there (the previous version raised ZeroDivisionError).
    """
    n_rows = len(df)
    if n_rows == 0:
        return df
    for col in df.select_dtypes(include='object').columns:
        # nunique() ignores NaN, like the length of value_counts()
        if df[col].nunique() / n_rows < max_unique_ratio:
            df[col] = df[col].astype('category')
    return df

In [None]:
#df = (df.pipe(to_category))
df.dtypes

In [None]:
# Analyzing the dataset using pandas profiling library
# Builds a ProfileReport over the merged frame `df` with the given title.
# NOTE(review): html={'style': {'full_width': True}} presumably stretches the report
# to the full page width - confirm against the pandas_profiling documentation.
application_profile = ProfileReport(df, 
                        title='Pandas Profiling Report for Application Data', 
                        html={'style':{'full_width':True}}) 

# Render the report as notebook widgets
application_profile.to_widgets()

In [None]:
# Analyzing the dataset using sweetviz library
# sv.analyze is run on the merged frame `df`; the result is kept in advert_report.
advert_report = sv.analyze(df)

# Display the report
# NOTE(review): show_html presumably writes the report to 'CreditCard.html' in the
# working directory before showing it - confirm against the sweetviz documentation.
advert_report.show_html('CreditCard.html')

**Observation**
* All applicants in the merged dataset own a mobile phone, so the column "Mobile" carries no information and can be dropped
* The majority of the applicants are female
* The occupation type column has 30% missing values and hence can either be dropped or imputed (missing values as unknown)
* Most of the categorical columns/features seem to be binary in nature
* Major class imbalance is observed. Only 0.26% of applicants belong to the rejected category
* Income and number of family members seem to have extreme values/data points (possibly outliers)
* Outliers in income are found in the approved class rather than the rejected class. Hence removing extreme values might not affect the model's performance in detecting the reject cases

In [None]:
df.describe()

In [None]:
# Outlier treatment - IQR Method
def remove_outlier(col, factor=1.5):
    """Return the (lower, upper) IQR fences of a numeric Series.

    Despite the name, nothing is removed here: the caller filters rows with
    the returned bounds.

    Parameters
    ----------
    col : pandas.Series
        Numeric values to compute the quartiles from.
    factor : float, default 1.5
        Multiple of the inter-quartile range placed below Q1 and above Q3.
        This used to be read from a global ``x`` that was only assigned after
        the function was defined; it is now an explicit argument whose default
        matches the value used throughout the notebook.

    Returns
    -------
    tuple
        ``(Q1 - factor * IQR, Q3 + factor * IQR)``
    """
    q25, q75 = col.quantile(0.25), col.quantile(0.75)
    cutoff = (q75 - q25) * factor
    return q25 - cutoff, q75 + cutoff

# Keep only rows whose income lies strictly inside the 1.5 * IQR fences,
# then plot income by gender, split by reject status
x = 1.5
lower_1, upper_1 = remove_outlier(df['Income_amount'])
df_IQR1 = df[df['Income_amount'].gt(lower_1) & df['Income_amount'].lt(upper_1)]
plt.figure(figsize=(15, 8))
ax1 = sns.boxplot(data=df_IQR1, x="Gender", y="Income_amount", hue="Reject_Status")

In [None]:
# Keep only rows whose children count lies strictly inside the 1.5 * IQR fences,
# then plot income by family size, split by reject status
x = 1.5
lower_2, upper_2 = remove_outlier(df_IQR1['Children_count'])
df_IQR1N = df_IQR1[df_IQR1['Children_count'].gt(lower_2) & df_IQR1['Children_count'].lt(upper_2)]
plt.figure(figsize=(15, 8))
ax4 = sns.boxplot(data=df_IQR1N, x="Family_members", y="Income_amount",
                  hue="Reject_Status", showfliers=True)

In [None]:
# Rows discarded by the two IQR filters, as a count and as a percentage of df
a = len(df) - len(df_IQR1N)
print("Outlier Count = ", a, "\nOutlier % = ", round(a / len(df) * 100, 2))

In [None]:
df_IQR1N.head(3)

In [None]:
df_lab = df_IQR1N.copy()
df_woe = df_IQR1N.copy()

In [None]:
# Convert every non-numeric (object-dtype) column to integer codes with label encoding.
# LabelEncoder is already imported in the imports cell at the top of the notebook.
le = LabelEncoder()

object_cols = [col for col in df_lab.columns if df_lab[col].dtype == "object"]
for col in object_cols:
    # fit_transform refits the encoder, so each column gets its own code mapping
    df_lab[col] = le.fit_transform(df_lab[col])

In [None]:
df_lab.head(2)

In [None]:
# split into input and output elements
# Columns are selected by name rather than by position, so the identifier and the
# target stay out of the features even if the column order of df_lab changes.
x_1 = df_lab.drop(columns=['User_id', 'Reject_Status'])
y_1 = df_lab['Reject_Status']

In [None]:
x_1.head(2)

In [None]:
# Test Train Split for category encoding
# stratify=y_1 keeps the share of the rare rejected class the same in both parts;
# a purely random split can leave very few positives in one of them.
x_1_train, x_1_test, y_1_train, y_1_test = train_test_split(
    x_1, y_1, test_size=0.3, random_state=1, stratify=y_1
)

In [None]:
# Feature scaling using RobustScaler
# RobustScaler centres on the median and scales by quantile range, so it is not
# dominated by outliers.
# The scaler is fitted on the training data only and then applied to both parts:
# fitting a second scaler on the test set would scale train and test differently
# and leak test-set statistics into the evaluation.
robust_scaler = RobustScaler().fit(x_1_train)
x_1_train_rs = pd.DataFrame(robust_scaler.transform(x_1_train), columns=x_1_train.columns)
x_1_test_rs = pd.DataFrame(robust_scaler.transform(x_1_test), columns=x_1_test.columns)

In [None]:
# Handle class imbalance using SMOTE oversampling (Synthetic Minority Oversampling Technique)
# A fixed random_state makes the synthetic samples identical on every run.
oversample = SMOTE(random_state=1)

# Share of each target class before oversampling (0.0 if a class is absent)
class_share = df_lab['Reject_Status'].value_counts(normalize=True)
a_target_0 = class_share.get(0, 0.0)
a_target_1 = class_share.get(1, 0.0)

print(round(a_target_0,4))
print(round(a_target_1,4))

In [None]:
#Apply to train data
x_1_train_rs_over, y_1_train_over = oversample.fit_resample(x_1_train_rs, y_1_train)
print(Counter(y_1_train_over))

In [None]:
# Keep the test set at its real class distribution.
# SMOTE belongs on the training data only: oversampling the test set would score
# the models on synthetic rows and overstate their performance.
# The *_over names are kept because the evaluation cell refers to them.
x_1_test_rs_over, y_1_test_over = x_1_test_rs, y_1_test
print(Counter(y_1_test_over))

In [None]:
x_1_train_rs_over.head()

In [None]:
# Model Building
# Estimators with internal randomness get a fixed random_state so that
# re-running the notebook reproduces the same scores.
classifiers = {
    "LogisticRegression" : LogisticRegression(),
    "KNeighbors" : KNeighborsClassifier(),
    "SVC" : SVC(),
    "DecisionTree" : DecisionTreeClassifier(random_state=1),
    "RandomForest" : RandomForestClassifier(random_state=1),
    "XGBoost" : XGBClassifier(random_state=1)
}

In [None]:
classifiers

In [None]:
# Fit every candidate model and score it with balanced accuracy (the mean of the
# per-class recalls). Plain accuracy is dominated by the majority class when the
# classes are imbalanced; on equally sized classes the two metrics coincide.
train_scores = []
test_scores = []

for key, classifier in classifiers.items():
    classifier.fit(x_1_train_rs_over, y_1_train_over)
    train_pred = classifier.predict(x_1_train_rs_over)
    test_pred = classifier.predict(x_1_test_rs_over)
    train_scores.append(round(balanced_accuracy_score(y_1_train_over, train_pred), 2))
    test_scores.append(round(balanced_accuracy_score(y_1_test_over, test_pred), 2))

# Show the scores next to the model names instead of as two anonymous lists
pd.DataFrame({'train': train_scores, 'test': test_scores}, index=list(classifiers))

In [None]:
xgb = XGBClassifier()

In [None]:
xgb.fit(x_1_train_rs_over, y_1_train_over)

* The XGBoost classifier gives better results than the other models, as seen from the train/test scores above
* Hyperparameter tuning can be attempted by executing the cross-validation code below for the XGBoost classifier
* WOE and target encoding did not yield any better results compared with one hot encoding

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate XGBoost hyperparameter values for the grid search below.
# The full grid is 6 * 3 * 4 * 4 * 3 * 5 = 4320 combinations.
params = {
    'learning_rate': [0.05,0.1,0.15,0.2,0.25,0.3],
    'max_depth':[5,10,15],
    'min_child_weight':[6,8,10,12],
    'subsample': [0.6,0.7,0.8,0.9], 
    'colsample_bytree':[0.6,0.7,0.8],
    'gamma':[i/10.0 for i in range(0,5)]  # 0.0, 0.1, 0.2, 0.3, 0.4
 
}

In [None]:
'''# Grid Search
gsearch = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=354), 
 param_grid = params, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

#Fit
gsearch.fit(x_1_train_rs_over, y_1_train_over)'''

In [None]:
#gsearch.best_params_, gsearch.best_score_

In [None]:
#