## Credit Card Default Prediction

In [None]:
### importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier 
from sklearn.linear_model import LogisticRegression
from collections import Counter
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_selection import SelectKBest, f_classif
from xgboost import XGBClassifier
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
sns.set()

### Importing the dataset
1. The dataset is a CSV file, so we will use pandas `read_csv` to load the data.
2. Checking the complete information.
3. We will check the null values and deal with it accordingly.
4. Describe the data.


In [None]:
# Do not truncate columns when a frame is displayed.
pd.set_option('display.max_columns', None)

# Load the raw credit-card data from the working directory and preview it.
data = pd.read_csv('UCI_Credit_Card.csv')
data.head()

In [None]:
# Summary of column dtypes and non-null counts.
data.info()

### There are no null values in our dataset, and every column is numeric in nature.

In [None]:
# Number of distinct values per column.
data.nunique()

### Here we can see that the MARRIAGE and EDUCATION columns contain extra unique values compared with the dataset description given at this link.
### https://www.kaggle.com/uciml/default-of-credit-card-clients-dataset
### So I will rectify this.

In [None]:
# Class balance of the target column.
data['default.payment.next.month'].value_counts()

### This is an imbalanced dataset; we will deal with this later.

In [None]:
# Distinct EDUCATION codes present in the raw data.
data['EDUCATION'].unique()

### I will convert the values 6 and 0 to 5, as these represent "unknown".

In [None]:
# Fold the EDUCATION codes 0 and 6 into 5 ("unknown").
# The result is assigned back instead of calling replace(..., inplace=True) on
# the column selection: that form is chained assignment, which pandas warns
# about and which does not modify `data` when Copy-on-Write is enabled.
data['EDUCATION'] = data['EDUCATION'].replace({6: 5, 0: 5})

In [None]:
# Distinct MARRIAGE codes present in the raw data.
data['MARRIAGE'].unique()

### I will convert 0 to 3, as it represents others.

In [None]:
# Fold the MARRIAGE code 0 into 3 ("others").
# Assigned back rather than replaced in place on the column selection, which
# is chained assignment and is a no-op on `data` under pandas Copy-on-Write.
data['MARRIAGE'] = data['MARRIAGE'].replace({0: 3})

## EDA
### 1. Dropping column ID.
### 2. Checking the relationships between variables through a heatmap and a pair plot.
### 3. Checking the distributions of some columns.
### 4. Creating a new DataFrame with the columns Sex, Education, Marriage, Age and Default, and converting the coded columns into categorical variables for visualization.

In [None]:
# Drop the row identifier; it carries no predictive information.
# errors='ignore' keeps the cell idempotent: re-running it after ID is already
# gone no longer raises a KeyError.
data = data.drop(columns=['ID'], errors='ignore')

In [None]:
# Preview the frame after the ID column has been removed.
data.head()

In [None]:
# Columns shown in the pair plot (a subset keeps the grid readable).
pairplot_columns = [
    'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
    'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4',
    'default.payment.next.month',
]
# Plain column selection raises a KeyError on a missing column, whereas
# pd.DataFrame(data, columns=...) would silently create an all-NaN column.
relationDataFrame = data[pairplot_columns]
# Plot a 3000-row sample; the fixed seed makes the figure reproducible.
sns.pairplot(relationDataFrame.sample(3000, random_state=42))

### We can see that none of the columns has an exact linear relationship with another.

In [None]:
# Pairwise correlations of all columns, annotated to two decimals.
corr_matrix = data.corr()
fig, ax = plt.subplots(figsize=(17, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', ax=ax)

### We can see that the different bill amount columns (BILL_AMT*) have the highest correlations with each other.

In [None]:
## Give the target column a shorter name
target_rename = {'default.payment.next.month': 'def_pay'}
data.rename(columns=target_rename, inplace=True)

In [None]:
## Checking distribution of variables
dnum = ['AGE', 'LIMIT_BAL', 'PAY_0', 'PAY_2', 'BILL_AMT1', 'BILL_AMT2', 'def_pay']

plt.figure(figsize=(17,10))
for i, column in enumerate(dnum, 1):
    ax = plt.subplot(2, 4, i)
    # sns.distplot is deprecated and removed in newer seaborn releases;
    # histplot with a KDE overlay on a density scale is its replacement.
    sns.histplot(data[column], kde=True, stat='density', ax=ax)
    ax.set_xlabel(column)

### We can see that none of the columns follow normal distributions.

In [None]:
# Independent copy of the demographic columns plus the target, used only for plots.
demographic_columns = ['SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'def_pay']
X = data[demographic_columns].copy()
X.head()

In [None]:
# Treat the coded columns as categorical (object dtype) rather than numeric.
for coded_column in ('SEX', 'EDUCATION', 'MARRIAGE'):
    X[coded_column] = X[coded_column].astype('object')

In [None]:
# Confirm the dtype change on the coded columns.
X.info()

In [None]:
# Human-readable labels for the integer codes.
sex = {1: 'MALE', 2: 'FEMALE'}
ed = {1: 'graduate school', 2: 'university', 3: 'high school', 4: 'others', 5: 'unknown'}
mar = {1: 'married', 2: 'single', 3: 'others'}

# Assign the mapped columns back instead of replace(..., inplace=True) on a
# column selection; the in-place form is chained assignment and leaves X
# unchanged under pandas Copy-on-Write.
X['SEX'] = X['SEX'].replace(sex)
X['EDUCATION'] = X['EDUCATION'].replace(ed)
X['MARRIAGE'] = X['MARRIAGE'].replace(mar)
X.head()

In [None]:
# AGE aggregated per value of def_pay.
sns.barplot(data=X, x='def_pay', y='AGE')

#### We can see that average age of defaulters is nearly equal to non - defaulters.

In [None]:
# AGE per EDUCATION level, split by def_pay.
fig, ax = plt.subplots(figsize=(7, 7))
sns.barplot(data=X, x='EDUCATION', y='AGE', hue='def_pay', ax=ax)
ax.legend(loc='upper left')

### We can observe that for both defaulters, and non - defaulters on a whole, the average age is less where education is others, and it is the highest where education is high school.

In [None]:
## Boxen plot of AGE per MARRIAGE category, split by def_pay
sns.catplot(
    data=X,
    kind='boxen',
    x='MARRIAGE',
    y='AGE',
    hue='def_pay',
    height=6,
    aspect=3,
)

### We can observe that, median of age, for married people and others for both defaulters, and non - defaulters is nearly same.

### As our dataset is imbalanced, we will use different Performance metrics. 
- In this I will use performance metric like Confusion Matrix and classification report, to conclude my findings. 

In [None]:
### Separating the target column from the feature columns
# NOTE: the variable names are inverted relative to the usual convention:
# `dep` holds the predictor (independent) columns and `indep` holds the target.
# They are kept as-is because later cells refer to them.
target_column = 'def_pay'
indep = data[target_column]
dep = data.drop(columns=[target_column])

In [None]:
# Preview of the feature columns (everything except def_pay).
dep.head()

In [None]:
# Preview of the target column (def_pay).
indep.head()

In [None]:
## 70/30 train/test split, stratified on the target so both parts keep the class ratio
X_train, X_test, Y_train, Y_test = train_test_split(
    dep,
    indep,
    test_size=0.3,
    stratify=indep,
    random_state=42,
)

In [None]:
# Preview of the training features.
X_train.head()

### Feature Selection
- ANOVA is a good measure, which is used when input variables are numerical and output variable is categorical in nature.

In [None]:
## Keep the 15 features with the highest ANOVA F-score
# The selector is fitted on the training split only, then applied to it.
fs = SelectKBest(score_func=f_classif, k=15)
fs.fit(X_train, Y_train)
X_train_new = fs.transform(X_train)

In [None]:
# Selected training features (array returned by the selector).
X_train_new

In [None]:
## Print the F-score the selector computed for every input column
for feature, score in zip(X_train.columns, fs.scores_):
    print('Feature %s: %f' % (feature, score))

### The features I am considering are LIMIT_BAL, SEX, PAY_0, PAY_2, PAY_3, PAY_4, PAY_5, PAY_6, BILL_AMT1, PAY_AMT1, PAY_AMT2, PAY_AMT3, PAY_AMT4, PAY_AMT5, and PAY_AMT6 for building our model.

In [None]:
# Apply the selector fitted on the training split to the test split (no refit).
X_test_new = fs.transform(X_test)

In [None]:
###  Scaling the data
# Fit the scaler on the selected (unscaled) training features. The input is
# recomputed from X_train instead of reusing X_train_new, so re-running this
# cell does not refit `sc` on data that has already been standardized. That
# would turn the scaler that is pickled later into a near-identity transform.
sc = StandardScaler()
X_train_new = sc.fit_transform(fs.transform(X_train))


In [None]:
# Training features after standardization.
X_train_new

In [None]:
# Scale the test features with the scaler fitted on the training split.
# The input is recomputed from X_test so the cell is idempotent; the original
# transformed X_test_new in place, so a second run scaled the test set twice.
X_test_new = sc.transform(fs.transform(X_test))
X_test_new.shape

In [None]:
# Persist the fitted scaler. The context manager closes the file even if
# pickling raises.
# NOTE(review): the fitted feature selector `fs` is not saved here; confirm
# that whatever loads SC.pkl selects the same 15 columns before scaling.
with open('SC.pkl', 'wb') as file:
    pickle.dump(sc, file)

### Fitting the model 
- Here I am comparing two models, namely Logistic Regression and Random Forest.
- Moreover, I am also applying Hyperparameter Tuning to my models with the help of pipelines.

In [None]:
# Single-step pipeline; the grid below swaps the 'classifier' step.
# The placeholder must be an estimator *instance*: the original passed the
# LogisticRegression class itself, which is not a valid pipeline step.
pipe = Pipeline([('classifier', LogisticRegression())])

# Two sub-grids, one per candidate model. random_state is fixed so repeated
# runs select the same model.
grid_param = [
    {
        'classifier': [LogisticRegression(random_state=42)],
        'classifier__penalty': ['l2'],
        'classifier__C': np.logspace(0, 4, 10),
        'classifier__solver': ['newton-cg', 'saga', 'sag', 'liblinear'],
    },
    {
        'classifier': [RandomForestClassifier(random_state=42)],
        'classifier__n_estimators': [10, 100, 1000],
        'classifier__max_depth': [5, 8, 15, 25, 30, None],
        'classifier__min_samples_leaf': [1, 2, 5, 10, 15, 100],
        'classifier__max_leaf_nodes': [2, 5, 10],
    },
]

# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# classifier's default score (accuracy). The notebook's stated goal is recall
# on class 1, so consider scoring='recall' or 'f1'.
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=1, n_jobs=-1)
best_model = gridsearch.fit(X_train_new, Y_train)
 

In [None]:
# Parameter combination with the best mean cross-validation score.
best_model.best_params_

### We can see that Random Forest Classifier is chosen as the best model with the specified values of parameters as given value. 

In [None]:
# Predict on the held-out test features with the refitted best estimator.
rf_pred = best_model.predict(X_test_new)
rf_pred

In [None]:
# Confusion matrix of the first grid-search model on the test set.
cm_rf = confusion_matrix(Y_test, rf_pred)
fig, ax = plt.subplots(figsize=(8, 8))
# The cells are integer counts, so annotate them as integers ('d'), not '.3f'.
sns.heatmap(cm_rf, annot=True, fmt='d', square=True, ax=ax)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')

### From this we can see the true positives values of different classes. To understand better, let's see classification report.

In [None]:
## Per-class precision, recall and F1 for the first model
rf_report = classification_report(Y_test, rf_pred)
print(rf_report)

### From this we can observe that the accuracy of our model is 81%, which is good. Looking deeper, the precision and recall values for class 0 are very good, but we can't say the same for class 1. My main aim in this project is to correctly classify class 1 values, which means I want to focus more on the recall value for class 1. Moreover, the F1-score for class 1 is very low, so let's see if we can increase this too.

In [None]:
## We can see that the above model works good for class 0, but not that good for class 1. so we are trying to implement an another algorithm called XGBoost.
# The row-subsampling parameter of XGBoost is spelled `subsample`. The
# original key `sub_sample` is not a recognised parameter, so the grid was
# fitting the same model twice for each of its two values.
grid_param2 = [
    {
        'classifier': [XGBClassifier()],
        'classifier__learning_rate': [0.01, 0.1],
        'classifier__max_depth': [3, 5, 7, 10],
        'classifier__subsample': [0.5, 0.7],
    }
]
# Reuses the single-step `pipe` defined earlier; the grid replaces its step.
gridsearch = GridSearchCV(pipe, grid_param2, cv=5, verbose=1, n_jobs=-1)
model2 = gridsearch.fit(X_train_new, Y_train)
 


In [None]:
# Best XGBoost parameter combination found by the grid search.
model2.best_params_

### From the above snippet, we can see that the Xgboost model works best with above parameter values. 
### Let's see its confusion matrix and classification report. 

In [None]:
# Test-set predictions and confusion matrix for the XGBoost grid-search model.
xg_pred = model2.predict(X_test_new)
cm_xg = confusion_matrix(Y_test, xg_pred)
fig, ax = plt.subplots(figsize=(8, 8))
# The cells are integer counts, so annotate them as integers ('d'), not '.3f'.
sns.heatmap(cm_xg, annot=True, fmt='d', square=True, ax=ax)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')

In [None]:
## Per-class precision, recall and F1 for the XGBoost model
xg_report = classification_report(Y_test, xg_pred)
print(xg_report)

### We can observe that, this model works a little better than the Random Forest model as both recall and f1-score value of our model has increased. 
### Now I am trying to increase the recall value and f1-score much more. Hence, I am using a technique called Random Over Sampling to create more data points of class 1 in my training set. The ratio I am choosing is 0.75.

In [None]:
## To improve it, we will do random oversampling and then compare different models
# sampling_strategy is passed by keyword, because current imbalanced-learn
# rejects positional arguments. 0.75 is the target minority/majority ratio
# after resampling. random_state fixes which minority rows are duplicated.
# The variable is no longer called `os`, which reads like the stdlib module.
oversampler = RandomOverSampler(sampling_strategy=0.75, random_state=42)
X_train_ns, Y_train_ns = oversampler.fit_resample(X_train_new, Y_train)
print("The number of classes before fit {}".format(Counter(Y_train)))
print("The number of classes after fit {}".format(Counter(Y_train_ns)))


In [None]:
## First implementing RandomForest
# The pipeline step must be an estimator *instance*; the original passed the
# RandomForestClassifier class itself.
pipe2 = Pipeline([('classifier', RandomForestClassifier())])

# random_state is fixed so the search, and the model pickled from it, is
# reproducible.
grid_param3 = [
    {
        'classifier': [RandomForestClassifier(random_state=42)],
        'classifier__n_estimators': [10, 100, 1000],
        'classifier__max_depth': [5, 8, 15, 25],
        'classifier__min_samples_leaf': [1, 2, 5, 10, 15],
        'classifier__max_leaf_nodes': [2, 5, 10],
    }
]

# NOTE(review): X_train_ns was oversampled before cross-validation, so
# duplicated minority rows can land in both the training and validation part
# of a fold, which makes the CV scores optimistic. Putting the sampler inside
# an imblearn Pipeline would resample each training fold only.
gridsearch = GridSearchCV(pipe2, grid_param3, cv=5, verbose=1, n_jobs=-1)
model3 = gridsearch.fit(X_train_ns, Y_train_ns)

In [None]:
# Best Random Forest parameters on the oversampled training data.
model3.best_params_

In [None]:
# Test-set predictions and confusion matrix for the Random Forest trained on
# oversampled data. The test set itself is not resampled.
rf_pred2 = model3.predict(X_test_new)
cm_rf2 = confusion_matrix(Y_test, rf_pred2)
fig, ax = plt.subplots(figsize=(8, 8))
# The cells are integer counts, so annotate them as integers ('d'), not '.3f'.
sns.heatmap(cm_rf2, annot=True, fmt='d', square=True, ax=ax)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')

In [None]:
## Per-class precision, recall and F1 for the oversampled Random Forest
rf_report2 = classification_report(Y_test, rf_pred2)
print(rf_report2)

### After applying the sampling technique, we can see that the Random Forest model works much better, and the recall and F1-score values for class 1 have also increased. Now I will apply XGBoost to see if I can increase these values further.

In [None]:
# XGBoost grid on the oversampled training data.
# The row-subsampling parameter of XGBoost is spelled `subsample`. The
# original key `sub_sample` is not a recognised parameter, so the grid was
# fitting the same model twice for each of its two values.
grid_param4 = [
    {
        'classifier': [XGBClassifier()],
        'classifier__learning_rate': [0.01, 0.1],
        'classifier__max_depth': [3, 5, 7, 10],
        'classifier__subsample': [0.5, 0.7],
    }
]
# Reuses the single-step `pipe` defined earlier; the grid replaces its step.
gridsearch = GridSearchCV(pipe, grid_param4, cv=5, verbose=1, n_jobs=-1)
model4 = gridsearch.fit(X_train_ns, Y_train_ns)

In [None]:
# Best XGBoost parameters on the oversampled training data.
model4.best_params_

In [None]:
# Test-set predictions and confusion matrix for XGBoost trained on oversampled data.
xg_pred2 = model4.predict(X_test_new)
cm_xg2 = confusion_matrix(Y_test, xg_pred2)
fig, ax = plt.subplots(figsize=(8, 8))
# The cells are integer counts, so annotate them as integers ('d'), not '.3f'.
sns.heatmap(cm_xg2, annot=True, fmt='d', square=True, ax=ax)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')

In [None]:
## Per-class precision, recall and F1 for the oversampled XGBoost model
xg_report2 = classification_report(Y_test, xg_pred2)
print(xg_report2)

### From the above figures, we can see that the XGBoost model is not performing better than the previous Random Forest model.
### After comparing all the models I have created, I found that the Random Forest built after the sampling technique performs much better than the rest. So, I will use that for deployment.

In [None]:
### Pickling model3 for deployment
# The context manager closes, and therefore flushes, the file. The original
# never closed the handle, so the pickle on disk could be incomplete.
with open('model3.pkl', 'wb') as file:
    pickle.dump(model3, file)