# This dataset classifies people described by a set of attributes as good or bad credit risks.

## Features
default : target variable (good or bad credit risk)
duration_in_month : loan duration in months
purpose : purpose of the loan
credit_amount : loan amount
installment_as_income_perc : EMI (installment) as a percentage of income
Gender : male or female
age : age of the borrower
people_under_maintenance : number of dependents


In [1]:
## import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns

In [2]:
# set the working directory
# raw string: "\R" is not a valid escape sequence, so the plain literal only
# works by accident and triggers an invalid-escape warning on current Python
os.chdir(r"D:\Reg")

In [None]:
# read the csv data set into a DataFrame
# (the relative path is resolved against the current working directory)
data = pd.read_csv('Credit_data.csv')

In [None]:
# preview the first five rows of the data set
data.head(n=5)

In [None]:
# look at the structure of the data: column names, non-null counts and dtypes
data.info()

In [None]:
# count the missing values in every column
data.isna().sum()

In [None]:
# look at the summary of the data
# include='all' also summarises the non-numeric columns
data.describe(include='all')

In [None]:
# class balance of the dependent variable: bar chart first, then percentages
sns.countplot(data=data, x='default')
plt.show()
print(100 * data['default'].value_counts(normalize=True))

In [None]:
# keep only the numeric predictors
numeric_data = data[
    [
        'duration_in_month',
        'credit_amount',
        'installment_as_income_perc',
        'present_res_since',
        'age',
        'people_under_maintenance',
    ]
]

In [None]:
# keep only the categorical columns (including the target)
cat_data = data[
    [
        'default',
        'purpose',
        'Gender',
        'job',
    ]
]

## EDA


In [None]:
def scatters(data, h=None):
    """Draw three stacked scatter plots of the loan columns.

    Panels, top to bottom: credit_amount vs duration_in_month,
    age vs credit_amount, age vs duration_in_month.

    data: frame holding the plotted columns.
    h: optional column name used to colour the points.
    """
    _, panels = plt.subplots(3, 1, figsize=(8, 8))
    column_pairs = (
        ("credit_amount", "duration_in_month"),
        ("age", "credit_amount"),
        ("age", "duration_in_month"),
    )
    for panel, (x_col, y_col) in zip(panels, column_pairs):
        sns.scatterplot(x=x_col, y=y_col, hue=h, data=data, ax=panel)
    plt.tight_layout()
scatters(data, h='Gender')

## Granted credits per purpose

In [None]:
# number of loans per purpose, largest first, as a two-column frame
n_credits = (
    data["purpose"]
    .value_counts()
    .rename("Count")
    .reset_index()
    .sort_values(by=["Count"], ascending=False)
    .rename(columns={'index': 'purpose'})
)
plt.figure(figsize=(10, 6))
bar = sns.barplot(data=n_credits, x="purpose", y="Count")
# tilt the category labels so they stay readable
bar.set_xticklabels(bar.get_xticklabels(), rotation=60)
plt.ylabel("Number of granted credits")
plt.tight_layout()

## Spread of credit amount per purpose

In [None]:
def boxes(x, y, r=45):
    """Box plot of column ``y`` for each category of column ``x``.

    Reads the module-level ``data`` frame.

    x: column shown on the x axis.
    y: column whose spread is plotted.
    r: rotation of the x tick labels in degrees.
    """
    fig, ax = plt.subplots(figsize=(13, 6))
    sns.boxplot(data=data, x=x, y=y, ax=ax)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=r)
    fig.subplots_adjust(bottom=0.2)
    plt.tight_layout()
boxes("purpose", "credit_amount")

## Spread of credit amount per purpose by gender

In [None]:
def boxes(x, y, h=None, r=45):
    """Box plot of column ``y`` per category of ``x``, optionally split by ``h``.

    Reads the module-level ``data`` frame.

    x: column shown on the x axis.
    y: column whose spread is plotted.
    h: optional column used as hue. It defaults to None because this
       definition replaces the earlier two-argument ``boxes``; the default
       keeps ``boxes(x, y)`` calls working when cells are re-run.
    r: rotation of the x tick labels in degrees.
    """
    fig, ax = plt.subplots(figsize=(13,6))
    box = sns.boxplot(x=x,y=y,hue=h, data=data)
    box.set_xticklabels(box.get_xticklabels(), rotation=r)
    fig.subplots_adjust(bottom=0.2)
    plt.tight_layout()
boxes("purpose","credit_amount",'Gender')

# One hot encoding (dummy coding)

In [None]:
# dummy-code each categorical predictor separately;
# drop_first=True drops the first level, so k categories give k-1 indicator columns
purposedf=pd.get_dummies(data['purpose'],drop_first=True)
Genderdf=pd.get_dummies(data['Gender'],drop_first=True)
jobdf=pd.get_dummies(data['job'],drop_first=True)

In [None]:
# put the three dummy-coded frames side by side, matching rows on the index
df = purposedf.merge(Genderdf, left_index=True, right_index=True)
df1 = df.merge(jobdf, left_index=True, right_index=True)
df1.head()

In [None]:
# Exclude outliers
# NOTE: this whole step is commented out and does not run.
# If re-enabled it would not drop rows: for each numeric column, values outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are overwritten with the nearest bound (capping).
# Rows 4 and 6 of describe() are the 25% and 75% quantiles.
#IV = numeric_data.describe()
#IV = IV.reset_index(drop=True)
#limIV = len(numeric_data)
#numeric_data=numeric_data.reset_index(drop=True)
#for i in range(0,len(numeric_data.columns)):
#    twenty_five=IV.iloc[4][i]
#    seventy_five=IV.iloc[6][i]
#    iqr=seventy_five-twenty_five
#    max_r=seventy_five+1.5*iqr
#    min_r=twenty_five-1.5*iqr
#    for j in range(0,limIV):
#        if numeric_data.iloc[j,i]>max_r:
#            numeric_data.iloc[j,i]=max_r
#        if numeric_data.iloc[j,i]<min_r:
#            numeric_data.iloc[j,i]=min_r

# Normalize the data

In [None]:
from sklearn import preprocessing

In [None]:
# Scale every numeric feature (column) to the [0, 1] range.
# preprocessing.normalize() works row-wise by default (axis=1): it rescales each
# sample to unit L2 norm, so the largest-magnitude column dominates every row and
# the features are never brought onto a common scale, which distorts k-NN distances.
# NOTE: scores and the hand-picked k values further down were obtained with the
# old row-wise scaling and should be re-checked.
normalized_X = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(numeric_data))

In [None]:
# the scaled array lost its labels; restore the original column names
normalized_X.columns = numeric_data.columns

In [None]:
# build the modelling table: dummy columns + scaled numeric columns + target,
# all aligned on the row index
final_df = df1.merge(normalized_X, left_index=True, right_index=True)
final_df = final_df.merge(data['default'], left_index=True, right_index=True)
final_df.head()

In [None]:
# (rows, columns) of the modelling table
final_df.shape

In [None]:
# separate predictors (X) from the target (Y), then hold out 30% of the rows for testing
from sklearn.model_selection import train_test_split
X = final_df.drop(columns='default').to_numpy()
Y = final_df['default'].to_numpy()
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [1]:
# check the proportion of 1's and 0's in training set
# pass the labels as x=: a positional argument is interpreted as `data`
# by current seaborn and does not produce a per-class count
sns.countplot(x=y_train)
plt.show()
print(pd.Series(y_train).value_counts(normalize=True) * 100)

NameError: name 'sns' is not defined

In [None]:
#  check the proportion of 1's and 0's in test set
# pass the labels as x=: a positional argument is interpreted as `data`
# by current seaborn and does not produce a per-class count
sns.countplot(x=y_test)
plt.show()
print(pd.Series(y_test).value_counts(normalize=True) * 100)

In [None]:
# sweep k = 1..24 and record train/test accuracy for each value
neighbors = np.arange(1, 25)
train_accuracy = np.empty(neighbors.size)
test_accuracy = np.empty(neighbors.size)

for i, k in enumerate(neighbors):
    # fit() returns the estimator itself, so build and train in one step
    knn = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train)

    # accuracy on the data the model was fitted on
    train_accuracy[i] = knn.score(x_train, y_train)

    # accuracy on the held-out data
    test_accuracy[i] = knn.score(x_test, y_test)

In [None]:
# show both accuracy arrays, training first, one per line
print(train_accuracy, test_accuracy, sep="\n")

In [None]:
# accuracy as a function of k, one line per data split
plt.title('k-NN Varying number of neighbors')
for curve, caption in (
    (test_accuracy, 'Testing Accuracy'),
    (train_accuracy, 'Training accuracy'),
):
    plt.plot(neighbors, curve, label=caption)
plt.legend()
plt.xlabel('Number of neighbors')
plt.ylabel('Accuracy')
plt.show()

In [None]:
# fit a 15-neighbour classifier on the training split
knn_15 = KNeighborsClassifier(n_neighbors=15).fit(x_train, y_train)
# for classifiers, score() is the accuracy, here on the test split
knn_15.score(x_test, y_test)

In [None]:
# predictions of the k=15 model for the test and the training split
from sklearn.metrics import confusion_matrix
y_pred15 = knn_15.predict(x_test)
y_pred_train = knn_15.predict(x_train)
# rows = true class, columns = predicted class
confusion_matrix(y_test,y_pred15)

# the same table with labelled axes
pd.crosstab(y_test, y_pred15, rownames=['True'], colnames=['Predicted'])

In [None]:
# per-class precision, recall and f1 of the k=15 model on the test split
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred15))

# Recall is very low because the classes are highly imbalanced
# Perform undersampling: drop records from the majority class

In [None]:
# rows with default == 0, from which a reproducible random sample of 400 is kept
data2 = final_df.loc[final_df['default'] == 0]
data3 = data2.sample(n=400, random_state=1)
# sanity check: shape of the sampled rows with default == 0
data3.loc[data3['default'] == 0].shape

In [None]:
# append all the records of category 1 to undersampled data of category 0
# (pd.concat instead of DataFrame.append, which was removed in pandas 2.0)
data4 = pd.concat([data3, final_df[final_df['default']==1]])
data4.shape

In [None]:
# let's look at the number of rows per class of default after undersampling
data4['default'].value_counts()

In [None]:
# predictors / target of the undersampled data, with 30% of the rows held out for testing
from sklearn.model_selection import train_test_split
X1 = data4.drop(columns='default')
Y1 = data4['default']
x_train1, x_test1, y_train1, y_test1 = train_test_split(
    X1,
    Y1,
    test_size=0.3,
    random_state=42,
)

In [None]:
# sweep k = 1..24 on the undersampled data and record train/test accuracy
neighbors1 = np.arange(1, 25)
train_accuracy1 = np.empty(neighbors1.size)
test_accuracy1 = np.empty(neighbors1.size)

for i, k in enumerate(neighbors1):
    # fit() returns the estimator itself, so build and train in one step
    knn1 = KNeighborsClassifier(n_neighbors=k).fit(x_train1, y_train1)

    # accuracy on the data the model was fitted on
    train_accuracy1[i] = knn1.score(x_train1, y_train1)

    # accuracy on the held-out data
    test_accuracy1[i] = knn1.score(x_test1, y_test1)

In [None]:
# show both accuracy arrays, training first, one per line
print(train_accuracy1, test_accuracy1, sep="\n")

In [None]:
# accuracy as a function of k on the undersampled data, one line per split
plt.title('k-NN Varying number of neighbors')
for curve, caption in (
    (test_accuracy1, 'Testing Accuracy'),
    (train_accuracy1, 'Training accuracy'),
):
    plt.plot(neighbors1, curve, label=caption)
plt.legend()
plt.xlabel('Number of neighbors')
plt.ylabel('Accuracy')
plt.show()

In [None]:
# Set up a k-NN classifier with k = 14: accuracy is almost the same around this
# value and decreases beyond it
knn14 = KNeighborsClassifier(n_neighbors=14)
knn14.fit(x_train1,y_train1)

# NOTE(review): presumably the candidate k values that were tried — confirm
#15,14,12,10

In [None]:
# accuracy of the k=14 model on the undersampled test split
knn14.score(x_test1,y_test1)

In [None]:
# predictions of the k=14 model for the test and the training split
from sklearn.metrics import confusion_matrix
y_pred14 = knn14.predict(x_test1)
y_pred_train14 = knn14.predict(x_train1)
# rows = true class, columns = predicted class
confusion_matrix(y_test1,y_pred14)

# the same table with labelled axes
pd.crosstab(y_test1, y_pred14, rownames=['True'], colnames=['Predicted'])

In [None]:
# per-class precision, recall and f1 of the k=14 model on the test split
from sklearn.metrics import classification_report
print(classification_report(y_test1,y_pred14))

## Parameter Tuning

# Select the best parameters (hyperparameters) for KNN

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# 10 fold cross validation for k = 14
# NOTE(review): X and Y are the arrays built from the full final_df, not the
# undersampled X1/Y1 that the k=14 model above was fitted on — confirm this is intended
knn_cv = KNeighborsClassifier(n_neighbors=14)
scores = cross_val_score(knn_cv,X,Y,cv=10,scoring='accuracy')
print(scores)


In [None]:
# Mean of the 10 fold accuracies, used as the out-of-sample accuracy estimate
print(scores.mean())

## Again search for the best value of K using cross validation

In [None]:
# mean 10-fold cross-validated accuracy for every k from 1 to 34
k_range = range(1, 35)
k_scores = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=k), X, Y, cv=10, scoring='accuracy'
    ).mean()
    for k in k_range
]
print(k_scores)

In [None]:
# check for the max value of k_score (accuracy)
# look the position up instead of discarding argmax() and hard-coding index 11;
# k_range starts at 1, so the best k is k_range[position], not the position itself
best_pos = int(np.argmax(k_scores))
print("best k:", k_range[best_pos])
k_scores[best_pos]

In [None]:
# cross-validated accuracy against k
plt.plot(k_range, k_scores)
plt.ylabel('Cross Validated Accuracy')
plt.xlabel('Range of K')
plt.show()

In [None]:
#Setup a knn classifier with k neighbors
knn_11 = KNeighborsClassifier(n_neighbors=11)
knn_11.fit(x_train1,y_train1)
# score the model fitted above; `knn` is the leftover classifier from the
# earlier k-sweep loop, trained on different data
knn_11.score(x_test1,y_test1)

In [None]:
# predictions of the k=11 model for the test and the training split
from sklearn.metrics import confusion_matrix
y_pred11 = knn_11.predict(x_test1)
y_pred_train11 = knn_11.predict(x_train1)
# rows = true class, columns = predicted class
confusion_matrix(y_test1,y_pred11)

# the same table with labelled axes
pd.crosstab(y_test1, y_pred11, rownames=['True'], colnames=['Predicted'])

In [None]:
# per-class precision, recall and f1 of the k=11 model on the test split
from sklearn.metrics import classification_report
print(classification_report(y_test1,y_pred11))

## More efficient parameter tuning using GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# define the parameter values that should be searched: k = 1 .. 34
k_range_gs = range(1,35)

In [None]:
# parameter grid: estimator argument name -> candidate values
param_grd = {'n_neighbors': k_range_gs}

In [None]:
# grid search over n_neighbors with 10 fold cross validation, scored by accuracy,
# run on the complete undersampled data (X1, Y1)
knn_gs = KNeighborsClassifier()
knn_cv_gs= GridSearchCV(knn_gs,param_grd,cv=10,scoring='accuracy')
knn_cv_gs.fit(X1,Y1)

In [None]:
# view the complete results
results=pd.DataFrame(knn_cv_gs.cv_results_)
# compact view: parameter setting with its mean/std test score and rank
results1 = results[['params','mean_test_score','std_test_score','rank_test_score']]
# the full table (not the compact view) is the final expression of this cell
results

In [None]:
# first rows of the compact results table
results1.head()

In [None]:
# mean cross-validated accuracy of the grid search against k
plt.plot(k_range_gs, results1['mean_test_score'])
plt.ylabel('Cross Validated Accuracy')
plt.xlabel('Range of K')
plt.show()

## Examine the best model

In [None]:
# other ways to inspect the winner (disabled):
#print(knn_cv_gs.best_estimator_)
#print(knn_cv_gs.best_params_)
#print(knn_cv_gs.best_score_)
# best_index_ is the 0-based position of the best candidate in cv_results_,
# not the value of k itself; k_range_gs starts at 1, so k = best_index_ + 1
print(knn_cv_gs.best_index_)

In [None]:
# Refit k-NN with the number of neighbours selected by the grid search.
# best_index_ is a 0-based position in cv_results_, not k itself, so k is read
# from best_params_ instead of being hard-coded (the old literal 9 did not even
# match the variable name).
best_k = knn_cv_gs.best_params_['n_neighbors']
knn_10 = KNeighborsClassifier(n_neighbors=best_k)
knn_10.fit(x_train1,y_train1)
#Get accuracy. Note: In case of classification algorithms score method represents accuracy.
knn_10.score(x_test1,y_test1)

In [None]:
# predictions of the knn_10 model for the test and the training split
from sklearn.metrics import confusion_matrix
y_pred10 = knn_10.predict(x_test1)
y_pred_train10 = knn_10.predict(x_train1)
# rows = true class, columns = predicted class
confusion_matrix(y_test1,y_pred10)

# the same table with labelled axes
pd.crosstab(y_test1, y_pred10, rownames=['True'], colnames=['Predicted'])

In [None]:
# per-class precision, recall and f1 of the knn_10 model on the test split
from sklearn.metrics import classification_report
print(classification_report(y_test1,y_pred10))