In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import requests
import io
import matplotlib.pyplot as plt

In [None]:
# Download the UCI credit-card default dataset from GitHub and load it into a DataFrame.
path = "https://raw.githubusercontent.com/rajdeepsaha0809/Will-the-Credit-Card-Default-/main/UCI_Credit_Card.csv"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(path, timeout=30)
# Fail loudly on HTTP errors instead of trying to parse an error page as CSV.
response.raise_for_status()
download = response.content
df = pd.read_csv(io.StringIO(download.decode('utf-8')))
df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

### Data Description
    
    ID: ID of each client
    LIMIT_BAL: Amount of given credit in NT dollars (includes individual and family/supplementary credit)
    SEX: Gender (1=male, 2=female)
    EDUCATION: (1=graduate school, 2=university, 3=high school, 4=others, 5=unknown, 6=unknown)
    MARRIAGE: Marital status (1=married, 2=single, 3=others)
    AGE: Age in years
    PAY_0: Repayment status in September, 2005 (-1=pay duly, 1=payment delay for one month, 2=payment delay for two months,... 8=payment delay for eight months, 9=payment delay for nine months and above)
    PAY_2: Repayment status in August, 2005 (scale same as above)
    PAY_3: Repayment status in July, 2005 (scale same as above)
    PAY_4: Repayment status in June, 2005 (scale same as above)
    PAY_5: Repayment status in May, 2005 (scale same as above)
    PAY_6: Repayment status in April, 2005 (scale same as above)
    BILL_AMT1: Amount of bill statement in September, 2005 (NT dollar)
    BILL_AMT2: Amount of bill statement in August, 2005 (NT dollar)
    BILL_AMT3: Amount of bill statement in July, 2005 (NT dollar)
    BILL_AMT4: Amount of bill statement in June, 2005 (NT dollar)
    BILL_AMT5: Amount of bill statement in May, 2005 (NT dollar)
    BILL_AMT6: Amount of bill statement in April, 2005 (NT dollar)
    PAY_AMT1: Amount of previous payment in September, 2005 (NT dollar)
    PAY_AMT2: Amount of previous payment in August, 2005 (NT dollar)
    PAY_AMT3: Amount of previous payment in July, 2005 (NT dollar)
    PAY_AMT4: Amount of previous payment in June, 2005 (NT dollar)
    PAY_AMT5: Amount of previous payment in May, 2005 (NT dollar)
    PAY_AMT6: Amount of previous payment in April, 2005 (NT dollar)
    default.payment.next.month: Default payment (1=yes, 0=no)


#### Checking for Missing Values 

In [None]:
# Count the NaN entries in each column.
df.isna().sum() 
#There are no missing values

In [None]:
# Percentage of clients in each class of the target.
# normalize=True divides by the actual row count instead of a hard-coded 300 (= 30000 rows / 100).
df['default.payment.next.month'].value_counts(normalize=True) * 100

In [None]:
# List the column names as loaded from the CSV.
df.columns

In [None]:
# The repayment-status columns are PAY_0, PAY_2, ..., PAY_6; rename the first one
# so the series is numbered PAY_1..PAY_6 like BILL_AMT and PAY_AMT.
df = df.rename(columns={'PAY_0': 'PAY_1'})
df.head()

#### Categorical Variables:
'SEX', 'EDUCATION', 'MARRIAGE', 'PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6'

#### Numerical Variables: 
'LIMIT_BAL', 'AGE', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6'

#### Response Variable: 
'default.payment.next.month'

In [None]:
# Frequency of each EDUCATION code in the raw data.
df['EDUCATION'].value_counts()

In [None]:
# Fold the codes 0 and 6 into 5 so that a single code stands for "unknown".
df['EDUCATION'] = df['EDUCATION'].replace([0, 6], 5)

In [None]:
# Re-check the EDUCATION code frequencies after codes 0 and 6 were replaced by 5.
df['EDUCATION'].value_counts()

### Information Extraction and Visualization

In [None]:
# Continuous features: credit limit, age, six bill amounts and six payment amounts.
numeric_cols = (
    ['LIMIT_BAL', 'AGE']
    + [f'BILL_AMT{k}' for k in range(1, 7)]
    + [f'PAY_AMT{k}' for k in range(1, 7)]
)
numeric = df[numeric_cols]
numeric.head()

In [None]:
# Frequency of each repayment-status code in PAY_1.
df['PAY_1'].value_counts()

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Re-code each categorical column to consecutive integers starting at 0.
# The same encoder object is re-fitted per column, so after this cell it only
# remembers the classes of the last column (EDUCATION).
encoded_cols = [f"PAY_{k}" for k in range(1, 7)] + ['MARRIAGE', 'SEX', 'EDUCATION']
for col_name in encoded_cols:
    df[col_name] = le.fit_transform(df[col_name])
df.head(10)

In [None]:
# Give the target column a short name for use in the plots and tests below.
df = df.rename(columns={'default.payment.next.month': 'Default'})

In [None]:
# Left-closed age intervals [20,30), [30,40), ..., [60,80); each label shows the
# inclusive integer range, e.g. '20-29'.
bins = [20, 30, 40, 50, 60, 80]
labels = [f"{lo}-{hi - 1}" for lo, hi in zip(bins, bins[1:])]
df['age-bin'] = pd.cut(df['AGE'], bins=bins, labels=labels, right=False)
#df.head(10)

In [None]:
# Pie chart of the share of clients in each age group.
colours = ['#FFF1C9','#F7B7A3','#EA5F89','#9B3192','#57167E']
age_counts = df['age-bin'].value_counts()
plt.pie(
    age_counts,
    autopct='%1.1f%%',
    labels=age_counts.index,
    explode=[0.05] * 5,  # pull every wedge slightly away from the centre
    colors=colours,
)
plt.title('Proportions of different age groups')
#plt.savefig("Pie.jpg", dpi = 1000, format = 'jpg')

In [None]:
# Grouped bar chart: number of clients per age group, split by default status.
AgeBin_order = ['20-29', '30-39', '40-49', '50-59', '60-79']

plt.figure(figsize=(12, 8))
ax = sns.countplot(data=df, x='age-bin', hue='Default', palette='cubehelix', order=AgeBin_order)

ax.set_xlabel("Age Group", fontsize=12)
ax.set_ylabel("Number of Clients", fontsize=12)
ax.set_ylim(0, 10000)

# Write each bar's count just above it.
for bar in ax.patches:
    count = bar.get_height()
    ax.annotate(count, (bar.get_x() + 0.075, count + 300))
ax.set_title('Number of clients having default and non default credit cards for different age groups')
ax.legend(title='Default', labels=['No', 'Yes'])
plt.show()
fig = ax.get_figure()
#fig.savefig('age-group.jpg', dpi = 800,format = 'jpg')

In [None]:
# Total number of defaulting clients (Default is coded 0/1, so the sum is the count of ones).
defaulters = df['Default'].sum()

In [None]:
# Share (in %) of all defaulters that falls into each age group.
def_age_per = df.loc[df['Default'] == 1, 'age-bin'].value_counts() * 100 / defaulters
plt.figure(figsize=(12, 8))
ax = sns.barplot(x=def_age_per.index, y=def_age_per, data=df, palette='Paired', ci=None)
ax.set_ylabel('Percentages of Default', fontsize=12)
ax.set_xlabel('Age-Groups', fontsize=12)
ax.set_ylim(0, 50)
# Print each percentage (two decimals) just above its bar.
for bar in ax.patches:
    pct = bar.get_height()
    ax.annotate(f"{pct:.2f}", (bar.get_x() + 0.25, pct + 0.5), fontsize=12)
ax.set_title('Percentages of clients having default credit cards for different age groups')
# fig = ax.get_figure()
# fig.savefig('age-perc.jpg', dpi = 800,format = 'jpg')

In [None]:
# Default rate within each age group (bar height = mean of the 0/1 Default column).
plt.figure(figsize=(12, 8))
ax = sns.barplot(x="age-bin", y="Default", data=df, palette='magma', ci=None)

ax.set_ylabel('Proportions of Default', fontsize=12)
ax.set_ylim(0, 0.5)
plt.xticks([0, 1, 2, 3, 4], ['20-29', '30-39', '40-49', '50-59', '60-79'], fontsize=12)

# Print each proportion (two decimals) above its bar.
for bar in ax.patches:
    rate = bar.get_height()
    ax.annotate(f"{rate:.2f}", (bar.get_x() + 0.35, rate + 0.03), fontsize=13)
ax.set_title('Proportions of clients having default credit cards for different age groups')
# fig = ax.get_figure()
# fig.savefig('age-prop.jpg', dpi = 800,format = 'jpg')

In [None]:
# Grouped bar chart: number of clients per gender, split by default status.
# After label encoding, SEX is 0 for male and 1 for female.
gen_order = [0, 1]

plt.figure(figsize=(12, 8))
ax = sns.countplot(data=df, x='SEX', hue='Default', palette='viridis', order=gen_order)

ax.set_xlabel("Gender", fontsize=12)
ax.set_ylabel("Number of Clients", fontsize=12)
ax.set_ylim(0, 20000)

# Write each bar's count just above it.
for bar in ax.patches:
    count = bar.get_height()
    ax.annotate(count, (bar.get_x() + 0.075, count + 300))
ax.set_xticklabels(['Male', 'Female'])
ax.set_title('Number of clients having default and non default credit cards for different genders')
ax.legend(title='Default', labels=['No', 'Yes'])
# fig = ax.get_figure()
# fig.savefig('gender-group.jpg', dpi = 800,format = 'jpg')

In [None]:
# Share (in %) of all defaulters that belongs to each gender.
def_sex_per = df.loc[df['Default'] == 1, 'SEX'].value_counts() * 100 / defaulters
plt.figure(figsize=(12, 8))
ax = sns.barplot(x=def_sex_per.index, y=def_sex_per, data=df, palette='magma', ci=None)
ax.set_ylabel('Percentages of Default', fontsize=12)
ax.set_xlabel('Gender', fontsize=12)
ax.set_ylim(0, 100)
# Print each percentage (two decimals) just above its bar.
for bar in ax.patches:
    pct = bar.get_height()
    ax.annotate(f"{pct:.2f}", (bar.get_x() + 0.35, pct + 0.5), fontsize=12)
ax.set_title('Percentages of clients having default credit cards for different Gender')
plt.xticks([0, 1], ["Male", "Female"], fontsize=12)
plt.show()
# fig = ax.get_figure()
# fig.savefig('sex-perc.jpg', dpi = 800,format = 'jpg')

In [None]:
# Default rate within each gender (bar height = mean of the 0/1 Default column).
plt.figure(figsize=(12, 8))
ax = sns.barplot(x="SEX", y="Default", data=df, palette='icefire', ci=None)

ax.set_ylabel('Proportions of Default', fontsize=12)
ax.set_ylim(0, 0.5)
plt.xticks([0, 1], ['Male', 'Female'], fontsize=12)

# Print each proportion (two decimals) above its bar.
for bar in ax.patches:
    rate = bar.get_height()
    ax.annotate(f"{rate:.2f}", (bar.get_x() + 0.35, rate + 0.03), fontsize=13)
ax.set_title('Proportions of clients having default credit cards for different genders')
plt.show()
# fig = ax.get_figure()
# fig.savefig('gender-prop.jpg', dpi = 800,format = 'jpg')

In [None]:
# Grouped bar chart: number of clients per education level, split by default status.
# EDUCATION holds the label-encoded codes 0..4 at this point.
edu_order = [0, 1, 2, 3, 4]

plt.figure(figsize=(12, 8))
ax = sns.countplot(data=df, x='EDUCATION', hue='Default', palette='Spectral', order=edu_order)

ax.set_xlabel("Education", fontsize=12)
ax.set_ylabel("Number of Clients", fontsize=12)
ax.set_ylim(0, 15000)

# Write each bar's count just above it.
for bar in ax.patches:
    count = bar.get_height()
    ax.annotate(count, (bar.get_x() + 0.075, count + 300))
ax.set_xticklabels(['Grad school', 'University', 'High School', 'Others', 'Unknown'])
ax.set_title('Number of clients having default and non default credit cards for different educational levels')
ax.legend(title='Default', labels=['No', 'Yes'])
# fig = ax.get_figure()
# fig.savefig('education-group.jpg', dpi = 800,format = 'jpg')

In [None]:
# Share (in %) of all defaulters that belongs to each education level.
def_ed_per = df.loc[df['Default'] == 1, 'EDUCATION'].value_counts() * 100 / defaulters
plt.figure(figsize=(12, 8))
ax = sns.barplot(x=def_ed_per.index, y=def_ed_per, data=df, palette='flare', ci=None)
ax.set_ylabel('Percentages of Default', fontsize=12)
ax.set_xlabel('Education Level', fontsize=12)
ax.set_ylim(0, 100)
# Print each percentage (two decimals) just above its bar.
for bar in ax.patches:
    pct = bar.get_height()
    ax.annotate(f"{pct:.2f}", (bar.get_x() + 0.3, pct + 0.5), fontsize=12)
ax.set_title('Percentages of clients having default credit cards for different Education Level')
plt.xticks([0, 1, 2, 3, 4], ["Grad School", "University", "High School", "Others", "Unknown"], fontsize=12)
plt.show()
# fig = ax.get_figure()
# fig.savefig('edu-perc.jpg', dpi = 800,format = 'jpg')

In [None]:
# Default rate within each education level (bar height = mean of the 0/1 Default column).
plt.figure(figsize=(12, 8))
ax = sns.barplot(x="EDUCATION", y="Default", data=df, palette='rocket', ci=None)

ax.set_ylabel('Proportions of Default', fontsize=12)
ax.set_ylim(0, 0.5)
plt.xticks([0, 1, 2, 3, 4], ['Grad school', 'University', 'High School', 'Others', 'Unknown'], fontsize=12)

# Print each proportion (two decimals) above its bar.
for bar in ax.patches:
    rate = bar.get_height()
    ax.annotate(f"{rate:.2f}", (bar.get_x() + 0.35, rate + 0.03), fontsize=13)
ax.set_title('Proportions of clients having default credit cards for different educational levels')
# fig = ax.get_figure()
# fig.savefig('education-prop.jpg', dpi = 800,format = 'jpg')

In [None]:
# Grouped bar chart: number of clients per marital status, split by default status.
mar_order = [0, 1, 2, 3]

plt.figure(figsize=(12, 8))
ax = sns.countplot(data=df, x='MARRIAGE', hue='Default', palette='mako', order=mar_order)

ax.set_xlabel("Marital Status", fontsize=12)
ax.set_ylabel("Number of Clients", fontsize=12)
ax.set_ylim(0, 20000)

# Write each bar's count just above it.
for bar in ax.patches:
    count = bar.get_height()
    ax.annotate(count, (bar.get_x() + 0.075, count + 300))
ax.set_xticklabels(['Unknown', 'Married', 'Single', 'Divorce'])
ax.set_title('Number of clients having default and non default credit cards for different marital status')
ax.legend(title='Default', labels=['No', 'Yes'])
# fig = ax.get_figure()
# fig.savefig('marital-group.jpg', dpi = 800,format = 'jpg')

In [None]:
# Share (in %) of all defaulters that belongs to each marital status.
def_mar_per = df.loc[df['Default'] == 1, 'MARRIAGE'].value_counts() * 100 / defaulters
plt.figure(figsize=(12, 8))
ax = sns.barplot(x=def_mar_per.index, y=def_mar_per, data=df, palette='magma', ci=None)
ax.set_ylabel('Percentages of Default', fontsize=12)
ax.set_xlabel('Marital Status', fontsize=12)
ax.set_ylim(0, 100)
# Print each percentage (two decimals) just above its bar.
for bar in ax.patches:
    pct = bar.get_height()
    ax.annotate(f"{pct:.2f}", (bar.get_x() + 0.35, pct + 0.5), fontsize=12)
ax.set_title('Percentages of clients having default credit cards for different Marital Status')
plt.xticks([0, 1, 2, 3], ["Unknown", "Married", "Single", "Divorce"], fontsize=12)
plt.show()
# fig = ax.get_figure()
# fig.savefig('mar-perc.jpg', dpi = 800,format = 'jpg')

In [None]:
# Default rate within each marital status (bar height = mean of the 0/1 Default column).
plt.figure(figsize=(12,8))

ax = sns.barplot(x = "MARRIAGE", y = "Default", data = df, palette = 'Spectral', ci = None)

plt.ylabel('Proportions of Default', fontsize= 12)
plt.ylim(0,0.5)
plt.xticks([0,1,2,3],['Unknown', 'Married','Single','Divorce'], fontsize = 12)

# Print each proportion (two decimals) above its bar.
for p in ax.patches:
    ax.annotate("%.2f" %(p.get_height()), (p.get_x()+0.35, p.get_height()+0.03),fontsize=13)
# Only one title call is kept: the earlier, truncated title was immediately overwritten by this one.
plt.title('Proportions of clients having default credit cards for different marital status')
# fig = ax.get_figure()
# fig.savefig('marital-prop.jpg', dpi = 800,format = 'jpg')

In [None]:
# Default rate by gender, with one bar per marital status inside each gender.
plt.figure(figsize=(12, 8))
ax = sns.barplot(x="SEX", y="Default", hue="MARRIAGE", data=df, palette='icefire', ci=None)

ax.set_ylabel("Proportions of Default", fontsize=12)
ax.set_ylim(0, 0.5)
plt.xticks([0, 1], ['Male', 'Female'], fontsize=12)

# Print each proportion (two decimals) above its bar.
for bar in ax.patches:
    rate = bar.get_height()
    ax.annotate(f"{rate:.2f}", (bar.get_x() + 0.06, rate + 0.03), fontsize=12)
ax.set_title('Proportions of clients having default credit cards for different genders and marital status')
ax.legend(title='Marital Status', labels=['Unknown', 'Married', 'Single', 'Divorce'])
# fig = ax.get_figure()
# fig.savefig('gender-marital-prop.jpg', dpi = 800,format = 'jpg')

In [None]:
# Default rate by gender, with one bar per education level inside each gender.
plt.figure(figsize=(12,8))

ax = sns.barplot(x = "SEX", y = "Default", hue = "EDUCATION", data = df, palette = 'magma', ci = None)

plt.ylabel("Proportions of Default", fontsize= 12)
plt.ylim(0,0.5)
plt.xticks([0,1],['Male', 'Female'], fontsize = 12)

# Print each proportion (two decimals) above its bar.
for p in ax.patches:
    ax.annotate("%.2f" %(p.get_height()), (p.get_x()+0.06, p.get_height()+0.03),fontsize=12)
# The bars show the proportion of defaulters only, so the title no longer mentions
# "non default"; this matches the wording of the other proportion plots.
plt.title('Proportions of clients having default credit cards for different genders and educational levels')
plt.legend(title = 'Education level', labels = ['Graduate School', 'University', 'High School', 'Others', 'Unknown'])
# fig = ax.get_figure()
# fig.savefig('gender-education-prop.jpg', dpi = 800,format = 'jpg')

In [None]:
# Distribution of the credit limit for each gender.
plt.figure(figsize=(12, 8))
ax = sns.boxplot(x="SEX", y="LIMIT_BAL", data=df, palette='viridis')

# Show plain numbers on the y-axis instead of scientific notation.
ax.ticklabel_format(style='plain', axis='y')
plt.xticks([0, 1], ['Male', 'Female'], fontsize=12)
#plt.savefig('Box-gender-bal.jpg', dpi = 800, format = 'jpg')

In [None]:
# Distribution of the credit limit for each education level.
plt.figure(figsize=(14, 8))
ax = sns.boxplot(x="EDUCATION", y="LIMIT_BAL", data=df, palette='cubehelix')

# Show plain numbers on the y-axis instead of scientific notation.
ax.ticklabel_format(style='plain', axis='y')
plt.xticks([0, 1, 2, 3, 4], ['Grad School', 'University', 'High School', 'Others', 'Unknown'], fontsize=12)
#plt.savefig('Box-edu-bal.jpg', dpi = 800, format = 'jpg')

In [None]:
# Distribution of the credit limit for each marital status.
plt.figure(figsize=(14, 8))
ax = sns.boxplot(x="MARRIAGE", y="LIMIT_BAL", data=df, palette='flare')

# Show plain numbers on the y-axis instead of scientific notation.
ax.ticklabel_format(style='plain', axis='y')
plt.xticks([0, 1, 2, 3], ['Unknown', 'Married', 'Single', 'Divorce'], fontsize=12)
#plt.savefig('Box-marriage-bal.jpg', dpi = 800, format = 'jpg')

In [None]:
# Distribution of the credit limit for each age group.
plt.figure(figsize=(14, 8))
ax = sns.boxplot(x="age-bin", y="LIMIT_BAL", data=df, palette='Spectral', order=AgeBin_order)

# Show plain numbers on the y-axis instead of scientific notation.
ax.ticklabel_format(style='plain', axis='y')
ax.set_xlabel("Age Group", fontsize=12)
#plt.savefig('Box-age-bal.jpg', dpi = 800, format = 'jpg')

In [None]:
# Distribution of the credit limit for each education level, split by gender.
plt.figure(figsize=(14, 8))
ax = sns.boxplot(x="EDUCATION", y="LIMIT_BAL", hue='SEX', data=df, palette='rocket')

# Show plain numbers on the y-axis instead of scientific notation.
ax.ticklabel_format(style='plain', axis='y')
plt.xticks([0, 1, 2, 3, 4], ['Grad School', 'University', 'High School', 'Others', 'Unknown'], fontsize=11)
#plt.savefig('Box-edu-sex-bal.jpg', dpi = 800, format = 'jpg')

In [None]:
# Distribution of the credit limit for each marital status, split by gender.
plt.figure(figsize=(14, 8))
ax = sns.boxplot(x="MARRIAGE", y="LIMIT_BAL", hue='SEX', data=df, palette='crest')

# Show plain numbers on the y-axis instead of scientific notation.
ax.ticklabel_format(style='plain', axis='y')
plt.xticks([0, 1, 2, 3], ['Unknown', 'Married', 'Single', 'Divorce'], fontsize=12)
#plt.savefig('Box-marriage-sex-bal.jpg', dpi = 800, format = 'jpg')

In [None]:
# Distribution of the credit limit for each age group, split by gender.
plt.figure(figsize=(15, 8))
ax = sns.boxplot(x="age-bin", y="LIMIT_BAL", hue='SEX', data=df, palette='rocket', order=AgeBin_order)

# Show plain numbers on the y-axis instead of scientific notation.
ax.ticklabel_format(style='plain', axis='y')
ax.set_xlabel("Age Group", fontsize=12)
#plt.savefig('Box-age-sex-bal.jpg', dpi = 800, format = 'jpg')

In [None]:
# Distribution of the credit limit for defaulters vs non-defaulters, split by gender.
plt.figure(figsize=(12, 8))
ax = sns.boxplot(x="Default", y="LIMIT_BAL", hue='SEX', data=df, palette='mako')

# Show plain numbers on the y-axis instead of scientific notation.
ax.ticklabel_format(style='plain', axis='y')
plt.xticks([0, 1], ['No', 'Yes'], fontsize=12)
#plt.savefig('Box-default-sex-bal.jpg', dpi = 800, format = 'jpg')

In [None]:
import scipy.stats.distributions as dst

In [None]:
#Two-proportion z-test: do male (SEX == 0) and female (SEX == 1) clients default at different rates?
significance=0.05
is_default = df['Default'] == 1
is_male = df['SEX'] == 0
is_female = df['SEX'] == 1
# Group sizes and defaulter counts, taken as sums of boolean masks.
male_default = int((is_male & is_default).sum())
tot_m = int(is_male.sum())
female_default = int((is_female & is_default).sum())
tot_f = int(is_female.sum())
#H0:p1=p2 vs H1:not H0
p1 = male_default / tot_m
p2 = female_default / tot_f
# Pooled default proportion under H0, used for the standard error.
p = (male_default + female_default) / (tot_m + tot_f)
z = (p1 - p2) / np.sqrt(p * (1 - p) * (1 / tot_m + 1 / tot_f))
# Two-sided p-value from the standard normal distribution.
p_value = 2 * (1 - dst.norm.cdf(abs(z)))
print('p value :{}'.format(p_value))
if p_value > significance:
    print("Fail to reject the null hypothesis")
else:
    print("Reject the null hypothesis - suggest the alternative hypothesis is true")


#### Hence, Gender is a significant attribute.  

In [None]:
#checking significance of difference of proportion of married and unmarried defaulters:
# Two-proportion z-test; "married" is MARRIAGE == 1, "unmarried" is MARRIAGE in {2, 3}.
significance=0.05
is_default = df['Default'] == 1
is_married = df['MARRIAGE'] == 1
is_unmarried = df['MARRIAGE'].isin([2, 3])
ma_default = int((is_married & is_default).sum())
tot_ma = int(is_married.sum())
un_default = int((is_unmarried & is_default).sum())
tot_un = int(is_unmarried.sum())
#H0:p1=p2 vs H1:not H0
p1=ma_default/tot_ma
p2=un_default/tot_un
# Pooled proportion under H0: defaulters from BOTH groups over all clients in both groups.
# (The previous version added ma_default twice and left out un_default.)
p=(ma_default+un_default)/(tot_ma+tot_un)
z=(p1-p2)/np.sqrt(p*(1-p)*((1/tot_ma)+(1/tot_un)))
# Two-sided p-value from the standard normal distribution.
p_value=2*(1-dst.norm.cdf(abs(z)))
print('p value :{}'.format(p_value))
if p_value > significance:
   print ("Fail to reject the null hypothesis")
else:
   print ("Reject the null hypothesis - suggest the alternative hypothesis is true")

#### Marriage status is a significant attribute.

In [None]:
#checking significance of difference of proportion of age between (35,60) and age<35 defaulters:
# Two-proportion z-test; clients older than 60 belong to neither group.
significance=0.05
is_default = df['Default'] == 1
# between() includes both end points by default. The boolean form inclusive=True is
# rejected by pandas 2.x, so the argument is left out to work on old and new versions.
in_age1 = df['AGE'].between(35, 60)
in_age2 = df['AGE'] < 35
age1_default = int((in_age1 & is_default).sum())
tot_age1 = int(in_age1.sum())
age2_default = int((in_age2 & is_default).sum())
tot_age2 = int(in_age2.sum())
#H0:p1=p2 vs H1:not H0
p1=age1_default/tot_age1
p2=age2_default/tot_age2
# Pooled default proportion under H0, used for the standard error.
p=(age1_default+age2_default)/(tot_age1+tot_age2)
z=(p1-p2)/np.sqrt(p*(1-p)*((1/tot_age1)+(1/tot_age2)))
# Two-sided p-value from the standard normal distribution.
p_value=2*(1-dst.norm.cdf(abs(z)))
print('p value :{}'.format(p_value))
if p_value > significance:
   print ("Fail to reject the null hypothesis")
else:
   print ("Reject the null hypothesis - suggest the alternative hypothesis is true")

#### Age is an important factor.

In [None]:
#checking significance of difference of proportion of limit_bal between (10k,100k) and limit_bal>100k defaulters:
# Two-proportion z-test on two credit-limit bands.
significance=0.05
is_default = df['Default'] == 1
# between() includes both end points by default. The boolean form inclusive=True is
# rejected by pandas 2.x, so the argument is left out to work on old and new versions.
in_low_band = df['LIMIT_BAL'].between(10000, 100000)
in_high_band = df['LIMIT_BAL'] > 100000
B1_default = int((in_low_band & is_default).sum())
tot_B1 = int(in_low_band.sum())
B2_default = int((in_high_band & is_default).sum())
tot_B2 = int(in_high_band.sum())
#H0:p1=p2 vs H1:not H0
p1=B1_default/tot_B1
p2=B2_default/tot_B2
# Pooled default proportion under H0, used for the standard error.
p=(B1_default+B2_default)/(tot_B1+tot_B2)
z=(p1-p2)/np.sqrt(p*(1-p)*((1/tot_B1)+(1/tot_B2)))
# Two-sided p-value from the standard normal distribution.
p_value=2*(1-dst.norm.cdf(abs(z)))
print('p value :{}'.format(p_value))
if p_value > significance:
   print ("Fail to reject the null hypothesis")
else:
   print ("Reject the null hypothesis - suggest the alternative hypothesis is true")

#### Limit balance groups are also significant for prediction purposes.

In [None]:
#checking significance of difference of proportion of education :high school and graduate and above defaulters:
# Two-proportion z-test: graduate school/university vs high school/others.
# EDUCATION was label-encoded earlier in the notebook, which shifted the raw codes 1..5
# down to 0..4 (0=grad school, 1=university, 2=high school, 3=others, 4=unknown, as used
# for the plot tick labels). The groups therefore use the encoded codes [0, 1] and [2, 3];
# the raw codes [1, 2] / [3, 4] would compare university+high school against others+unknown.
significance=0.05
is_default = df['Default'] == 1
is_grad_or_univ = df['EDUCATION'].isin([0, 1])
is_hs_or_other = df['EDUCATION'].isin([2, 3])
B1_default = int((is_grad_or_univ & is_default).sum())
tot_B1 = int(is_grad_or_univ.sum())
B2_default = int((is_hs_or_other & is_default).sum())
tot_B2 = int(is_hs_or_other.sum())
#H0:p1=p2 vs H1:not H0
p1=B1_default/tot_B1
p2=B2_default/tot_B2
# Pooled default proportion under H0, used for the standard error.
p=(B1_default+B2_default)/(tot_B1+tot_B2)
z=(p1-p2)/np.sqrt(p*(1-p)*((1/tot_B1)+(1/tot_B2)))
# Two-sided p-value from the standard normal distribution.
p_value=2*(1-dst.norm.cdf(abs(z)))
print('p value :{}'.format(p_value))
if p_value > significance:
   print ("Fail to reject the null hypothesis")
else:
   print ("Reject the null hypothesis - suggest the alternative hypothesis is true")

#### Education level is a significant attribute.

### Checking Correlation Plot to find any association between the continuous features.

In [None]:
# Pairwise correlations of the continuous features, computed once and reused below.
corr_matrix = numeric.corr()
f, ax = plt.subplots(figsize=(20, 20))
# Upper-triangle mask; currently not passed to the heatmap, so the full matrix is drawn.
mask = np.triu(np.ones_like(corr_matrix))
sns.heatmap(corr_matrix, annot=True, linewidths=.5, fmt='.2f', ax=ax, cmap="YlGnBu")  #, mas = mask)
plt.title("Correlation plot of all features ")
#plt.savefig('corplot.png', dpi = 1000, format = 'png')

In [None]:
# Remove the client ID, the later bill-amount columns (BILL_AMT2..BILL_AMT6) and the
# helper age-bin column before modelling.
dropped_cols = ['ID'] + [f'BILL_AMT{k}' for k in range(2, 7)] + ['age-bin']
df = df.drop(columns=dropped_cols)

In [None]:
# Preview the frame that goes into modelling.
df.head()

### Model Fitting

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import roc_auc_score, confusion_matrix, roc_curve

In [None]:
# Features are every column except the target. Selecting by name instead of the
# positional slice 0:18 keeps the target out of X even if columns are added or removed.
X = df.drop(columns=['Default'])
# The target is kept as a one-column DataFrame, as before.
y = df.loc[:,['Default']]
X.columns

In [None]:
print(X.shape)
# Hold out 30% of the rows, then split that hold-out 2:1 into a validation set
# (X_cv, y_cv) and a test set (X_test, y_test). Both splits use a fixed seed.
X_train, X_rest, y_train, y_rest = train_test_split(X, y, test_size=0.30, random_state=42)
X_cv, X_test, y_cv, y_test = train_test_split(X_rest, y_rest, test_size=1/3, random_state=42)
#print(X_cv.shape)

In [None]:
# Size of the 30% hold-out before it is divided into validation and test sets.
print(X_rest.shape)

In [None]:
# Resample with SMOTE using the training split only; the validation and test splits
# are not passed to the sampler.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train,y_train)
print(X_res.shape)

In [None]:
# Share of each class in the resampled training target.
y_res.value_counts()/len(y_res)

In [None]:
# Logistic regression trained on the SMOTE-resampled training data, scored on the validation split.
# max_iter is raised above the default of 100 because the features are unscaled
# (credit limits and bill amounts next to small integer codes), which presumably keeps
# the default solver from converging within 100 iterations - confirm against the fit warnings.
LR = LogisticRegression(max_iter=1000)
# The target is flattened to 1-D: passing a one-column frame makes scikit-learn emit a
# column-vector conversion warning.
LR.fit(X_res, np.ravel(y_res))
y_plr = LR.predict(X_cv)

In [None]:
# Confusion matrix of the logistic-regression predictions on the validation split.
conf_matrix = confusion_matrix(y_true=y_cv, y_pred=y_plr)
fig, ax = plt.subplots(figsize=(7.5, 7.5))
ax.matshow(conf_matrix, cmap=plt.cm.Blues, alpha=0.3)
# Write every cell count at the centre of its square (rows = actual, columns = predicted).
for (row, col), count in np.ndenumerate(conf_matrix):
    ax.text(x=col, y=row, s=count, va='center', ha='center', size='xx-large')

ax.set_xlabel('Predictions', fontsize=18)
ax.set_ylabel('Actuals', fontsize=18)
fig.suptitle('Confusion Matrix', fontsize=14)
ax.set_title('Logistic Regression', fontsize=12)

plt.show()

In [None]:
# Accuracy of the logistic-regression predictions on the validation split.
print("Accuracy:",metrics.accuracy_score(y_cv, y_plr))
# NOTE(review): the disabled metrics below pair y_test with y_plr, which was predicted
# on X_cv; change them to y_cv before re-enabling.
# print(metrics.f1_score(y_test, y_plr))
# print(roc_auc_score(y_test, y_plr))

In [None]:
# Entropy-based decision tree (depth capped at 7) trained on the SMOTE-resampled data,
# scored on the validation split.
# A fixed random_state makes the fitted tree reproducible across runs, in line with
# the seeded train/test split and SMOTE step.
ct = DecisionTreeClassifier(criterion="entropy", max_depth = 7, random_state=42)
ct.fit(X_res, y_res)
y_pdt = ct.predict(X_cv)

In [None]:
# Confusion matrix of the decision-tree predictions on the validation split.
conf_matrix = confusion_matrix(y_true=y_cv, y_pred=y_pdt)
fig, ax = plt.subplots(figsize=(7.5, 7.5))
ax.matshow(conf_matrix, cmap=plt.cm.Blues, alpha=0.3)
# Write every cell count at the centre of its square (rows = actual, columns = predicted).
for (row, col), count in np.ndenumerate(conf_matrix):
    ax.text(x=col, y=row, s=count, va='center', ha='center', size='xx-large')

ax.set_xlabel('Predictions', fontsize=18)
ax.set_ylabel('Actuals', fontsize=18)
fig.suptitle('Confusion Matrix', fontsize=14)
ax.set_title('Decision Tree', fontsize=12)
plt.show()

In [None]:
# Accuracy of the decision-tree predictions on the validation split.
print("Accuracy:",metrics.accuracy_score(y_cv, y_pdt))
# NOTE(review): the disabled metrics below pair y_test with y_pdt, which was predicted
# on X_cv, and call f1_score without the `metrics.` prefix although it is not imported
# by name; fix both before re-enabling.
# print(f1_score(y_test, y_pdt))
# print(roc_auc_score(y_test, y_pdt))

In [None]:
# Random forest (100 trees) trained on the SMOTE-resampled data, scored on the validation split.
# A fixed random_state makes the forest, and every metric derived from it, reproducible,
# in line with the seeded train/test split and SMOTE step.
rf = RandomForestClassifier(n_estimators= 100, random_state=42)
# The target is flattened to 1-D: passing a one-column frame makes scikit-learn emit a
# column-vector conversion warning.
rf.fit(X_res, np.ravel(y_res))
y_prf = rf.predict(X_cv)

In [None]:
# Confusion matrix of the random-forest predictions on the validation split.
conf_matrix = confusion_matrix(y_true=y_cv, y_pred=y_prf)
fig, ax = plt.subplots(figsize=(7.5, 7.5))
ax.matshow(conf_matrix, cmap=plt.cm.Blues, alpha=0.3)
# Write every cell count at the centre of its square (rows = actual, columns = predicted).
for (row, col), count in np.ndenumerate(conf_matrix):
    ax.text(x=col, y=row, s=count, va='center', ha='center', size='xx-large')

ax.set_xlabel('Predictions', fontsize=18)
ax.set_ylabel('Actuals', fontsize=18)
fig.suptitle('Confusion Matrix', fontsize=14)
ax.set_title('Random Forest', fontsize=12)
plt.show()

In [None]:
# Accuracy of the random-forest predictions on the validation split.
print("Accuracy:",metrics.accuracy_score(y_cv, y_prf))
# NOTE(review): the disabled metrics below pair y_test with y_prf, which was predicted
# on X_cv, and call f1_score without the `metrics.` prefix although it is not imported
# by name; fix both before re-enabling.
# print(f1_score(y_test, y_prf))
# print(roc_auc_score(y_test, y_prf))

Random Forest is the best model

In [None]:
# Predict the held-out test split with the random forest.
y_pred = rf.predict(X_test)

In [None]:
# Test-set accuracy of the random forest's class predictions.
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
# ROC AUC is a ranking metric, so it is computed from the predicted probability of the
# positive class. Scoring the hard 0/1 labels instead collapses the ROC curve to a single
# operating point and misreports the AUC.
print(roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1]))

In [None]:
# Confusion matrix of the random-forest predictions on the test split.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(7.5, 7.5))
ax.matshow(conf_matrix, cmap=plt.cm.Blues, alpha=0.3)
# Write every cell count at the centre of its square (rows = actual, columns = predicted).
for (row, col), count in np.ndenumerate(conf_matrix):
    ax.text(x=col, y=row, s=count, va='center', ha='center', size='xx-large')

ax.set_xlabel('Predictions', fontsize=18)
ax.set_ylabel('Actuals', fontsize=18)
fig.suptitle('Confusion Matrix', fontsize=14)
ax.set_title('Random Forest', fontsize=12)
plt.show()

In [None]:
# ROC curve of the random forest on the test split, from positive-class probabilities.
y_pred_prob = rf.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
# Dashed diagonal = reference line where true and false positive rates are equal.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr, label='Random Forest')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Random Forest ROC Curve')
# Without a legend the curve's label set above is never displayed.
plt.legend()
plt.show()
