# Data Wrangling

In [1]:
# Import required package
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Assign url of file: url
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls'

# Read in all sheets of Excel file and load into xl 
# we skipped the first row because the data had two rows with column labels
#xl = pd.read_excel(url, sheet_name=None, skiprows = [0])
xl = pd.read_excel('default of credit card clients.xls', skiprows = [0])

FileNotFoundError: [Errno 2] No such file or directory: 'default of credit card clients.xls'

In [None]:
xl.head()

In [None]:
#convert the xl OrderedDict to a pandas dataframe
#df = pd.DataFrame(xl['Data'])
df = xl

In [None]:
df.columns

In [None]:
#check the dataframe
df.head()

In [None]:
#create a new list of column label to rename the columns of dataframe
new_columns = ['id','limit_balance', 'gender', 'education','marital_status', 'age','status_september','status_august','status_july','status_june','status_may','status_april','balance_september','balance_august','balance_july','balance_june','balance_may','balance_april','paid_september','paid_august','paid_july','paid_june','paid_may','paid_april','default_payment']
#change the column labels
df.columns = new_columns

In [None]:
#to check the column labels
df.head()

In [None]:
#to get information about the data
df.info()

In [None]:
#Using the boxplot to check for outliers
df.boxplot('age',figsize=(10,5))

In [None]:
#Create a list of column labels for different months for balance amount and paid amount
balance = ['balance_september','balance_august','balance_july','balance_june','balance_may','balance_april']
paid = ['paid_september','paid_august','paid_july','paid_june','paid_may','paid_april']

In [None]:
#we can use boxplot to check for outliers from balance columns for each month
df.boxplot(balance, figsize=(10,10))

In [None]:
#Draw boxplot to detect outliers from paid column for each month
df.boxplot(paid, figsize=(10,10))

In [None]:
#find min, max, median, 1st and 3rd quartile values for balance column of each month 
df[balance].describe()

In [None]:
#find min, max, median, 1st and 3rd quartile values for paid column of each month
df[paid].describe()

# I created a new dataframe to update and modify bad data plus the outliers values. I   
The bad data are replaced with their best possible values.

The outliers are updated with their closest lower and upper quartile range value.

In [None]:
new_df = df.copy()

In [None]:
#to check for bad data in the education column
new_df[new_df['education']>4]['education'].count()

In [None]:
new_df[new_df['education']<1]['education'].count()

In [None]:
#to change the bad values 5, 6 and 0 into 4 from education column
for i in new_df[(new_df['education']>4)|(new_df['education']<1)]['education'].index:
    new_df.at[i,'education'] = 4

In [None]:
#to check if there is any bad data in the marital_status column
new_df[(new_df['marital_status']<1)|(new_df['marital_status']>3)]['marital_status'].head()

In [None]:
#to change the bad value 0 from marital_status column
for i in new_df[new_df['marital_status']==0]['marital_status'].index:
    new_df.at[i,'marital_status'] = 3

In [None]:
#to check if there is any bad data in the status columns, looking at the min and max values
status=['status_september','status_august','status_july','status_june','status_may','status_april']
new_df[status].describe()

In [None]:
#to check what was the client's status was before it changed to -2
#we can see that the client's pay status was -1 before it changed to -1.
#we will change the value from -2 to -1 
new_df[(new_df['status_september']<-1)|(new_df['status_september']==0)][status]

In [None]:
#to change the bad values -2 and 0 into -1 from status column
for month in status:
    for i in new_df[(new_df[month]<-1)|(new_df[month]==0)][month].index:
        new_df.at[i,month] = -1

In [None]:
#this for loop will locate the outliers from the dataframe and update those values with their corresponding lower and upper limit
#iterate over the list named balance, which has the balance column labels
for month in balance:
    #calculate 3rd quartile
    q3 = new_df[month].quantile(0.75)
    #calculate 1st quartile
    q1 = new_df[month].quantile(0.25)
    #calculate interquartile range
    iqr = q3-q1
    lower = q1-(1.5*iqr)
    upper = q3+(1.5*iqr)
    lower_index = new_df[new_df[month]<=lower].index
    upper_index = new_df[new_df[month]>=upper].index
    for i in lower_index:
        new_df.at[i,month] = lower
    for i in upper_index:
        new_df.at[i,month] = upper

In [None]:
#check to if the vlaues has been updated
new_df[balance].describe()

In [None]:
max(new_df[balance].max())

In [None]:
min(new_df[balance].min())

In [None]:
#this for loop will locate the outliers from the dataframe and update those values with their corresponding lower and upper limit
#iterate over the list named paid, which has the paid column labels
for month in paid:
    #calculate 3rd quartile
    q3 = new_df[month].quantile(0.75)
    #calculate 1st quartile
    q1 = new_df[month].quantile(0.25)
    #calculate interquartile range
    iqr = q3-q1
    lower = q1-(1.5*iqr)
    upper = q3+(1.5*iqr)
    lower_index = new_df[new_df[month]<=lower].index
    upper_index = new_df[new_df[month]>=upper].index
    for i in lower_index:
        new_df.at[i,month] = lower
    for i in upper_index:
        new_df.at[i,month] = upper

In [None]:
#to check the update
new_df[paid].describe()

In [None]:
max(new_df[paid].max())

In [None]:
#to check the updated value using boxplot
new_df.boxplot(balance,figsize=(10,5))

In [None]:
#to check the updated values in those column after dealing with outliers
new_df.boxplot(paid, figsize=(10,5))

# Data Story

In [None]:
#to find out total number of 1 default and 0 non-default 
new_df['default_payment'].value_counts()

In [None]:
#to calculate the percentage of 1 default and 0 non-default
default_rate = new_df['default_payment'].value_counts() * 100 / len(new_df)
default_rate

# After we wrangled and cleaned the dataset, we started to explore the data in detail. The first step was to see the count and distributions of different variables from the dataset.

# ● How many cardholders are defaulters?
We found that the  77.88% of the cardholders(23,364) did not default and 22.12% of the cardholders (6,636) default.

In [None]:
#draw a bar plot to visualize the default payment
import matplotlib.patches as mpatches
plt.subplot(1,2,1)
default_rate.plot(kind='bar',title='Percentage Distribution of Default Payment',figsize=(10,5))
ND = mpatches.Patch(color ='blue', label = '0-Non Default')
DT = mpatches.Patch(color ='orange', label = '1-Default')
plt.legend(handles=[ND, DT], loc=0)
plt.xlabel('default status')
plt.ylabel('Population Percentage')
plt.subplot(1,2,2)
new_df['default_payment'].value_counts().plot(kind='bar',title='Count of Default Payment')
plt.xlabel('default_status')
plt.ylabel('Number of card-holder')
ND = mpatches.Patch(color ='blue', label = '0-Non Default')
DT = mpatches.Patch(color ='orange', label = '1-Default')
plt.legend(handles=[ND, DT], loc=0)
plt.tight_layout()
plt.savefig('Default_Payment_distribution.jpg')

In [None]:
#to find out total number of 1 male and 2 female
#there is more 2 female 
new_df['gender'].value_counts()

In [None]:
#calculate the gender distribution
gender_rate = new_df['gender'].value_counts() * 100 / len(new_df)
gender_rate

# ● How are the cardholders divided by gender? 
We have 60% female and 40% male clients.

In [None]:
#to calculate the percentage and count of gender
plt.subplot(1,2,1)
#draw a bar plot to see the total number of 1 male and 2 female card holder
gender_rate.plot(kind='bar',title='Percentage Distribution of Gender',figsize=(10,5))
ML = mpatches.Patch(color ='orange', label = '1-Male')
FM = mpatches.Patch(color ='blue', label = '2-Female')
plt.legend(handles=[ML, FM], loc=0)
plt.ylabel('Population Percentage')
plt.subplot(1,2,2)
new_df.gender.value_counts().plot(kind='bar',title='Count of Gender')
ML = mpatches.Patch(color ='orange', label = '1-Male')
FM = mpatches.Patch(color ='blue', label = '2-Female')
plt.legend(handles=[ML, FM], loc=0)
plt.ylabel('Number of Card-holder')
plt.tight_layout()
plt.savefig('gender_distribution.jpg')

In [None]:
#to find out the total number of cardholder with different education level
#2 University level is high, and 4 others is low
new_df['education'].value_counts()

In [None]:
#to calculate the percentage education distribution
education_rate = new_df['education'].value_counts() * 100 / len(new_df)
education_rate

# ● What are the education level, and Which education level does the most of the cardholders belong to? 
Most of our cardholder have University level education for their highest level of education. We have 35% with Graduate level education, 46% with University level, 16% with High School level, and 1.5% with Others as level of education.

In [None]:
#draw a bar plot to see what education level most of the card holder have
#University level  is the highest
plt.subplot(1,2,1)
education_rate.plot(kind='bar',title='Percentage Distribution of Education Level',figsize=(10,5))
GD = mpatches.Patch(color ='Orange', label = '1-Graduate')
UN = mpatches.Patch(color ='blue', label = '2-University')
HS = mpatches.Patch(color ='green', label = '3-High School')
OT = mpatches.Patch(color ='red', label = '4-Other')
plt.legend(handles=[GD, UN, HS, OT], loc=0)
plt.ylabel('Population Percentage')
plt.subplot(1,2,2)
new_df.education.value_counts().plot(kind='bar',title='Count of Education Level',figsize=(10,5))
GD = mpatches.Patch(color ='Orange', label = '1-Graduate')
UN = mpatches.Patch(color ='blue', label = '2-University')
HS = mpatches.Patch(color ='green', label = '3-High School')
OT = mpatches.Patch(color ='red', label = '4-Other')
plt.legend(handles=[GD, UN, HS, OT], loc=0)
plt.ylabel('Number of Card-holder')
plt.tight_layout()
plt.savefig('education_distribution.jpg')

In [None]:
#to find out the count of different martial status,1 married, 2 single, 3 others from the dataset 
#2 i.e single is highest
new_df['marital_status'].value_counts()

In [None]:
#to calculate the percentage education distribution
marital_status_rate = new_df['marital_status'].value_counts() * 100 / len(new_df)
marital_status_rate

# ● How many cardholders are married and how many are single?
We have 53% married, 45% single, and rest as others.

In [None]:
plt.subplot(1,2,1)
#draw a bar plot to see the population of married, single, and others
#single is highest, married is second and others is lowest
marital_status_rate.plot(kind='bar', title='Percentage Distribution of Marital_Status',figsize=(10,5))
MR = mpatches.Patch(color ='orange', label = '1-Married')
SG = mpatches.Patch(color ='blue', label = '2-Single')
O = mpatches.Patch(color ='green', label = '3-Other')
plt.legend(handles=[MR, SG, O], loc=0)
plt.ylabel('Population Percentage')
plt.subplot(1,2,2)
new_df['marital_status'].value_counts().plot(kind='bar', title='Count of Marital_Status',figsize=(10,5))
MR = mpatches.Patch(color ='orange', label = '1-Married')
SG = mpatches.Patch(color ='blue', label = '2-Single')
O = mpatches.Patch(color ='green', label = '3-Other')
plt.legend(handles=[MR, SG, O], loc=0)
plt.ylabel('Number of Card-holder')
plt.tight_layout()
plt.savefig('marital_status_distribution.jpg')

# ● What age group is the majority of the cardholders?
Most of our carholders are of age group 20 to 40. This exploration can provide us with the demographic of different variables.

In [None]:
#we can also draw a histogram to see our age group in the dataset
plt.subplot(1,2,1)
new_df['age'].hist(bins=6,figsize=(10,5),normed=1,grid=False)
plt.title('Distribution of Age')
plt.xlabel('age')
plt.ylabel('Population')
plt.subplot(1,2,2)
new_df['age'].hist(bins=6, grid=False)
plt.title('Count of Age-Group')
plt.xlabel('age')
plt.ylabel('Number of Card-holder')
plt.tight_layout()
plt.savefig('age_histogram.jpg')

In [None]:
#group the data by gender and find out the default rate
total_defaults = np.sum(new_df['default_payment']==1)
group_gender = new_df.groupby('gender')['default_payment'].sum().reset_index(name='default')
group_gender

# The main focus of this project is to create different Machine Learning Models to predict default, so let's find out some insights using different data visualization. We drew barplots to compare different variables with the default cardholders. From each plot, we learned which sub-variable effects default.

In [None]:
#Now we will commpare default rate with Gender
#it shows that Female is the lowest to default
#married and single both are high in number to default
group_gender.plot(kind='bar',x='gender', y='default',title='Gender with Default',figsize=(10,5))
ML = mpatches.Patch(color ='blue', label = '1-Male')
FM = mpatches.Patch(color ='orange', label = '2-Female')
plt.legend(handles=[ML, FM], loc=0)
plt.ylabel('default')
plt.tight_layout()
plt.savefig('gender_default_bar.jpg')

In [None]:
group_education = new_df.groupby(['education'])['default_payment'].sum().reset_index(name='default')
group_education

# We found that University level cardholders default more than other education level.

In [None]:
#Now we will commpare default with all marital status
#it shows that others is the lowest to default
#married and single both are high in number to default
group_education.plot(kind='bar',x='education',y='default',figsize=(10,5),title='Education Level with Default')
G = mpatches.Patch(color ='blue', label = '1-Graduate')
U = mpatches.Patch(color ='orange', label = '2-University')
H = mpatches.Patch(color ='green', label = '3-High School')
OT = mpatches.Patch(color ='red', label = '4-Other')
plt.legend(handles=[G, U, H, OT], loc=0)
plt.ylabel('default')
plt.tight_layout()
plt.savefig('education_default_bar.jpg')

In [None]:
#group the data by marital_status to see who has maximum default
group_marital = new_df.groupby(['marital_status'])['default_payment'].sum().reset_index(name='default')
group_marital

# From marital_status with default plot, we found that both married and single have very close number of default, and others have a very low default.

In [None]:
#Now we will commpare default with all marital status
#it shows that others is the lowest to default
#married and single both are high in number to default
group_marital.plot(kind='bar',x='marital_status',y='default',title='Marital Status with Default',figsize=(10,5))
MR = mpatches.Patch(color ='blue', label = '1-Married')
SG = mpatches.Patch(color ='orange', label = '2-Single')
O = mpatches.Patch(color ='green', label = '3-Other')
plt.legend(handles=[MR, SG, O], loc=0)
plt.ylabel('default')
plt.tight_layout()
plt.savefig('marital_status_default_bar.jpg')

In [None]:
#to work with age group, create a list of age group to use as a bins
#create a new column with column label 'age_bin'
bins = [20,30,40,50,60,70,80]
new_df['age_bin']= pd.cut(new_df['age'], bins)

In [None]:
#find out the population if different age group
#age group 20 to 30 is the highest, and 30-40 is the second highest
new_df['age_bin'].value_counts()

In [None]:
#Find the percentage of different age_group
agegroup_rate = new_df['age_bin'].value_counts() * 100 / len(new_df)
agegroup_rate

In [None]:
#now group the data by age_bin
group_age = new_df.groupby('age_bin')['default_payment'].sum().reset_index(name='default')
group_age

# Age group with default shows that age group 20 to 30 default more than any other age-group.

In [None]:
#Now,default_payment with age groups
#more number of age-group(20-30) clients default
group_age.plot(kind='bar',x='age_bin',y='default',title='Age Group with Default',figsize=(10,5))
plt.ylabel('default')
plt.tight_layout()
plt.savefig('age_group_default.jpg')

In [None]:
#now group the data by age to find what age is the majority of defaulter
group_age1 = new_df.groupby('age')['default_payment'].sum().reset_index(name='default')
group_age1.sort_values(by="default", ascending=False).head()

In [None]:
#Now,default_payment with age groups
#more number of age-group(20-30) clients default
group_age1.plot(kind='bar',x='age',y='default',title='Age with Default',figsize=(10,5))
plt.ylabel('default')
plt.tight_layout()
plt.savefig('age_default.jpg')

# We found that some cardholders have negative balance. This means that some cardholders are paying the bank more than their balance or some transaction of purchase may have been refunded to the credit card.

In [None]:
#to check the updated value using boxplot
new_df.boxplot(balance,figsize=(10,5))
plt.savefig('balance_box.jpg')

# When we compared balance column with paid column for the month of september, we saw that there is a positive linear relationship between the balance and paid columns except for some balances. Looking at the plot, those balances may be paid by automatic payment every month.

In [None]:
g = sns.FacetGrid(new_df, row='education', col="default_payment", hue='gender', size=4)
g.map(plt.scatter,  "balance_september", "paid_september", alpha=0.5, edgecolor='k', linewidth=0.5, s=new_df['marital_status']*10)
fig = g.fig 
fig.set_size_inches(20, 20)
fig.subplots_adjust(top=0.85, wspace=0.3)
fig.suptitle('Paid_september - balance_september - Education - default_payment - gender - marital_status', fontsize=14)
l = g.add_legend(title='Gender')
plt.savefig('paid_balance_scatter.jpg')

In [None]:
#group the data by marital_status and gender to see who has maximum default
group_marital_gender = new_df.groupby(['marital_status','gender'])['default_payment'].sum().reset_index(name='default')
group_marital_gender

# From the analysis and visualizing the dataset, we found that the maximum number of defaulter are female, with University level education, and age between 20 and 30.

In [None]:
plt.figure(figsize=(10,5))
#draw a barplot to check if female has the higher default rate
sns.barplot(x='marital_status',y='default',hue='gender',data=group_marital_gender)
plt.title('Marital Status and Gender')
plt.tight_layout()
plt.savefig('marital_gender_default.jpg')

In [None]:
#now group the data by age_bin and gender
group_age_gender = new_df.groupby(['age_bin','gender'])['default_payment'].sum().reset_index(name='default')
group_age_gender

# When we compared male and female for each age group with default, we saw that more female of age group 20-30 default more, and both older male and older female after age group 50-80 default less.

In [None]:
plt.figure(figsize=(10,5))
sns.barplot(x='age_bin',y='default',hue='gender',data=group_age_gender)
plt.title('Age Group and Gender')
plt.tight_layout()
plt.savefig('age_group_gender_default.jpg')

In [None]:
group_gender_education = new_df.groupby(['gender','education'])['default_payment'].sum().reset_index(name='default')
group_gender_education.set_index(['gender','education'],inplace=True)
group_gender_education.sort_values('default',ascending=False).head(3)

# When we group gender and education column, we see that 2,2 or Female with University level education has the maximum default.

In [None]:
group_gender_education['default'].plot(kind='bar',title='Gender and Education with Total Population',figsize=(10,5))
plt.ylabel('Default')
plt.tight_layout()
plt.savefig('gender_education_default.jpg')

In [None]:
group_gender_education_marital = new_df.groupby(['gender','education','marital_status'])['default_payment'].sum().reset_index(name='default')
group_gender_education_marital.set_index(['gender','education','marital_status'],inplace=True)
group_gender_education_marital.sort_values('default',ascending=False).head(3)

# When we compared gender, education and marital status with default, we found that a female, university level, and married cardholder has the maximum default.

In [None]:
group_gender_education_marital['default'].plot(kind='bar',title='Combined Gender, Education, Marital Status',figsize=(10,5))
plt.ylabel('Default')
plt.tight_layout()
plt.savefig('gender_education_marital_default.jpg')

# Inferential statistics

In [None]:
# Create two dataframes: m for male and f for female 
m = new_df[new_df.gender==1]
f = new_df[new_df.gender==2]

In [None]:
#t-statistics
#H0: p(m.default_payment) = p(pop_f.default_payment)
#Ha: p(m.default_payment) != p(pop_f.default_payment)

# calculate the sample mean of male and female sample default_payment
sample_p_m = np.mean(m.default_payment)
sample_p_f = np.mean(f.default_payment)
# calculate the difference of sample mean of male and female
diff_p = sample_p_m - sample_p_f
# calculate the size of sample male and female
n_m = len(m.default_payment)
n_f = len(f.default_payment)
# calculate the variance of male and female
variance_m = np.var(m.default_payment)
variance_f = np.var(f.default_payment)
#Calculate the t-value
t= diff_p / np.sqrt((variance_m/n_m)+(variance_f/n_f))
t

In [None]:
#calculate the degree of freedom
dof = n_m + n_f -2
dof

In [None]:
# We assume equal population variance
standard_error = np.sqrt(((n_m-1)*variance_m + (n_f-1)*variance_f) / (n_m+n_f-2))*(np.sqrt((1/n_m)+(1/n_f)))
#the critical t-value is 1.960 for degree of freedom 29998 and alpha 0.05
margin_of_error = 1.96* standard_error
margin_of_error

In [None]:
#Calculate the confidence interval for 95%
conf_int = [diff_p - margin_of_error, diff_p + margin_of_error]
conf_int

In [None]:
from scipy import stats

#calculation p-value for 2-tailed test. 
p_value = stats.t.sf(t, dof)*2
print('p-value is ',p_value)

The p-value is less than level of significance 0.05, so we reject the null hypothesis. 

In [None]:
# Now, use the bootstrap method
#Bootstrap replicate function to resample data and find mean/std
def bootstrap_replicate_1d(data, func):
    return func(np.random.choice(data, size=len(data)))

#draw many bootstrap replicates 
def draw_bs_reps(data, func, size=1):
    return np.array([bootstrap_replicate_1d(data, func) for _ in range(size)])

mean_diff = np.mean(m.default_payment) - np.mean(f.default_payment)
#get bootstrap replicates of data sets
bs_replicates_m = draw_bs_reps(m.default_payment, np.mean, size=10000)
bs_replicates_f = draw_bs_reps(f.default_payment, np.mean, size=10000)
#compute replicates of difference of means: bs_diff_replicates
bs_diff_replicates = bs_replicates_m - bs_replicates_f
conf_interval = np.percentile(bs_diff_replicates, [2.5, 97.5])
conf_interval

In [None]:
# To find the p-value
combined_mean = np.mean(new_df.default_payment)
#shift the samples
m_shifted = m.default_payment - np.mean(m.default_payment) + combined_mean
f_shifted = f.default_payment - np.mean(f.default_payment) + combined_mean
#get bootstrap replicates of shifted data sets
bs_replicates_m = draw_bs_reps(m_shifted, np.mean, size=10000)
bs_replicates_f = draw_bs_reps(f_shifted, np.mean, size=10000)
#compute replicates of difference of means:
bs_diff_replicates = bs_replicates_m - bs_replicates_f
#compute the p-value
p = np.sum(bs_diff_replicates >= mean_diff) / len(bs_diff_replicates)
p

'''P-value is less than 0.05, we will reject the null hypothesis. z-score 6.921 is more extreme than the threshold of 1.96, so we will reject the null hypothesis. The gender is one important factor for default payment.

# Chi-squared Test

# For: Gender

H0: Gender and default_payment are independent.

Ha: Gender and default_payment are dependent.  

alpha = 0.05

df = (rows-1)*(columns-1)
df = (2-1) * (2-1)
df = 1

critical chi squared value = 3.84146

In [None]:
chi_squared_test = new_df.groupby(['default_payment','gender']).count()['id'].unstack()
chi_squared_test


                  Gender		Male		    Female		Total

         default_Payment	

                     No		9015		    14349		23364 (0.7788)

            Expected NO	    9258.37	     14105.62
            
                    Yes		2873		    3763		6636 (0.2212)

           Expected Yes	   2629.62          4006.37

                  Total		11888 (0.3963)	18112 (0.6037)	30000

level of significance = 0.05

Chi squared value = (9015-9258.37)squared/9258.37 + (14349-14105.62)squared/14105.62 + (2873-2629.62)squared/2629.62 + (3763-4006.37)squared/4006.37

= 47.90

DF = 1 

In [None]:
from scipy.stats import chisquare

In [None]:
from scipy import stats

In [None]:
observed = [[9015, 14349], [2873, 3763]]
chi2, p, dof, expected = stats.chi2_contingency(observed)
print('chi2:',chi2)
print('DF:',dof)
print('p-value:',p)
print(expected)

The chi squared statistics is 47.71
P-value is <0.001
The result is significant at p<0.05
We reject the null hypothesis and suggest the alternative hypothesis.
The default_payment is dependent on gender.

# For Education:

H0: Education and default_payment are not related.

Ha: Education and default_payment are related. 

In [None]:
chi_squared_test_education = new_df.groupby(['default_payment','education']).count()['id'].unstack()
chi_squared_test_education

In [None]:
obs = [[8549,10700,3680,435],[2036,3330,1237,33]]
chi2, p, dof, expected = stats.chi2_contingency(obs)
print('chi2:',chi2)
print('DF:',dof)
print('p-value:',p)
print(expected)

The chi squared test p-value is < 0.001
The result is significant at p < 0.05
We reject the null hypothesis and suggest the alternative hypothesis.
The default_payment is dependent on education. 

# For Marital_status:

H0: Marital_status and default_payment are not related.

Ha: Marital_status and default_payment are related.

In [None]:
chi_squared_test_marital_status = new_df.groupby(['default_payment','marital_status']).count()['id'].unstack()
chi_squared_test_marital_status

In [None]:
obs = [[10453,12623,288],[3206,3341,89]]
chi2, p, dof, expected = stats.chi2_contingency(obs)
print('chi2:',chi2)
print('DF:',dof)
print('p-value:',p)
print(expected)

The chi squared test p-value is < 0.001
The result is significant at p < 0.05
We reject the null hypothesis and suggest the alternative hypothesis.
The default_payment is dependent on marital_status.

 # For Age-group:

H0: Age-group and default_payment are not related.

Ha: Age-group and default_payment are related.

In [None]:
chi_squared_test_age = new_df.groupby(['default_payment','age_bin']).count()['id'].unstack()
chi_squared_test_age

In [None]:
obs = [[8542,8524,4606,1493,189,10],[2471,2189,1399,504,68,5]]
chi2, p, dof, expected = stats.chi2_contingency(obs)
print('chi2:',chi2)
print('DF:',dof)
print('p-value:',p)
print(expected)

The chi squared test p-value is < 0.001
The result is significant at p < 0.05
We reject the null hypothesis and suggest the alternative hypothesis.
The default_payment is dependent on age.

# Machine Learning

Logistic Regression

Decision Tree

Gaussian Naive Bayes Classifier

Random Forest Classifier

Support Vector Machine

In [None]:
#create dummies features for all the categorical columns data
new_df = pd.get_dummies(new_df, columns=['gender','education', 'marital_status'], drop_first=True)

In [None]:
#for status of the client
pay_features = ['status_september','status_august','status_july','status_june', 'status_may','status_april',]
for p in pay_features:
    new_df.loc[new_df[p]<=0, p] = 0
    
new_df = new_df.drop('age_bin', axis=1)

In [None]:
#import necessary packages from scikit learn

#Import LogisticRegression
from sklearn.linear_model import LogisticRegression
#Import DecisionTree
from sklearn.tree import DecisionTreeClassifier
#Import GaussianNB
from sklearn.naive_bayes import GaussianNB
#Import RandomForest Classifier
from sklearn.ensemble import RandomForestClassifier
#Import Support Vector Machine
from sklearn import svm

# import cross_val_score to evaluate the score by cross-validation
from sklearn.model_selection import cross_val_score

# import train_test_split to split data into training and testing set
from sklearn.model_selection import train_test_split
# import different metrics we will use to evaluate the models
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler, NearMiss, CondensedNearestNeighbour
from imblearn.under_sampling import EditedNearestNeighbours, RepeatedEditedNearestNeighbours, TomekLinks
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.ensemble import BalanceCascade, EasyEnsemble
from sklearn.ensemble import AdaBoostClassifier
import warnings
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from time import time
from scipy.stats import randint as sp_randint

In [None]:
#X is the features/data we use for our model (input data)
X = new_df.drop('default_payment',axis=1)
#scale all our data using robust scaler
robust_scaler = RobustScaler()
X = robust_scaler.fit_transform(X)
#y is the label of our data
y = new_df['default_payment']

In [None]:
#Create a train test split of the data with test size 30%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=123, stratify=y)

Our dataset has class imbalance which needs to be taken cared to build a better model which would not over/underfit when learning for classification. Let us handle class imbalance before we start building different model.

# Taking care of Class Imbalance
Balancing the class weight

Modify the dataset (resampling)

Ensemble methods

In [None]:
#Create a dataframe to hold the recall score from different resampling techniques
resampled_score = pd.DataFrame(columns=['method','recall','precision','f1_score','AUC'])

In [None]:
# Instantiate a logistic regression classifier: logreg
model = LogisticRegression(solver='lbfgs')

# Setup the hyperparameter grid
param_dist = {'C': [0.001,0.01,0.1,1,10,100,1000]}

# Instantiate the RandomizedSearchCV object: logreg_cv
model_cv = RandomizedSearchCV(model, param_distributions=param_dist, cv=5, n_iter=4, random_state=0)

start = time()

# Fit it to the data
model_cv.fit(X_train,y_train)

print('RandomizedSearchCV took %.2f seconds for %d candidates'
      ' parameter settings.' % ((time() - start), model_cv.n_iter))

y_pred_test = model_cv.predict(X_test)

#to calculate the auc score
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred_test)
roc_auc = auc(false_positive_rate, true_positive_rate)
#calculate other metrics scores
accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
precision = precision_score(y_pred=y_pred_test, y_true=y_test)
recall = recall_score(y_pred=y_pred_test,y_true=y_test)
f1score = f1_score(y_true=y_test, y_pred=y_pred_test)

resampled_score = resampled_score.append({'method': 'not resampled','recall':recall,'precision':precision, 'f1_score':f1score, 'AUC':roc_auc}, ignore_index=True)

# Print the tuned parameters and score
print('Tuned Logistic Regression Parameters: {}'.format(model_cv.best_params_))
print('Best score is {0:.4f}'.format(model_cv.best_score_))
print('AUC Score: {0:.4f}'.format(roc_auc))
print('Accuracy Score: {0:.4f}'.format(accuracy))
print('Precision Score: {0:.4f}'.format(precision))
print('Recall Score: {0:.4f}'.format(recall))
print('f1 score: {0:.4f}'.format(f1score))

In [None]:
print(classification_report(y_test, y_pred_test))

# Balancing the class weight

In [None]:
# Instantiate a logistic regression classifier: logreg
model = LogisticRegression(class_weight='balanced', solver='lbfgs')

# Setup the hyperparameter grid
param_dist = {'C': [0.001,0.01,0.1,1,10,100,1000]}

# Instantiate the RandomizedSearchCV object: logreg_cv
balanced_model_cv = RandomizedSearchCV(model, param_distributions=param_dist, cv=5, n_iter=4, random_state=0)

start = time()

# Fit it to the data
balanced_model_cv.fit(X_train,y_train)

print('RandomizedSearchCV took %.2f seconds for %d candidates'
      ' parameter settings.' % ((time() - start), model_cv.n_iter))

y_pred_test = balanced_model_cv.predict(X_test)

#to calculate the auc score
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred_test)
roc_auc = auc(false_positive_rate, true_positive_rate)
#calculate other metrics scores
accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
precision = precision_score(y_pred=y_pred_test, y_true=y_test)
recall = recall_score(y_pred=y_pred_test,y_true=y_test)
f1score = f1_score(y_true=y_test, y_pred=y_pred_test)

resampled_score = resampled_score.append({'method': 'Balancing class weight','recall':recall,'precision':precision, 'f1_score':f1score, 'AUC':roc_auc}, ignore_index=True)


# Print the tuned parameters and score
print('Tuned Logistic Regression with Class weight balanced Parameters: {}'.format(balanced_model_cv.best_params_))
print('Best score is {0:.4f}'.format(balanced_model_cv.best_score_))
print('AUC Score: {0:.4f}'.format(roc_auc))
print('Accuracy Score: {0:.4f}'.format(accuracy))
print('Precision Score: {0:.4f}'.format(precision))
print('Recall Score: {0:.4f}'.format(recall))
print('f1 score: {0:.4f}'.format(f1score))

In [None]:
print(classification_report(y_test, y_pred_test))

# Now Undersampling

In [None]:
#Random Undersampling
us = RandomUnderSampler(ratio=0.5, random_state=1)
X_train_res, y_train_res = us.fit_sample(X_train, y_train)

print('Distribution of class labels before resampling {}'.format(Counter(y_train)))
print('Distribution of class labels before resampling {}'.format(Counter(y_train_res)))

clf_base = LogisticRegression(solver='lbfgs')
grid = {'C': 10.0**np.arange(-2,3)}

#cv = KFold(n_splits=5, shuffle=True, random_state=0)
clf = GridSearchCV(clf_base, grid, cv=5, scoring='f1')

clf.fit(X_train_res, y_train_res)

y_pred_test = clf.predict(X_test)
#to calculate the auc score
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred_test)
roc_auc = auc(false_positive_rate, true_positive_rate)
#calculate other metrics scores
accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
precision = precision_score(y_pred=y_pred_test, y_true=y_test)
recall = recall_score(y_pred=y_pred_test,y_true=y_test)
f1score = f1_score(y_true=y_test, y_pred=y_pred_test)

resampled_score = resampled_score.append({'method': 'RandomUnderSampler','recall':recall, 'precision':precision, 'f1_score':f1score, 'AUC':roc_auc}, ignore_index=True)

# Print the tuned parameters and score
print('Tuned Logistic Regression with RandomUnderSampler Parameters: {}'.format(clf.best_params_))
print('Best score is {0:.4f}'.format(clf.best_score_))
print('AUC Score: {0:.4f}'.format(roc_auc))
print('Accuracy Score: {0:.4f}'.format(accuracy))
print('Precision Score: {0:.4f}'.format(precision))
print('Recall Score: {0:.4f}'.format(recall))
print('f1 score: {0:.4f}'.format(f1score))

In [None]:
#NEARMISS-1
us = NearMiss(ratio=0.5, version=1, random_state=1)
X_train_res, y_train_res = us.fit_sample(X_train, y_train)

print('Distribution of class labels before resampling {}'.format(Counter(y_train)))
print('Distribution of class labels after resampling {}'.format(Counter(y_train_res)))

clf_base = LogisticRegression(solver='lbfgs')
grid = {'C': 10.0 ** np.arange(-2,3)}

clf = GridSearchCV(clf_base, grid, cv=5, scoring='f1')

clf.fit(X_train_res, y_train_res)

y_pred_test = clf.predict(X_test)

#to calculate the auc score
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred_test)
roc_auc = auc(false_positive_rate, true_positive_rate)
#calculate other metrics scores
accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
precision = precision_score(y_pred=y_pred_test, y_true=y_test)
recall = recall_score(y_pred=y_pred_test,y_true=y_test)
f1score = f1_score(y_true=y_test, y_pred=y_pred_test)

#add the recall score to the dataframe resampled_score
resampled_score = resampled_score.append({'method': 'NearMiss1','recall':recall, 'precision':precision, 'f1_score':f1score, 'AUC':roc_auc}, ignore_index=True)

# Print the tuned parameters and score
print('Tuned Logistic Regression with NearMiss1 Parameters: {}'.format(clf.best_params_))
print('Best score is {0:.4f}'.format(clf.best_score_))
print('AUC Score: {0:.4f}'.format(roc_auc))
print('Accuracy Score: {0:.4f}'.format(accuracy))
print('Precision Score: {0:.4f}'.format(precision))
print('Recall Score: {0:.4f}'.format(recall))
print('f1 score: {0:.4f}'.format(f1score))

In [None]:
#NEARMISS-2
us = NearMiss(ratio=0.5, version=2, random_state=1)
X_train_res, y_train_res = us.fit_sample(X_train, y_train)

print('Distribution of class labels before resampling {}'.format(Counter(y_train)))
print('Distribution of class labels after resampling {}'.format(Counter(y_train_res)))

clf_base = LogisticRegression(solver='lbfgs')
grid = {'C': 10.0 ** np.arange(-2,3)}

clf = GridSearchCV(clf_base, grid, cv=5, scoring='f1')

clf.fit(X_train_res, y_train_res)

y_pred_test = clf.predict(X_test)

#to calculate the auc score
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred_test)
roc_auc = auc(false_positive_rate, true_positive_rate)
#calculate other metrics scores
accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
precision = precision_score(y_pred=y_pred_test, y_true=y_test)
recall = recall_score(y_pred=y_pred_test,y_true=y_test)
f1score = f1_score(y_true=y_test, y_pred=y_pred_test)

#add the recall score to the dataframe resampled_score
resampled_score = resampled_score.append({'method': 'NearMiss2','recall':recall, 'precision':precision,'f1_score':f1score, 'AUC':roc_auc}, ignore_index=True)

# Print the tuned parameters and score
print('Tuned Logistic Regression with NearMiss2 Parameters: {}'.format(clf.best_params_))
print('Best score is {0:.4f}'.format(clf.best_score_))
print('AUC Score: {0:.4f}'.format(roc_auc))
print('Accuracy Score: {0:.4f}'.format(accuracy))
print('Precision Score: {0:.4f}'.format(precision))
print('Recall Score: {0:.4f}'.format(recall))
print('f1 score: {0:.4f}'.format(f1score))

In [None]:
#Edited Nearest Neighbour
us = EditedNearestNeighbours(random_state=0)
X_train_res, y_train_res = us.fit_sample(X_train, y_train)

print('Distribution of class labels before resampling {}'.format(Counter(y_train)))
print('Distribution of class labels after resampling {}'.format(Counter(y_train_res)))

clf_base = LogisticRegression(solver='lbfgs')
grid = {'C': 10.0 ** np.arange(-2,3)}

clf = GridSearchCV(clf_base, grid, cv=5, scoring='f1')

clf.fit(X_train_res, y_train_res)

y_pred_test = clf.predict(X_test)

#to calculate the auc score
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred_test)
roc_auc = auc(false_positive_rate, true_positive_rate)
#calculate other metrics scores
accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
precision = precision_score(y_pred=y_pred_test, y_true=y_test)
recall = recall_score(y_pred=y_pred_test,y_true=y_test)
f1score = f1_score(y_true=y_test, y_pred=y_pred_test)

#add the recall score to the dataframe resampled_score
resampled_score = resampled_score.append({'method': 'EditedNearestNeighbour','recall':recall, 'precision':precision, 'f1_score':f1score, 'AUC':roc_auc}, ignore_index=True)

# Print the tuned parameters and score
print('Tuned Logistic Regression with EditedNearestNeighbours Parameters: {}'.format(clf.best_params_))
print('Best score is {0:.4f}'.format(clf.best_score_))
print('AUC Score: {0:.4f}'.format(roc_auc))
print('Accuracy Score: {0:.4f}'.format(accuracy))
print('Precision Score: {0:.4f}'.format(precision))
print('Recall Score: {0:.4f}'.format(recall))
print('f1 score: {0:.4f}'.format(f1score))

In [None]:
#Repeated Edited Nearest Neighbour
us = RepeatedEditedNearestNeighbours(random_state=0)
X_train_res, y_train_res = us.fit_sample(X_train, y_train)

print('Distribution of class labels before resampling {}'.format(Counter(y_train)))
print('Distribution of class labels after resampling {}'.format(Counter(y_train_res)))

clf_base = LogisticRegression(solver='lbfgs')
grid = {'C': 10.0 ** np.arange(-2,3)}

clf = GridSearchCV(clf_base, grid, cv=5, scoring='f1')

clf.fit(X_train_res, y_train_res)

y_pred_test = clf.predict(X_test)

#to calculate the auc score
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred_test)
roc_auc = auc(false_positive_rate, true_positive_rate)
#calculate other metrics scores
accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
precision = precision_score(y_pred=y_pred_test, y_true=y_test)
recall = recall_score(y_pred=y_pred_test,y_true=y_test)
f1score = f1_score(y_true=y_test, y_pred=y_pred_test)

#add the recall score to the dataframe resampled_score
resampled_score = resampled_score.append({'method': 'RepeatedEditedNearestNeighbours','recall':recall,'precision':precision,'f1_score':f1score, 'AUC':roc_auc}, ignore_index=True)

# Print the tuned parameters and score
print('Tuned Logistic Regression with RepeatedEditedNearestNeighbours Parameters: {}'.format(clf.best_params_))
print('Best score is {0:.4f}'.format(clf.best_score_))
print('AUC Score: {0:.4f}'.format(roc_auc))
print('Accuracy Score: {0:.4f}'.format(accuracy))
print('Precision Score: {0:.4f}'.format(precision))
print('Recall Score: {0:.4f}'.format(recall))
print('f1 score: {0:.4f}'.format(f1score))

In [None]:
#Tomek Link Removal
us = TomekLinks(random_state=0)
X_train_res, y_train_res = us.fit_sample(X_train, y_train)

print('Distribution of class labels before resampling {}'.format(Counter(y_train)))
print('Distribution of class labels after resampling {}'.format(Counter(y_train_res)))

clf_base = LogisticRegression(solver='lbfgs')
grid = {'C': 10.0 ** np.arange(-2,3)}

clf = GridSearchCV(clf_base, grid, cv=5, scoring='f1')

clf.fit(X_train_res, y_train_res)

y_pred_test = clf.predict(X_test)

#to calculate the auc score
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred_test)
roc_auc = auc(false_positive_rate, true_positive_rate)
#calculate other metrics scores
accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
precision = precision_score(y_pred=y_pred_test, y_true=y_test)
recall = recall_score(y_pred=y_pred_test,y_true=y_test)
f1score = f1_score(y_true=y_test, y_pred=y_pred_test)

#add the recall score to the dataframe resampled_score
resampled_score = resampled_score.append({'method': 'TomekLinks','recall':recall, 'precision':precision, 'f1_score':f1score, 'AUC':roc_auc}, ignore_index=True)

# Print the tuned parameters and score
print('Tuned Logistic Regression with TomekLinks: {}'.format(clf.best_params_))
print('Best score is {0:.4f}'.format(clf.best_score_))
print('AUC Score: {0:.4f}'.format(roc_auc))
print('Accuracy Score: {0:.4f}'.format(accuracy))
print('Precision Score: {0:.4f}'.format(precision))
print('Recall Score: {0:.4f}'.format(recall))
print('f1 score: {0:.4f}'.format(f1score))

# Oversampling methods

Number of minority class data will increase by copying the data we already have.

In [None]:
#Random Oversampling
os = RandomOverSampler(ratio=0.5,random_state=0)
X_train_res, y_train_res = os.fit_sample(X_train, y_train)

print('Distribution of class labels before resampling {}'.format(Counter(y_train)))
print('Distribution of class labels after resampling {}'.format(Counter(y_train_res)))

clf_base = LogisticRegression(solver='lbfgs')
grid = {'C': 10.0 ** np.arange(-2,3)}

clf = GridSearchCV(clf_base, grid, cv=5, scoring='f1')

clf.fit(X_train_res, y_train_res)

y_pred_test = clf.predict(X_test)

#to calculate the auc score
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred_test)
roc_auc = auc(false_positive_rate, true_positive_rate)
#calculate other metrics scores
accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
precision = precision_score(y_pred=y_pred_test, y_true=y_test)
recall = recall_score(y_pred=y_pred_test,y_true=y_test)
f1score = f1_score(y_true=y_test, y_pred=y_pred_test)

#add the recall score to the dataframe resampled_score
resampled_score = resampled_score.append({'method': 'RandomOverSampler','recall':recall, 'precision':precision, 'f1_score':f1score,'AUC':roc_auc}, ignore_index=True)

# Print the tuned parameters and score
print('Tuned Logistic Regression with RandomOverSampler Parameters: {}'.format(clf.best_params_))
print('Best score is {0:.4f}'.format(clf.best_score_))
print('AUC Score: {0:.4f}'.format(roc_auc))
print('Accuracy Score: {0:.4f}'.format(accuracy))
print('Precision Score: {0:.4f}'.format(precision))
print('Recall Score: {0:.4f}'.format(recall))
print('f1 score: {0:.4f}'.format(f1score))

In [None]:
#SMOTE
os = SMOTE(ratio=0.5, random_state=0)
X_train_res, y_train_res = os.fit_sample(X_train, y_train)

print('Distribution of class labels before resampling {}'.format(Counter(y_train)))
print('Distribution of class labels after resampling {}'.format(Counter(y_train_res)))

clf_base = LogisticRegression(solver='lbfgs')
grid = {'C': 10.0 ** np.arange(-2,3)}

clf = GridSearchCV(clf_base, grid, cv=5, scoring='f1')

clf.fit(X_train_res, y_train_res)

y_pred_test = clf.predict(X_test)

#to calculate the auc score
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred_test)
roc_auc = auc(false_positive_rate, true_positive_rate)
#calculate other metrics scores
accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
precision = precision_score(y_pred=y_pred_test, y_true=y_test)
recall = recall_score(y_pred=y_pred_test,y_true=y_test)
f1score = f1_score(y_true=y_test, y_pred=y_pred_test)

#add the recall score to the dataframe resampled_score
resampled_score = resampled_score.append({'method': 'SMOTE','recall':recall, 'precision':precision, 'f1_score':f1score, 'AUC':roc_auc}, ignore_index=True)

# Print the tuned parameters and score
print('Tuned Logistic Regression with SMOTE Parameters: {}'.format(clf.best_params_))
print('Best score is {0:.4f}'.format(clf.best_score_))
print('AUC Score: {0:.4f}'.format(roc_auc))
print('Accuracy Score: {0:.4f}'.format(accuracy))
print('Precision Score: {0:.4f}'.format(precision))
print('Recall Score: {0:.4f}'.format(recall))
print('f1 score: {0:.4f}'.format(f1score))

Ensemble methods for handling class imabalance is combining two methods of handling class imbalance. 

# Combinations
SMOTE + Tomek Link Removal

SMOTE + ENN

In [None]:
#SMOTE + Tomek link removal
os_us = SMOTETomek(ratio=0.5, random_state=0)
X_train_res, y_train_res = os_us.fit_sample(X_train, y_train)

print('Distribution of class labels before resampling {}'.format(Counter(y_train)))
print('Distribution of class labels after resampling {}'.format(Counter(y_train_res)))

clf_base = LogisticRegression(solver='lbfgs')
grid = {'C': 10.0 ** np.arange(-2,3)}

clf = GridSearchCV(clf_base, grid, cv=5, scoring='f1')

clf.fit(X_train_res, y_train_res)

y_pred_test = clf.predict(X_test)

#to calculate the auc score
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred_test)
roc_auc = auc(false_positive_rate, true_positive_rate)
#calculate other metrics scores
accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
precision = precision_score(y_pred=y_pred_test, y_true=y_test)
recall = recall_score(y_pred=y_pred_test,y_true=y_test)
f1score = f1_score(y_true=y_test, y_pred=y_pred_test)

#add the recall score to the dataframe resampled_score
resampled_score = resampled_score.append({'method': 'SMOTETomek','recall':recall, 'precision':precision, 'f1_score':f1score, 'AUC':roc_auc}, ignore_index=True)

# Print the tuned parameters and score
print('Tuned Logistic Regression with SMOTETomek Parameters: {}'.format(clf.best_params_))
print('Best score is {0:.4f}'.format(clf.best_score_))
print('AUC Score: {0:.4f}'.format(roc_auc))
print('Accuracy Score: {0:.4f}'.format(accuracy))
print('Precision Score: {0:.4f}'.format(precision))
print('Recall Score: {0:.4f}'.format(recall))
print('f1 score: {0:.4f}'.format(f1score))

In [None]:
#SMOTE + ENN
os_us = SMOTEENN(ratio=0.5, random_state=0)
X_train_res, y_train_res = os_us.fit_sample(X_train, y_train)

print('Distribution of class labels before resampling {}'.format(Counter(y_train)))
print('Distribution of class labels after resampling {}'.format(Counter(y_train_res)))

clf_base = LogisticRegression(solver='lbfgs')
grid = {'C': 10.0 ** np.arange(-2,3)}

clf = GridSearchCV(clf_base, grid, cv=5, scoring='f1')

clf.fit(X_train_res, y_train_res)

y_pred_test = clf.predict(X_test)

#to calculate the auc score
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred_test)
roc_auc = auc(false_positive_rate, true_positive_rate)
#calculate other metrics scores
accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
precision = precision_score(y_pred=y_pred_test, y_true=y_test)
recall = recall_score(y_pred=y_pred_test,y_true=y_test)
f1score = f1_score(y_true=y_test, y_pred=y_pred_test)

#add the recall score to the dataframe resampled_score
resampled_score = resampled_score.append({'method': 'SMOTEENN','recall':recall, 'precision':precision, 'f1_score':f1score, 'AUC':roc_auc}, ignore_index=True)

# Print the tuned parameters and score
print('Tuned Logistic Regression with SMOTEENN Parameters: {}'.format(clf.best_params_))
print('Best score is {0:.4f}'.format(clf.best_score_))
print('AUC Score: {0:.4f}'.format(roc_auc))
print('Accuracy Score: {0:.4f}'.format(accuracy))
print('Precision Score: {0:.4f}'.format(precision))
print('Recall Score: {0:.4f}'.format(recall))
print('f1 score: {0:.4f}'.format(f1score))

In [None]:
resampled_score.sort_values(by='f1_score', ascending=False)

According to the F1 Score of the classification model, the SMOTE+ENN method to handle the class imbalance performed the best. So we will use SMOTE+ENN to handle class imbalance, then compare different models with each other.

In [None]:
#Preparing a dataframe for model analysis
#Data frame for evaluation metrics
smoteenn_resampled_metrics = pd.DataFrame(index=['roc_auc', 'accuracy','precision','recall','f1'],columns=['LogisticReg','DecisionTree','GaussianNB','RandomForest','SVM'])

# SMOTE+ENN with Logistic Regression

In [None]:
#SMOTE + ENN
os_us = SMOTEENN(ratio=0.5, random_state=0)
X_train_res, y_train_res = os_us.fit_sample(X_train, y_train)

print('Distribution of class labels before resampling {}'.format(Counter(y_train)))
print('Distribution of class labels after resampling {}'.format(Counter(y_train_res)))

clf = LogisticRegression(solver='lbfgs')

clf.fit(X_train_res, y_train_res)

y_pred_test = clf.predict(X_test)

#to calculate the auc score
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred_test)
roc_auc = auc(false_positive_rate, true_positive_rate)
#calculate other metrics scores
accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
precision = precision_score(y_pred=y_pred_test, y_true=y_test)
recall = recall_score(y_pred=y_pred_test,y_true=y_test)
f1score = f1_score(y_true=y_test, y_pred=y_pred_test)

#entering tuned metrics score to the dataframe
smoteenn_resampled_metrics.loc['accuracy','LogisticReg'] = accuracy
smoteenn_resampled_metrics.loc['precision','LogisticReg'] = precision
smoteenn_resampled_metrics.loc['recall','LogisticReg'] = recall
smoteenn_resampled_metrics.loc['roc_auc','LogisticReg'] = roc_auc
smoteenn_resampled_metrics.loc['f1','LogisticReg'] = f1score

# Print different metrics score
print('AUC Score: {0:.4f}'.format(roc_auc))
print('Accuracy Score: {0:.4f}'.format(accuracy))
print('Precision Score: {0:.4f}'.format(precision))
print('Recall Score: {0:.4f}'.format(recall))
print('f1 score: {0:.4f}'.format(f1score))

# SMOTE+ENN with DecisionTree Classifier

In [None]:
#SMOTE + ENN
clf = DecisionTreeClassifier()

clf.fit(X_train_res, y_train_res)

y_pred_test = clf.predict(X_test)

#to calculate the auc score
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred_test)
roc_auc = auc(false_positive_rate, true_positive_rate)
#calculate other metrics scores
accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
precision = precision_score(y_pred=y_pred_test, y_true=y_test)
recall = recall_score(y_pred=y_pred_test,y_true=y_test)
f1score = f1_score(y_true=y_test, y_pred=y_pred_test)

#entering tuned metrics score to the dataframe
smoteenn_resampled_metrics.loc['accuracy','DecisionTree'] = accuracy
smoteenn_resampled_metrics.loc['precision','DecisionTree'] = precision
smoteenn_resampled_metrics.loc['recall','DecisionTree'] = recall
smoteenn_resampled_metrics.loc['roc_auc','DecisionTree'] = roc_auc
smoteenn_resampled_metrics.loc['f1','DecisionTree'] = f1score

# Print different metrics score
print('AUC Score: {0:.4f}'.format(roc_auc))
print('Accuracy Score: {0:.4f}'.format(accuracy))
print('Precision Score: {0:.4f}'.format(precision))
print('Recall Score: {0:.4f}'.format(recall))
print('f1 score: {0:.4f}'.format(f1score))

# SMOTE+ENN with GaussianNB

In [None]:
#SMOTE + ENN
clf = GaussianNB()

clf.fit(X_train_res, y_train_res)

y_pred_test = clf.predict(X_test)

#to calculate the auc score
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred_test)
roc_auc = auc(false_positive_rate, true_positive_rate)
#calculate other metrics scores
accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
precision = precision_score(y_pred=y_pred_test, y_true=y_test)
recall = recall_score(y_pred=y_pred_test,y_true=y_test)
f1score = f1_score(y_true=y_test, y_pred=y_pred_test)

#entering tuned metrics score to the dataframe
smoteenn_resampled_metrics.loc['accuracy','GaussianNB'] = accuracy
smoteenn_resampled_metrics.loc['precision','GaussianNB'] = precision
smoteenn_resampled_metrics.loc['recall','GaussianNB'] = recall
smoteenn_resampled_metrics.loc['roc_auc','GaussianNB'] = roc_auc
smoteenn_resampled_metrics.loc['f1','GaussianNB'] = f1score

# Print the different metrics score
print('AUC Score: {0:.4f}'.format(roc_auc))
print('Accuracy Score: {0:.4f}'.format(accuracy))
print('Precision Score: {0:.4f}'.format(precision))
print('Recall Score: {0:.4f}'.format(recall))
print('f1 score: {0:.4f}'.format(f1score))

# SMOTE+ENN with RandomForestClassifier

In [None]:
#SMOTE + ENN
clf = RandomForestClassifier()

clf.fit(X_train_res, y_train_res)

y_pred_test = clf.predict(X_test)

#to calculate the auc score
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred_test)
roc_auc = auc(false_positive_rate, true_positive_rate)
#calculate other metrics scores
accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
precision = precision_score(y_pred=y_pred_test, y_true=y_test)
recall = recall_score(y_pred=y_pred_test,y_true=y_test)
f1score = f1_score(y_true=y_test, y_pred=y_pred_test)

#entering tuned metrics score to the dataframe
smoteenn_resampled_metrics.loc['accuracy','RandomForest'] = accuracy
smoteenn_resampled_metrics.loc['precision','RandomForest'] = precision
smoteenn_resampled_metrics.loc['recall','RandomForest'] = recall
smoteenn_resampled_metrics.loc['roc_auc','RandomForest'] = roc_auc
smoteenn_resampled_metrics.loc['f1','RandomForest'] = f1score

# Print different metrics score
print('AUC Score: {0:.4f}'.format(roc_auc))
print('Accuracy Score: {0:.4f}'.format(accuracy))
print('Precision Score: {0:.4f}'.format(precision))
print('Recall Score: {0:.4f}'.format(recall))
print('f1 score: {0:.4f}'.format(f1score))

# SMOTE+ENN with SVM

In [None]:
#SMOTE + ENN
clf = svm.SVC(kernel='linear')

clf.fit(X_train_res, y_train_res)

y_pred_test = clf.predict(X_test)

#to calculate the auc score
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred_test)
roc_auc = auc(false_positive_rate, true_positive_rate)
#calculate other metrics scores
accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
precision = precision_score(y_pred=y_pred_test, y_true=y_test)
recall = recall_score(y_pred=y_pred_test,y_true=y_test)
f1score = f1_score(y_true=y_test, y_pred=y_pred_test)

#entering tuned metrics score to the dataframe
smoteenn_resampled_metrics.loc['accuracy','SVM'] = accuracy
smoteenn_resampled_metrics.loc['precision','SVM'] = precision
smoteenn_resampled_metrics.loc['recall','SVM'] = recall
smoteenn_resampled_metrics.loc['roc_auc','SVM'] = roc_auc
smoteenn_resampled_metrics.loc['f1','SVM'] = f1score

# Print different metrics score
print('AUC Score: {0:.4f}'.format(roc_auc))
print('Accuracy Score: {0:.4f}'.format(accuracy))
print('Precision Score: {0:.4f}'.format(precision))
print('Recall Score: {0:.4f}'.format(recall))
print('f1 score: {0:.4f}'.format(f1score))

In [None]:
100*smoteenn_resampled_metrics

Logistic Regression has the best f1 score compared to all other classification model, so I will choose Logistic Regression for this project. Since the F1-score is significantly better than a random classifier, I would recommend this Logistic Regression model to predict credit card default. Let's try to tune all our model to see if we can improve the performance and then decide which classification model would be the best. 

# Hyperparameter Tuning for model performance improvement

In [None]:
#Preparing a dataframe for model analysis
#Data frame for evaluation metrics
smoteenn_resampled_tuned_metrics = pd.DataFrame(index=['roc_auc', 'accuracy','precision','recall','f1'],columns=['LogisticReg','DecisionTree','GaussianNB','RandomForest','SVM'])

# SMOTE+ENN with Logistic Regression

In [None]:
#SMOTE + ENN
os_us = SMOTEENN(ratio=0.5, random_state=0)
X_train_res, y_train_res = os_us.fit_sample(X_train, y_train)

print('Distribution of class labels before resampling {}'.format(Counter(y_train)))
print('Distribution of class labels after resampling {}'.format(Counter(y_train_res)))

clf_base = LogisticRegression(solver='lbfgs')
grid = {'C': 10.0 ** np.arange(-2,3)}

clf = GridSearchCV(clf_base, grid, cv=5, scoring='f1')

clf.fit(X_train_res, y_train_res)

y_pred_test = clf.predict(X_test)

#to calculate the auc score
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred_test)
roc_auc = auc(false_positive_rate, true_positive_rate)
#calculate other metrics scores
accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
precision = precision_score(y_pred=y_pred_test, y_true=y_test)
recall = recall_score(y_pred=y_pred_test,y_true=y_test)
f1score = f1_score(y_true=y_test, y_pred=y_pred_test)

#entering tuned metrics score to the dataframe
smoteenn_resampled_tuned_metrics.loc['accuracy','LogisticReg'] = accuracy
smoteenn_resampled_tuned_metrics.loc['precision','LogisticReg'] = precision
smoteenn_resampled_tuned_metrics.loc['recall','LogisticReg'] = recall
smoteenn_resampled_tuned_metrics.loc['roc_auc','LogisticReg'] = roc_auc
smoteenn_resampled_tuned_metrics.loc['f1','LogisticReg'] = f1score

# Print the tuned parameters and score
print('Tuned Logistic Regression with SMOTE+ENN Parameters: {}'.format(clf.best_params_))
print('Best score is {0:.4f}'.format(clf.best_score_))
print('AUC Score: {0:.4f}'.format(roc_auc))
print('Accuracy Score: {0:.4f}'.format(accuracy))
print('Precision Score: {0:.4f}'.format(precision))
print('Recall Score: {0:.4f}'.format(recall))
print('f1 score: {0:.4f}'.format(f1score))

# SMOTE+ENN with Decision Tree Classifier

In [None]:
clf_base = DecisionTreeClassifier()

# Setup the parameters and distributions to sample from: param_dist
param_grid = {"max_depth": range(1,10),
             'max_features': range(1,10)}

clf = RandomizedSearchCV(clf_base, param_grid, cv=5, n_iter=7, scoring='f1')

clf.fit(X_train_res, y_train_res)

#to store the predicted labels
y_pred_test = clf.predict(X_test)

#to calculate the auc score
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred_test)
roc_auc = auc(false_positive_rate, true_positive_rate)
#calculate other metrics scores
accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
precision = precision_score(y_pred=y_pred_test, y_true=y_test)
recall = recall_score(y_pred=y_pred_test,y_true=y_test)
f1score = f1_score(y_true=y_test, y_pred=y_pred_test)

#entering tuned metrics score to the dataframe
smoteenn_resampled_tuned_metrics.loc['accuracy','DecisionTree'] = accuracy
smoteenn_resampled_tuned_metrics.loc['precision','DecisionTree'] = precision
smoteenn_resampled_tuned_metrics.loc['recall','DecisionTree'] = recall
smoteenn_resampled_tuned_metrics.loc['roc_auc','DecisionTree'] = roc_auc
smoteenn_resampled_tuned_metrics.loc['f1','DecisionTree'] = f1score

# Print the tuned parameters and score
print('Tuned Decision Tree with SMOTE+ENN Parameters: {}'.format(clf.best_params_))
print('Best score is {0:.4f}'.format(clf.best_score_))
print('AUC Score: {0:.4f}'.format(roc_auc))
print('Accuracy Score: {0:.4f}'.format(accuracy))
print('Precision Score: {0:.4f}'.format(precision))
print('Recall Score: {0:.4f}'.format(recall))
print('f1 score: {0:.4f}'.format(f1score))

# SMOTE+ENN with GaussianNB

In [None]:
clf_base = GaussianNB()

# Setup the parameters and distributions to sample from: param_dist
param_dist = {'priors':[None]}

clf = GridSearchCV(clf_base, param_dist, cv=5, scoring='f1')

clf.fit(X_train_res, y_train_res)

#to store the predicted labels
y_pred_test = clf.predict(X_test)

#to calculate the auc score
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred_test)
roc_auc = auc(false_positive_rate, true_positive_rate)
#calculate other metrics scores
accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
precision = precision_score(y_pred=y_pred_test, y_true=y_test)
recall = recall_score(y_pred=y_pred_test,y_true=y_test)
f1score = f1_score(y_true=y_test, y_pred=y_pred_test)

#entering tuned metrics score to the dataframe
smoteenn_resampled_tuned_metrics.loc['accuracy','GaussianNB'] = accuracy
smoteenn_resampled_tuned_metrics.loc['precision','GaussianNB'] = precision
smoteenn_resampled_tuned_metrics.loc['recall','GaussianNB'] = recall
smoteenn_resampled_tuned_metrics.loc['roc_auc','GaussianNB'] = roc_auc
smoteenn_resampled_tuned_metrics.loc['f1','GaussianNB'] = f1score

# Print the tuned parameters and score
print('Tuned GaussianNB with SMOTE+ENN Parameters: {}'.format(clf.best_params_))
print('Best score is {0:.4f}'.format(clf.best_score_))
print('AUC Score: {0:.4f}'.format(roc_auc))
print('Accuracy Score: {0:.4f}'.format(accuracy))
print('Precision Score: {0:.4f}'.format(precision))
print('Recall Score: {0:.4f}'.format(recall))
print('f1 score: {0:.4f}'.format(f1score))

# SMOTE+ENN with Random Forest Classifier

In [None]:
clf_base = RandomForestClassifier()

# Setup the parameters and distributions to sample from: param_dist
param_dist = {'max_depth': range(1,10),
             'max_features': range(1,10),
             'n_estimators': [10,50,100,150,200,250,300,350,400,500]}

clf = RandomizedSearchCV(clf_base, param_dist, cv=5,n_jobs=8, scoring='f1')

clf.fit(X_train_res, y_train_res)

#to store the predicted labels
y_pred_test = clf.predict(X_test)

#to calculate the auc score
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred_test)
roc_auc = auc(false_positive_rate, true_positive_rate)
#calculate other metrics scores
accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
precision = precision_score(y_pred=y_pred_test, y_true=y_test)
recall = recall_score(y_pred=y_pred_test,y_true=y_test)
f1score = f1_score(y_true=y_test, y_pred=y_pred_test)

#entering tuned metrics score to the dataframe
smoteenn_resampled_tuned_metrics.loc['accuracy','RandomForest'] = accuracy
smoteenn_resampled_tuned_metrics.loc['precision','RandomForest'] = precision
smoteenn_resampled_tuned_metrics.loc['recall','RandomForest'] = recall
smoteenn_resampled_tuned_metrics.loc['roc_auc','RandomForest'] = roc_auc
smoteenn_resampled_tuned_metrics.loc['f1','RandomForest'] = f1score

# Print the tuned parameters and score
print('Tuned Random Forest with SMOTE+ENN Parameters: {}'.format(clf.best_params_))
print('Best score is {0:.4f}'.format(clf.best_score_))
print('AUC Score: {0:.4f}'.format(roc_auc))
print('Accuracy Score: {0:.4f}'.format(accuracy))
print('Precision Score: {0:.4f}'.format(precision))
print('Recall Score: {0:.4f}'.format(recall))
print('f1 score: {0:.4f}'.format(f1score))

# SMOTE+ENN with Support Vector Machine

In [None]:
clf_base = svm.SVC(kernel='linear')

# Setup the parameters and distributions to sample from: param_dist
param_dist = {'C': [0.001,0.01,0.1,1], 'gamma': [0.001,0.01,0.1,1]}

clf = RandomizedSearchCV(clf_base, param_dist, cv=5, scoring='f1')

clf.fit(X_train_res, y_train_res)

#to store the predicted labels
y_pred_test = clf.predict(X_test)

#to calculate the auc score
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred_test)
roc_auc = auc(false_positive_rate, true_positive_rate)
#calculate other metrics scores
accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
precision = precision_score(y_pred=y_pred_test, y_true=y_test)
recall = recall_score(y_pred=y_pred_test,y_true=y_test)
f1score = f1_score(y_true=y_test, y_pred=y_pred_test)

#entering tuned metrics score to the dataframe
smoteenn_resampled_tuned_metrics.loc['accuracy','SVM'] = accuracy
smoteenn_resampled_tuned_metrics.loc['precision','SVM'] = precision
smoteenn_resampled_tuned_metrics.loc['recall','SVM'] = recall
smoteenn_resampled_tuned_metrics.loc['roc_auc','SVM'] = roc_auc
smoteenn_resampled_tuned_metrics.loc['f1','SVM'] = f1score

# Print the tuned parameters and score
print('Tuned SVM with SMOTE+ENN Parameters: {}'.format(clf.best_params_))
print('Best score is {0:.4f}'.format(clf.best_score_))
print('AUC Score: {0:.4f}'.format(roc_auc))
print('Accuracy Score: {0:.4f}'.format(accuracy))
print('Precision Score: {0:.4f}'.format(precision))
print('Recall Score: {0:.4f}'.format(recall))
print('f1 score: {0:.4f}'.format(f1score))

In [None]:
100*smoteenn_resampled_tuned_metrics

In [None]:
100*smoteenn_resampled_metrics

After tuning the hyperparameter of all different model, we can see that the f1 score for Decision Tree had decreased after hyperparameter tuning, and the f1 score for Random Forest has improved and increased. Logistic Regression, GaussianNB, and SVM has not changed even after tuning. 

In [None]:
#plot a horizontal bar plot for the metrics of different classifier
fig, ax = plt.subplots(figsize=(15,8))
smoteenn_resampled_metrics.loc['f1'].plot(kind='barh', ax=ax)
ax.axvline(max(smoteenn_resampled_metrics.loc['f1']))
ax.grid()

Logistic Regression has the best f1 score compared to all other classification model, so I will choose Logistic Regression for this project. Since the F1-score is significantly better than a random classifier, I would recommend this Logistic Regression model to predict credit card default. 