In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#Importing other libraries

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the credit-card default CSV from the Kaggle input directory.

csv_path = '../input/default-of-credit-card-clients-dataset/UCI_Credit_Card.csv'
data = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

#### About the dataset 

##### Objective:- 
To predict if a customer would default on the payment of the loan next month. 

##### Columns:- 

There are 25 columns:- 

* ID: ID of each client
* LIMIT_BAL: Amount of given credit in NT dollars (includes individual and family/supplementary credit)
* SEX: Gender (1=male, 2=female)
* EDUCATION: (1=graduate school, 2=university, 3=high school, 4=others, 5=unknown, 6=unknown)
* MARRIAGE: Marital status (1=married, 2=single, 3=others)
* AGE: Age in years
* PAY_0: Repayment status in September, 2005 (-1=pay duly, 1=payment delay for one month, 2=payment delay for two months, … 8=payment delay for eight months, 9=payment delay for nine months and above)
* PAY_2: Repayment status in August, 2005 (scale same as above)
* PAY_3: Repayment status in July, 2005 (scale same as above)
* PAY_4: Repayment status in June, 2005 (scale same as above)
* PAY_5: Repayment status in May, 2005 (scale same as above)
* PAY_6: Repayment status in April, 2005 (scale same as above)
* BILL_AMT1: Amount of bill statement in September, 2005 (NT dollar)
* BILL_AMT2: Amount of bill statement in August, 2005 (NT dollar)
* BILL_AMT3: Amount of bill statement in July, 2005 (NT dollar)
* BILL_AMT4: Amount of bill statement in June, 2005 (NT dollar)
* BILL_AMT5: Amount of bill statement in May, 2005 (NT dollar)
* BILL_AMT6: Amount of bill statement in April, 2005 (NT dollar)
* PAY_AMT1: Amount of previous payment in September, 2005 (NT dollar)
* PAY_AMT2: Amount of previous payment in August, 2005 (NT dollar)
* PAY_AMT3: Amount of previous payment in July, 2005 (NT dollar)
* PAY_AMT4: Amount of previous payment in June, 2005 (NT dollar)
* PAY_AMT5: Amount of previous payment in May, 2005 (NT dollar)
* PAY_AMT6: Amount of previous payment in April, 2005 (NT dollar)
* default.payment.next.month: Default payment (1=yes, 0=no)



In [None]:
# Summarise the frame: column names, non-null counts and dtypes

data.info()

As we can see, there are no object-type columns; every column is numerical.

In [None]:
# Count the null entries in every column

data.isna().sum()

There are no NULL values in the dataset. 

In [None]:
# Show the distinct values held by each column (helps spot categorical columns)

{name: values.unique() for name, values in data.items()}

In [None]:
# The number of distinct values per column is easier to read than the values themselves

{name: values.nunique(dropna=False) for name, values in data.items()}

It seems like we have 10 (4+6) categorical columns:- 

1. SEX
2. EDUCATION
3. MARRIAGE
4. default.payment.next.month 

Repayment status for all the months:- 
1. PAY_0
2. PAY_2
3. PAY_3
4. PAY_4
5. PAY_5
6. PAY_6


In [None]:
# Give the target column a shorter, dot-free name.
data = data.rename(columns={'default.payment.next.month': 'default_payment'})

#### UNIVARIATE ANALYSIS: Looking at the type of data present in different columns

##### Looking at the Categorical Data

In [None]:
# Categorical columns inspected in this section; one count plot is drawn per column.
cat_cols = ['SEX', 'EDUCATION', 'MARRIAGE', 'default_payment']

fig, ax = plt.subplots(nrows=1, ncols=4, figsize=(25, 5))

for axis, column in zip(ax.flatten(), cat_cols):
    sns.countplot(x=data[column], ax=axis)
    

* SEX: Gender (1=male, 2=female)
* EDUCATION: (1=graduate school, 2=university, 3=high school, 4=others, 5=unknown, 6=unknown)
* MARRIAGE: Marital status (1=married, 2=single, 3=others)
* Default payment (1=yes, 0=no)

##### We can see that there are clearly some missing values in EDUCATION and MARRIAGE that were labeled as 0

In [None]:
# Count the rows coded 0 in each column; these are the values treated as missing.
missing_education = (data['EDUCATION'] == 0).sum()
missing_marriage = (data['MARRIAGE'] == 0).sum()

# Express both counts as a percentage of all rows.
pct_education = (missing_education / len(data['EDUCATION'])) * 100
pct_marriage = (missing_marriage / len(data['MARRIAGE'])) * 100

print("Number of missing values for:\nEDUCATION= ", missing_education, "\nMARRIAGE= ", missing_marriage)

print("Percentage of missing values for:\nEDUCATION= ", round(pct_education, 2), "%", "\nMARRIAGE= ", round(pct_marriage, 2), "%")

We can replace the missing values with the most frequently occurring value.

In [None]:
# Most frequent code of each column; used to impute the rows coded 0.
edu = data['EDUCATION'].mode()[0]
mar = data['MARRIAGE'].mode()[0]

# Build the cleaned frame as a new object so the raw `data` frame stays intact.
# The result of `replace` is assigned back explicitly: calling
# `dataset[col].replace(..., inplace=True)` operates on a selected column,
# which can be a temporary copy (pandas Copy-on-Write), so the change may
# never reach `dataset`.
dataset = data.assign(
    EDUCATION=data['EDUCATION'].replace({0: edu}),
    MARRIAGE=data['MARRIAGE'].replace({0: mar}),
)

EDUCATION column: we can combine 5 & 6 since both are unknowns

In [None]:
# Fold code 6 into code 5 (both mean "unknown"). The result is assigned back
# because an in-place replace on a selected column can act on a temporary
# copy and leave `dataset` unchanged.
dataset['EDUCATION'] = dataset['EDUCATION'].replace({6: 5})

In [None]:
# Percentage of rows whose EDUCATION code is 5 (unknown).
unknown_share = ((dataset['EDUCATION'] == 5).sum() / len(dataset['EDUCATION'])) * 100

print("Percentage of unknown in EDUCATION =", round(unknown_share, 2), "%")

##### Plotting Again

In [None]:
# Redraw the four categorical count plots, this time from the cleaned frame.
fig, ax = plt.subplots(nrows=1, ncols=4, figsize=(25, 5))

for axis, column in zip(ax.flatten(), cat_cols):
    sns.countplot(x=dataset[column], ax=axis)

In [None]:
# Human-readable label for every numeric code, per categorical column.
label_maps = {
    'SEX': {1: 'Male', 2: 'Female'},
    'EDUCATION': {1: 'graduate school', 2: 'university', 3: 'high school', 4: 'others', 5: 'unknown', 6: 'unknown'},
    'MARRIAGE': {1: 'married', 2: 'single', 3: 'others'},
    'default_payment': {1: 'Yes', 0: 'No'},
}

# Display-only copy of `dataset` with codes swapped for labels. Each relabelled
# column is assigned back through `assign`; an in-place replace on a selected
# column can act on a temporary copy and leave the frame unlabelled.
data_copy = dataset.assign(
    **{column: dataset[column].replace(mapping) for column, mapping in label_maps.items()}
)


fig, ax = plt.subplots(1, 4, figsize=(25, 5))

for cols, subplots in zip(cat_cols, ax.flatten()):
    sns.countplot(x=data_copy[cols], ax=subplots)

    # The text labels are long, so turn them vertical to avoid overlap.
    for label in subplots.get_xticklabels():
        label.set_rotation(90)

plt.show()

##### INSIGHTS:-

1. There are more female customers than male customers among those who took the loan. 
2. Most people have a University Education. 
3. Most people are single, followed closely by married. 
4. Most people did not default on their loan, so we have an imbalanced dataset. 

In [None]:
# Visualise the class imbalance of the target as a pie chart

row_count = len(dataset['default_payment'])
share_yes = ((dataset['default_payment'] == 1).sum() / row_count) * 100
share_no = ((dataset['default_payment'] == 0).sum() / row_count) * 100

plt.pie(
    [share_yes, share_no],
    labels=['Yes', 'No'],
    colors=['darksalmon', 'lightgreen'],
    radius=2,
    autopct='%1.0f%%',
)
plt.title('DEFAULT PAYMENTS')
plt.show()

Clearly, we have an imbalance in the dataset with only 22% of the loans defaulted. 

##### Visualizing the other categorical columns (Repayment Status)

* PAY_0: Repayment status in September, 2005 (-1=pay duly, 1=payment delay for one month, 2=payment delay for two months, … 8=payment delay for eight months, 9=payment delay for nine months and above)
* PAY_2: Repayment status in August, 2005 (scale same as above)
* PAY_3: Repayment status in July, 2005 (scale same as above)
* PAY_4: Repayment status in June, 2005 (scale same as above)
* PAY_5: Repayment status in May, 2005 (scale same as above)
* PAY_6: Repayment status in April, 2005 (scale same as above)

In [None]:
# Replace the PAY_* column names with month-based names.
repayment_names = {
    'PAY_0': 'Repayment_Sept',
    'PAY_2': 'Repayment_Aug',
    'PAY_3': 'Repayment_July',
    'PAY_4': 'Repayment_June',
    'PAY_5': 'Repayment_May',
    'PAY_6': 'Repayment_April',
}
dataset = dataset.rename(columns=repayment_names)

In [None]:
# Repayment-status columns, listed from April through September.
repayment = ['Repayment_April', 'Repayment_May', 'Repayment_June', 'Repayment_July', 'Repayment_Aug', 'Repayment_Sept']
fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(20, 10))

for axis, column in zip(ax.flatten(), repayment):
    sns.countplot(x=dataset[column], ax=axis)
    # Turn the tick labels vertical.
    plt.setp(axis.get_xticklabels(), rotation=90)

plt.show()

-1=pay duly, 1=payment delay for one month, 2=payment delay for two months, … 8=payment delay for eight months, 9=payment delay for nine months and above

* We can see some inconsistencies in the data. 
* -2 and 0 make no sense

We can modify as follows:-
* 0: Paid Duly 
* 1-9: Payment delay for 1-9 months

Therefore we need to replace -1,-2 with 0. 


In [None]:
# Collapse the -2 and -1 codes into 0 for every repayment column.
# All columns are recoded in one call and the result is assigned back: an
# in-place replace on a selected column (`dataset[col].replace(..., inplace=True)`)
# can act on a temporary copy and leave `dataset` unchanged.
dataset[repayment] = dataset[repayment].replace({-2: 0, -1: 0})

In [None]:
# Redraw the repayment-status count plots after recoding.
fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(20, 10))

fig.suptitle('No. of Months the Payment was Delayed by')

for column, axis in zip(repayment, ax.flat):
    sns.countplot(x=dataset[column], ax=axis)

    for tick_label in axis.get_xticklabels():
        tick_label.set_rotation(90)

plt.show()

* We can see that most of the payments were made on time. 
* Other than that, the most frequent delay was 2 months.

#### Looking at the Non-Categorical Data

Non-Categorical columns with the number of unique values:-
1. 'ID': 30000,
2. 'LIMIT_BAL': 81,
3. 'BILL_AMT1': 22723,
4. 'BILL_AMT2': 22346,
5. 'BILL_AMT3': 22026,
6. 'BILL_AMT4': 21548,
7. 'BILL_AMT5': 21010,
8. 'BILL_AMT6': 20604,
9. 'PAY_AMT1': 7943,
10. 'PAY_AMT2': 7899,
11. 'PAY_AMT3': 7518,
12. 'PAY_AMT4': 6937,
13. 'PAY_AMT5': 6897,
14. 'PAY_AMT6': 6939,


In [None]:
# Rename the bill and payment-amount columns after the month they refer to

bill_names = {
    'BILL_AMT1': 'Bill_Sept',
    'BILL_AMT2': 'Bill_Aug',
    'BILL_AMT3': 'Bill_July',
    'BILL_AMT4': 'Bill_June',
    'BILL_AMT5': 'Bill_May',
    'BILL_AMT6': 'Bill_April',
}
payment_names = {
    'PAY_AMT1': 'Prev_Payment_Sept',
    'PAY_AMT2': 'Prev_Payment_Aug',
    'PAY_AMT3': 'Prev_Payment_July',
    'PAY_AMT4': 'Prev_Payment_June',
    'PAY_AMT5': 'Prev_Payment_May',
    'PAY_AMT6': 'Prev_Payment_April',
}

# The two key sets are disjoint, so a single rename covers both groups.
dataset = dataset.rename(columns={**bill_names, **payment_names})

In [None]:
# Display the column index to confirm the renames took effect.
dataset.columns

In [None]:
# Distribution of the credit limit, with its mean and mode marked

credit_limit = dataset['LIMIT_BAL']
limit_mean = credit_limit.mean()
limit_mode = credit_limit.mode()[0]

plt.figure(figsize=(20, 10))
sns.histplot(x=credit_limit, kde=True, bins=100)
plt.xlabel('Amount of Credit Given (NT Dollars)')
plt.ticklabel_format(axis='x', style='plain', useOffset=None)
plt.locator_params(axis="x", nbins=30)

# Dashed vertical markers: red for the mean, yellow for the mode.
plt.axvline(limit_mean, color='r', linestyle='dashed', linewidth=2)
plt.axvline(limit_mode, color='yellow', linestyle='dashed', linewidth=3)

# Annotate both markers slightly to the right of their lines, near the top.
_, max_ylim = plt.ylim()
plt.text(limit_mean * 1.1, max_ylim * 0.9, 'Mean: {:.2f}'.format(limit_mean))
plt.text(limit_mode * 1.1, max_ylim * 0.9, 'Mode: {:.2f}'.format(limit_mode))


plt.title('AMOUNT OF CREDIT GIVEN')
plt.show()

In [None]:
# Summary statistics of the credit limit column.
dataset['LIMIT_BAL'].describe()

**We can see that most loans were around 50,000 dollars.**

In [None]:
#BILL AMOUNTS FOR EACH MONTH

bills = ['Bill_April', 'Bill_May', 'Bill_June', 'Bill_July', 'Bill_Aug', 'Bill_Sept']

# One (legend label, line colour) pair per entry of `bills`, in the same order.
bill_styles = [
    ('April', 'r'),
    ('May', 'b'),
    ('June', 'g'),
    ('July', 'yellow'),
    ('August', 'pink'),
    ('September', 'k'),
]

plt.figure(figsize=(20, 5))

# For each month: draw the density curve, mark its mean in the curve's colour
# and its mode in light blue. Mean and mode are computed once and collected
# for the summary annotations below.
bill_means = []
bill_modes = []
for column, (month, colour) in zip(bills, bill_styles):
    column_mean = dataset[column].mean()
    column_mode = dataset[column].mode()[0]

    sns.kdeplot(x=dataset[column], color=colour, label=month)
    plt.axvline(column_mean, color=colour, linestyle='dashed', linewidth=1)
    plt.axvline(column_mode, color='lightblue', linestyle='dashed', linewidth=1)

    bill_means.append(column_mean)
    bill_modes.append(column_mode)

plt.ticklabel_format(axis='x', style='plain', useOffset=None)
plt.locator_params(axis="x", nbins=60)
plt.xticks(rotation=90)
plt.xlabel('Bill Amount')

# Average the six monthly means / modes and annotate them on the plot.
# The y-limits are read only now, after the curves exist, so the text is
# placed relative to the real plot height.
avg_bill_mean = round(sum(bill_means) / len(bill_means), 2)
avg_bill_mode = sum(bill_modes) / len(bill_modes)
_, max_ylim = plt.ylim()
plt.text(avg_bill_mean * 2.1, max_ylim * 0.9, 'Average Mean: {:.2f}'.format(avg_bill_mean))
plt.text(avg_bill_mode * 1.1, max_ylim * 0.7, 'Average Mode: {:.2f}'.format(avg_bill_mode))

plt.title('MONTHLY BILL AMOUNTS')
plt.legend()
plt.show()



As we can see the average monthly bill amount is around 50,000 NT dollars. 

In [None]:
# Percentage of rows whose bill amount is exactly zero, one entry per month in `bills`.
billszero = [((dataset[column] == 0).sum() / len(dataset)) * 100 for column in bills]

billszero

In [None]:
#Previous Payment Amounts for Each Month

prev = ['Prev_Payment_April', 'Prev_Payment_May', 'Prev_Payment_June', 'Prev_Payment_July', 'Prev_Payment_Aug', 'Prev_Payment_Sept']

# One (legend label, line colour) pair per entry of `prev`, in the same order.
payment_styles = [
    ('April', 'r'),
    ('May', 'b'),
    ('June', 'g'),
    ('July', 'yellow'),
    ('August', 'pink'),
    ('September', 'k'),
]

plt.figure(figsize=(20, 5))

# For each month: draw the density curve, mark its mean in the curve's colour
# and its mode in light blue. Mean and mode are computed once and collected
# for the summary annotations below.
payment_means = []
payment_modes = []
for column, (month, colour) in zip(prev, payment_styles):
    column_mean = dataset[column].mean()
    column_mode = dataset[column].mode()[0]

    sns.kdeplot(x=dataset[column], color=colour, label=month)
    plt.axvline(column_mean, color=colour, linestyle='dashed', linewidth=1)
    plt.axvline(column_mode, color='lightblue', linestyle='dashed', linewidth=1)

    payment_means.append(column_mean)
    payment_modes.append(column_mode)

plt.ticklabel_format(axis='x', style='plain', useOffset=None)
plt.locator_params(axis="x", nbins=60)
plt.xticks(rotation=90)
# This plot shows payment amounts, so the axis must not be labelled as bills.
plt.xlabel('Previous Payment Amount')

# Average the six monthly means / modes and annotate them on the plot.
# The y-limits are read only now, after the curves exist, so the text is
# placed relative to the real plot height.
avg_payment_mean = round(sum(payment_means) / len(payment_means), 2)
avg_payment_mode = sum(payment_modes) / len(payment_modes)
_, max_ylim = plt.ylim()
plt.text(avg_payment_mean * 2.1, max_ylim * 0.9, 'Average Mean: {:.2f}'.format(avg_payment_mean))
plt.text(avg_payment_mode * 1.1, max_ylim * 0.7, 'Average Mode: {:.2f}'.format(avg_payment_mode))

plt.title('PREVIOUS PAYMENTS MADE')
plt.legend()
plt.show()


**On average, about 5,000 NT dollars were paid previously.**

### EDA by different Demographics 

How does the probability of default payment vary by categories of different demographic variables?

In [None]:
# Preview only the first rows of the cleaned frame instead of dumping all of it.
dataset.head()

#### BY GENDER 

In [None]:
# Default vs. non-default counts for each gender.
# Both `x` and `hue` are read from `data_copy` so the bars and the legend come
# from one frame rather than mixing `data_copy` with `dataset`.
fig, ax = plt.subplots()
sns.countplot(x='SEX', hue='default_payment', data=data_copy, ax=ax)
ax.set(title='DEFAULT PAYMENT BY GENDER', xlabel='Gender', ylabel='Number of customers')
plt.show()

#### BY EDUCATION 

#### BY MARRIAGE

#### BY AGE