<a href="https://colab.research.google.com/github/sagunkayastha/CAI_Workshop/blob/main/Workshop_s3/DL_intro_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import datetime
import time
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
!wget https://raw.githubusercontent.com/sagunkayastha/CAI_Workshop/main/Workshop_s3/UCI_Credit_Card.csv

In [None]:
# Load the downloaded CSV into `df` and report how many rows and columns it has
df = pd.read_csv('UCI_Credit_Card.csv', sep=',')
df.dataframeName = 'UCI_Credit_Card.csv'  # remember which file the frame came from
nRow = len(df.index)
nCol = len(df.columns)
print(f'There are {nRow} rows and {nCol} columns')

In [None]:
# Preview the first 5 rows (head()'s default) as a quick sanity check of the load
df.head()


<div style="border-radius: 30px 0 30px 0px; border:#FFA500 solid; padding: 15px; background-color: #0d202b; font-size:100%; text-align:left ">
   This dataset contains information on default payments, demographic factors, credit data, history of payment, and bill statements of credit card clients in Taiwan from April 2005 to September 2005.
</div>


<div style="border-radius: 10px; border: 2px solid #d3d3d3; padding: 15px; background-color: #0d202b;">
    <h2 style="color: #007c3e;">Variables</h2>
    <p><strong>ID:</strong> ID of each client</p>
    <p><strong>LIMIT_BAL:</strong> Amount of given credit in NT dollars (includes individual and family/supplementary credit)</p>
    <p><strong>SEX:</strong> Gender (1=male, 2=female)</p>
    <p><strong>EDUCATION:</strong> (1=graduate school, 2=university, 3=high school, 4=others, 5=unknown, 6=unknown)</p>
    <p><strong>MARRIAGE:</strong> Marital status (1=married, 2=single, 3=others)</p>
    <p><strong>AGE:</strong> Age in years</p>
    <p><strong>PAY_0:</strong> Repayment status in September, 2005 (-1=pay duly, 1=payment delay for one month, 2=payment delay for two months, ... 8=payment delay for eight months, 9=payment delay for nine months and above)</p>
    <p><strong>PAY_2:</strong> Repayment status in August, 2005 (scale same as above)</p>
    <p><strong>PAY_3:</strong> Repayment status in July, 2005 (scale same as above)</p>
    <p><strong>PAY_4:</strong> Repayment status in June, 2005 (scale same as above)</p>
    <p><strong>PAY_5:</strong> Repayment status in May, 2005 (scale same as above)</p>
    <p><strong>PAY_6:</strong> Repayment status in April, 2005 (scale same as above)</p>
    <p><strong>BILL_AMT1:</strong> Amount of bill statement in September, 2005 (NT dollar)</p>
    <p><strong>BILL_AMT2:</strong> Amount of bill statement in August, 2005 (NT dollar)</p>
    <p><strong>BILL_AMT3:</strong> Amount of bill statement in July, 2005 (NT dollar)</p>
    <p><strong>BILL_AMT4:</strong> Amount of bill statement in June, 2005 (NT dollar)</p>
    <p><strong>BILL_AMT5:</strong> Amount of bill statement in May, 2005 (NT dollar)</p>
    <p><strong>BILL_AMT6:</strong> Amount of bill statement in April, 2005 (NT dollar)</p>
    <p><strong>PAY_AMT1:</strong> Amount of previous payment in September, 2005 (NT dollar)</p>
    <p><strong>PAY_AMT2:</strong> Amount of previous payment in August, 2005 (NT dollar)</p>
    <p><strong>PAY_AMT3:</strong> Amount of previous payment in July, 2005 (NT dollar)</p>
    <p><strong>PAY_AMT4:</strong> Amount of previous payment in June, 2005 (NT dollar)</p>
    <p><strong>PAY_AMT5:</strong> Amount of previous payment in May, 2005 (NT dollar)</p>
    <p><strong>PAY_AMT6:</strong> Amount of previous payment in April, 2005 (NT dollar)</p>
    <p><strong>default.payment.next.month:</strong> Default payment (1=yes, 0=no)</p>
</div>


In [None]:
# Give two awkward columns friendlier names in a single pass:
#   - the target 'default.payment.next.month' becomes 'def_pay'
#   - 'PAY_0' becomes 'PAY_1', so the repayment-status columns read PAY_1 ... PAY_6
df.rename(
    columns={
        'default.payment.next.month': 'def_pay',
        'PAY_0': 'PAY_1',
    },
    inplace=True,
)

In [None]:
# Descriptive statistics for each numeric column, kept in `summary_stats`
# and displayed as the cell output
summary_stats = df.describe()
summary_stats

In [None]:
# Column names, non-null counts and dtypes -- a quick check for missing values
df.info()

In [None]:

# Apply seaborn's "whitegrid" look to this and all later figures
sns.set(style="whitegrid")

# One 30-bin histogram per column, all drawn on a single 16x14 figure
df.hist(bins=30, figsize=(16, 14))
plt.suptitle('Feature Distributions', fontsize=20)
plt.show()


In [None]:
# Share of defaulters: def_pay is a 0/1 flag, so (sum of flags) / (number of rows)
# is the fraction of clients that defaulted.
perc_default = df.def_pay.sum() / len(df.def_pay)
print(f'The percentage of defaulters in the data is {perc_default*100} %')

# Pie chart of the class balance; explode pulls the first slice out by 0.1
df['def_pay'].value_counts().plot(kind='pie',explode=[0.1,0],autopct="%1.1f%%")

# Render the figure. The cell previously ended with plt.plot(), which draws nothing
# and only echoes an empty list as the cell output.
plt.show()

In [None]:
# Default rate for every repayment-status code, one panel per PAY_x column (3x2 grid)
pay_x_fts = ['PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
plt.figure(figsize=(15,12))

for i, col in enumerate(pay_x_fts):
    plt.subplot(3, 2, i + 1)
    # barplot draws the mean of y per x category; def_pay is 0/1, so each bar is the
    # fraction of clients with that status code who defaulted
    ax = sns.barplot(x = col, y = "def_pay", data = df, palette = 'rocket', errorbar = None)
    ax.set_ylabel("% of Default", fontsize= 12)
    ax.set_ylim(0, 1.2)  # head-room above 1.0 so the value labels fit inside the axes

    # Write each bar's height just above the bar
    for p in ax.patches:
        ax.annotate("%.2f" %(p.get_height()), (p.get_x()+0.09, p.get_height()+0.03),fontsize=13)

# Lay out the grid once, after all panels exist, instead of on every loop iteration
plt.tight_layout()
plt.show()

-> Most customers pay their credit card bills duly, and it is clear that their likelihood of default is much lower than that of the rest.


-> Credit card holders who consistently delay their payments for more than 3 months are significantly more likely to face defaults.

In [None]:
# Histogram of client ages (25 bins) with a kernel-density curve drawn on top
plt.figure(figsize=(10, 6))
age_ax = sns.histplot(data=df, x='AGE', bins=25, kde=True, color='skyblue')
age_ax.set_title('Age Distribution of Credit Card Holders')
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Frequency')
plt.show()

"""The histogram above shows the age distribution of credit card holders in the dataset, including a density estimate to visualize the overall shape and spread of ages. This can help us understand the demographic makeup of the dataset in terms of age.
"""

In [None]:
# Define bins and names for age groups
bins = [20,30,40,50,60,70,80]
names = ['21-30','31-40','41-50','51-60','61-70','71-80']

# Work on a copy so the helper column does not leak into `df`
df_temp = df.copy()
# Categorize age into the bins above; right=True makes each bin include its right
# edge, i.e. (20, 30], (30, 40], ...
df_temp['AGE_BIN'] = pd.cut(x=df_temp.AGE, bins=bins, labels=names, right=True)

# Count the number of occurrences for each age group
age_cnt = df_temp.AGE_BIN.value_counts()

# Per-group counts for non-defaulters (def_pay == 0) and defaulters (def_pay == 1).
# value_counts() orders its result by frequency, so both series are re-ordered to
# follow `names`. Without this, the bars and the labels zipped with `names` below
# only line up while the counts happen to decrease with age.
age_0 = df_temp.AGE_BIN[df_temp['def_pay'] == 0].value_counts().reindex(names, fill_value=0)
age_1 = df_temp.AGE_BIN[df_temp['def_pay'] == 1].value_counts().reindex(names, fill_value=0)

# Initialize the plot
plt.subplots(figsize=(8,5))

# Overlaid (not stacked) bars: non-defaulters first, defaulters drawn on top
plt.bar(names, age_0.values, label='0')
plt.bar(names, age_1.values, label='1')

# Annotate the top of every bar with its count
for counts in (age_0, age_1):
    for x, y in zip(names, counts):
        plt.text(x, y, y, fontsize=12)

# Customize tick labels
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# Add title to the plot
plt.title("Number of clients in each age group", fontsize=15)

# Add legend to the plot
plt.legend(loc='upper right', fontsize=15)

# Display the plot
plt.show()

In [None]:
# Number of defaulters vs. non-defaulters within each EDUCATION code (counts, not rates)
plt.figure(figsize=(10, 6))
edu_ax = sns.countplot(data=df, x='EDUCATION', hue='def_pay', palette="coolwarm")
edu_ax.set_title('Defaults by Education Level')
edu_ax.set_xlabel('Education Level')
edu_ax.set_ylabel('Count')
plt.legend(title='Default', labels=['No', 'Yes'])
plt.show()

"""
The plot above shows the counts of defaults versus non-defaults across different education levels. It provides a clear view of how education level might relate to the likelihood of defaulting on payments.
"""

In [None]:

# Total accounts vs. defaulting accounts for each education group.
data = df
is_default = data['def_pay'] == 1

# One boolean mask per group. "Other" is everything that is not grad school (1),
# university (2) or high school (3). The earlier `EDUCATION > 3` test silently left
# out any row whose code falls outside 1..6, so those accounts appeared in no bar.
education_masks = [
    data['EDUCATION'] == 1,              # Grad School
    data['EDUCATION'] == 2,              # University
    data['EDUCATION'] == 3,              # High School
    ~data['EDUCATION'].isin([1, 2, 3]),  # Other
]

total_education = [int(mask.sum()) for mask in education_masks]
default_education = [int((mask & is_default).sum()) for mask in education_masks]

# Keep the individual names available for any later cell that refers to them
grad, uni, high, other = total_education
grad_default, uni_default, high_default, other_default = default_education

# Overlaid semi-transparent bars: totals in magenta, defaults in blue
degree = [1,2,3,4]
plt.bar(degree, total_education, color='m', alpha=0.5, label='Total')
plt.bar(degree, default_education, color='b', alpha=0.5, label='Default')

plt.xticks(degree, ['Grad School','University','High School','Other'])
plt.ylabel('Number of Accounts')
plt.title('Fig.3 : Education ', fontweight="bold", size=12)
plt.legend()
plt.show()

In [None]:
# Frequency of each repayment-status code in PAY_1 (the column originally named PAY_0)
plt.figure(figsize=(10, 6))
status_ax = sns.countplot(data=df, x='PAY_1', palette="viridis")
status_ax.set_title('Payment Status in the Latest Month')
status_ax.set_xlabel('Payment Status')
status_ax.set_ylabel('Frequency')
plt.show()

"""
The plot above illustrates the distribution of payment statuses for the latest month, providing insights into the payment behavior of the credit card users, including those who paid duly, those who delayed payments, and the extent of such delays.
"""


In [None]:
# Heatmap of pairwise correlations between the feature columns.
# The positional slice 1:-1 skips the first and last columns, which is meant to
# leave out ID and the target (assumes that column order -- verify if the CSV changes).
plt.figure(figsize=(14, 10))
corr = df.iloc[:, 1:-1].corr()
heat_ax = sns.heatmap(corr, annot=False, cmap="YlGnBu")
heat_ax.set_title('Correlation Heatmap')
plt.show()


In [None]:
# Distribution of LIMIT_BAL within each EDUCATION code, one box per code
plt.figure(figsize=(10, 6))
box_ax = sns.boxplot(data=df, x='EDUCATION', y='LIMIT_BAL', palette="Set2")
box_ax.set_title('Credit Limit by Education Level')
box_ax.set_xlabel('Education Level')
box_ax.set_ylabel('Credit Limit')
plt.show()

"""
The boxplot above visualizes the distribution of credit limits across different education levels. This plot can reveal trends and disparities in credit limits, potentially reflecting the financial trustworthiness perceived by credit issuers based on education.
"""

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

# Target is the default flag; inputs are every other column except the row identifier
y = df['def_pay']
X = df.drop(columns=['ID', 'def_pay'])

# Hold out 20% of the rows for testing; random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features. The scaler is fitted on the training rows only and then
# applied to both splits, so no test-set statistics leak into training.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Show the shapes of the four arrays as the cell output
X_train_scaled.shape, X_test_scaled.shape, y_train.shape, y_test.shape


In [None]:
import tensorflow as tf
from tensorflow.keras import layers

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout masks and
# batch shuffling repeat across runs (as far as the backend allows).
tf.keras.utils.set_random_seed(42)

# Define the model: two ReLU hidden layers, each followed by 30% dropout, and a
# single sigmoid unit that outputs the probability of default.
model = tf.keras.Sequential([
    layers.Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    layers.Dropout(0.3),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(1, activation='sigmoid')
])

# Binary cross-entropy matches the 0/1 target; accuracy is reported alongside the loss
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train once for 20 epochs, holding out 20% of the training rows for validation.
# The cell previously repeated this call, which trained the same model for 40 epochs
# in total and left only the second run's curves in `history`.
history = model.fit(X_train_scaled, y_train,
                    epochs=20,
                    batch_size=128,
                    validation_split=0.2,
                    verbose=1)



In [None]:
# Score the trained model on the held-out test split; given the compile() settings
# above, the values reported are the loss and the accuracy.
model.evaluate(X_test_scaled, y_test, verbose=1)


#### -----

In [None]:
# Keep a reduced set of columns (demographics, the two most recent repayment statuses
# and bill amounts, plus the target) and show their pairwise correlations
cols = [
    'SEX', 'EDUCATION', 'MARRIAGE', 'AGE',
    'PAY_1', 'PAY_2',
    'BILL_AMT1', 'BILL_AMT2',
    'def_pay',
]
df_mod = df.loc[:, cols]
df_mod.corr()

In [None]:
# Correlation heatmap of the reduced feature set.
# df_mod has no ID column, so only the target is excluded. The earlier positional
# slice iloc[:, 1:-1] was carried over from the full-frame heatmap and wrongly
# dropped SEX, the first feature.
plt.figure(figsize=(14, 10))
corr = df_mod.drop(columns=['def_pay']).corr()
sns.heatmap(corr, cmap="YlGnBu", annot=False)
plt.title('Correlation Heatmap')
plt.show()


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

# Target is the default flag; inputs are the remaining columns of the reduced frame
y = df_mod['def_pay']
X = df_mod.drop(columns=['def_pay'])

# Hold out 20% of the rows for testing; random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features. The scaler is fitted on the training rows only and then
# applied to both splits, so no test-set statistics leak into training.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Show the shapes of the four arrays as the cell output
X_train_scaled.shape, X_test_scaled.shape, y_train.shape, y_test.shape


In [None]:
import tensorflow as tf
from tensorflow.keras import layers

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout masks and
# batch shuffling repeat across runs (as far as the backend allows).
tf.keras.utils.set_random_seed(42)

# Define the model: same architecture as for the full feature set -- two ReLU hidden
# layers with 30% dropout each and a sigmoid output -- sized to the reduced input.
model = tf.keras.Sequential([
    layers.Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    layers.Dropout(0.3),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(1, activation='sigmoid')
])

# Binary cross-entropy matches the 0/1 target; accuracy is reported alongside the loss
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train once for 20 epochs, holding out 20% of the training rows for validation.
# The cell previously repeated this call, which trained the same model for 40 epochs
# in total and left only the second run's curves in `history`.
history = model.fit(X_train_scaled, y_train,
                    epochs=20,
                    batch_size=128,
                    validation_split=0.2,
                    verbose=1)

