<a href="https://colab.research.google.com/github/sagunkayastha/CAI_Workshop/blob/main/Workshop_2/Intro_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import datetime
import time
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
!wget https://raw.githubusercontent.com/sagunkayastha/CAI_Workshop/main/Workshop_2/UCI_Credit_Card.csv

In [None]:
# Load the downloaded CSV into the working DataFrame `df`
df = pd.read_csv('UCI_Credit_Card.csv', sep=',')

# Remember which file the frame came from (plain Python attribute on the object)
df.dataframeName = 'UCI_Credit_Card.csv'

# Report the size of the dataset
nRow, nCol = df.shape[0], df.shape[1]
print(f'There are {nRow} rows and {nCol} columns')

In [None]:
# Preview the first five rows to check that the file was parsed as expected
df.head(n=5)

- ID: An identification number assigned to each individual.
- LIMIT_BAL: The amount of available credit (in NT dollar). Includes both individual and family/supplementary credit.
- SEX: Gender (1 = male, 2 = female).
- EDUCATION: Level of education (1 = graduate school, 2 = university, 3 = high school, 4 = others).
- MARRIAGE: Marital status (1 = married, 2 = single, 3 = others).
- AGE: Age in years.
- PAY_0, PAY_2, ..., PAY_6: Repayment status in past months. The scale is from -1 to 9, where -1 = pay duly, 1 = payment delay for one month, 2 = payment delay for two months, ... , 9 = payment delay for nine months and above.
- BILL_AMT1, BILL_AMT2, ..., BILL_AMT6: Amount of bill statement (in NT dollar).
- PAY_AMT1, PAY_AMT2, ..., PAY_AMT6: Amount of previous payment (in NT dollar).
- default.payment.next.month: Default payment for the next month (1 = yes, 0 = no).

In [None]:
# Give two awkward columns friendlier names in a single pass:
#   - the target 'default.payment.next.month' becomes 'def_pay'
#   - 'PAY_0' becomes 'PAY_1', so the repayment columns run PAY_1..PAY_6
# Renaming in place keeps working on the same DataFrame object.
df.rename(
    columns={
        'default.payment.next.month': 'def_pay',
        'PAY_0': 'PAY_1',
    },
    inplace=True,
)

In [None]:
# Descriptive statistics for each column (describe() reports count, mean, std,
# min, quartiles and max for numeric columns)
summary_stats = df.describe()
# Leave the table as the last expression so the notebook renders it
summary_stats

In [None]:

# Switch seaborn to its white-grid theme for the plots that follow
sns.set(style="whitegrid")

# One 30-bin histogram per column, all on a single 16x14 inch figure
df.hist(bins=30, figsize=(16, 14))
plt.suptitle('Feature Distributions', fontsize=20)
plt.show()


In [None]:
# Pairwise correlation between every pair of columns in the frame
correlation_matrix = df.corr()

# Annotated heatmap of the matrix; each cell shows the coefficient to 2 decimals
plt.figure(figsize=(14, 10))
sns.heatmap(
    correlation_matrix,
    cmap='coolwarm',
    annot=True,
    fmt=".2f",
)
plt.title('Correlation Heatmap', fontsize=20)
plt.show()

In [None]:
# Narrow the analysis to a handful of columns plus the target
cols = [
    'SEX', 'EDUCATION', 'MARRIAGE', 'AGE',
    'PAY_1', 'PAY_2',
    'BILL_AMT1', 'BILL_AMT2',
    'def_pay',
]

# Correlation table restricted to those columns
df.loc[:, cols].corr()

In [None]:
# Heatmap of the correlations among the selected columns, coloured with a
# 20-step cubehelix palette
subset_corr = df[cols].corr()
ax = sns.heatmap(
    subset_corr,
    annot=True,
    cmap=sns.cubehelix_palette(20, light=0.95, dark=0.15),
)
# Put the x-axis tick labels above the heatmap instead of below it
ax.xaxis.tick_top()


In [None]:
# Percentage of clients in each class of the target `def_pay`.
# value_counts() orders by frequency, so sort by class label to fix the bar order
# (0 first, then 1) regardless of which class is more common.
def_cnt = df.def_pay.value_counts(normalize=True).sort_index() * 100

def_cnt.plot.bar(figsize=(6, 6))
plt.xticks(fontsize=12, rotation=0)
plt.yticks(fontsize=12)
plt.ylabel("Share of clients (%)", fontsize=12)
plt.title("Probability Of Defaulting Payment Next Month", fontsize=15)

# Label every bar from its own position and value (no hard-coded [0, 1]), and
# round the percentage so the label is not a long unformatted float.
for bar_pos, pct in enumerate(def_cnt):
    plt.text(bar_pos, pct, f"{pct:.2f}%", ha="center", va="bottom", fontsize=12)
plt.show()

In [None]:
# Plot the distribution of LIMIT_BAL and AGE side by side.
# Create both axes up front: calling plt.subplots() and then plt.subplot(121)
# leaves an extra full-figure axes behind the two panels.
fig, (ax_limit, ax_age) = plt.subplots(1, 2, figsize=(20, 5))

# sns.distplot is deprecated; histplot with a KDE overlay and density scaling is
# the replacement recommended by seaborn for the same histogram + curve view.
sns.histplot(df.LIMIT_BAL, kde=True, stat="density", kde_kws=dict(cut=3), ax=ax_limit)
sns.histplot(df.AGE, kde=True, stat="density", kde_kws=dict(cut=3), ax=ax_age)

plt.show()

In [None]:
# Define bin edges and display names for the age groups
bins = [20, 30, 40, 50, 60, 70, 80]
names = ['21-30', '31-40', '41-50', '51-60', '61-70', '71-80']

# Categorize AGE into the bins above.
# right=True makes every bin right-inclusive: (20, 30], (30, 40], ...
df['AGE_BIN'] = pd.cut(x=df.AGE, bins=bins, labels=names, right=True)

# Count the number of clients in each age group
age_cnt = df.AGE_BIN.value_counts()

# Counts per age group for each target class.
# value_counts() returns groups ordered by frequency, not by age, so sort by the
# (ordered) bin index to keep bars and annotations in age-group order.
age_0 = df.AGE_BIN[df['def_pay'] == 0].value_counts().sort_index()
age_1 = df.AGE_BIN[df['def_pay'] == 1].value_counts().sort_index()

# Initialize the plot
plt.subplots(figsize=(8, 5))

# Draw the two series on the same axes (class 1 is drawn over class 0) and
# annotate every bar with its own count. Labels are taken from each series'
# index, so a count can never be attached to the wrong age group.
for class_label, class_counts in (('0', age_0), ('1', age_1)):
    group_labels = class_counts.index.astype(str)
    plt.bar(group_labels, class_counts.values, label=class_label)
    for group, count in zip(group_labels, class_counts.values):
        plt.text(group, count, count, fontsize=12)

# Customize tick labels
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# Add title and legend
plt.title("Number of clients in each age group", fontsize=15)
plt.legend(loc='upper right', fontsize=15)

# Display the plot
plt.show()

In [None]:
# Remove the helper column used for the age-group plot so it is not picked up as
# a model feature. errors='ignore' makes the cell safe to re-run: without it a
# second execution raises KeyError because the column is already gone.
df.drop(columns='AGE_BIN', inplace=True, errors='ignore')

# Modelling

In [None]:
# Imports and notebook settings for the modelling section
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler

import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,mean_squared_error
import warnings
# Silences every Python warning from here on.
# NOTE(review): this also hides any warnings raised while fitting the models
# below (e.g. convergence or deprecation warnings) — confirm that is intended.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [None]:
# Modelling helpers (repeated here so the cell lists everything it uses)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Features: every column except the target label.
# NOTE(review): this keeps the ID column, which is described as an identifier
# rather than a client attribute — consider excluding it from the features.
df_X = df.drop(['def_pay'], axis=1)

# Target label
df_y = df.def_pay

# Hold out 20% of the rows for testing; fixed seed for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.2, random_state=10)

# Fit a logistic regression on the training split and predict the test split
model1 = LogisticRegression()
model1.fit(X_train, y_train)
y_pred = model1.predict(X_test)

# scikit-learn metrics expect (y_true, y_pred). Passing them the other way round
# swaps precision with recall and transposes the confusion matrix, so the true
# labels go first here. Rows of the matrix are true classes, columns predictions.
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Accuracy on the held-out test data
print('\nAccuracy Score for model1: ', accuracy_score(y_test, y_pred))


In [None]:
# Same experiment with the older repayment-status columns (PAY_2..PAY_6)
# removed, leaving PAY_1 as the only repayment-status feature
df_X = df.drop(['def_pay','PAY_2','PAY_3','PAY_4','PAY_5','PAY_6'], axis=1)
df_y = df.def_pay

# Same 80/20 split and seed as the previous model so the results are comparable
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.2, random_state=10)

model3 = LogisticRegression()
model3.fit(X_train, y_train)

y_pred = model3.predict(X_test)

# scikit-learn metrics expect (y_true, y_pred); the reversed order swaps
# precision with recall and transposes the confusion matrix.
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print('\nAccuracy Score for model3: ', accuracy_score(y_test, y_pred))

#

As we can see, the accuracy does not change even after dropping the `PAY_2`–`PAY_6` columns.

In [None]:
# Logistic regression on a small hand-picked feature set
df_X = df[['SEX','MARRIAGE','AGE','BILL_AMT1','EDUCATION','PAY_1']]
df_y = df.def_pay

# 90/10 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.1, random_state=20)

model4 = LogisticRegression()
model4.fit(X_train, y_train)

# Predict on both splits so train and test accuracy can be compared
y_pred = model4.predict(X_test)
y_train_pred = model4.predict(X_train)

# scikit-learn metrics expect (y_true, y_pred); the reversed order swaps
# precision with recall and transposes the confusion matrix.
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print('\nTest Accuracy Score for model4: ', accuracy_score(y_test, y_pred))
print('\nTrain Accuracy Score for model4: ', accuracy_score(y_train, y_train_pred))

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Same hand-picked feature set as the last logistic regression
df_X = df[['SEX','MARRIAGE','AGE','BILL_AMT1','EDUCATION','PAY_1']]
df_y = df.def_pay

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.3, random_state=20)

# Decision tree with no depth or leaf-size limit; the seed fixes tie-breaking
clf = DecisionTreeClassifier(random_state=42)

# Fit on the training data
clf.fit(X_train, y_train)

# Predict on both splits so train and test accuracy can be compared
y_pred = clf.predict(X_test)
y_train_pred = clf.predict(X_train)

# Evaluate the model. scikit-learn metrics expect (y_true, y_pred); the reversed
# order swaps precision with recall and transposes the confusion matrix.
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print('\nTest Accuracy Score for Decision Tree: ', accuracy_score(y_test, y_pred))
print('\nTrain Accuracy Score for Decision Tree: ', accuracy_score(y_train, y_train_pred))

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Select specific columns for features and target label
df_X = df[['SEX','MARRIAGE','AGE','BILL_AMT1','EDUCATION','PAY_1']]
df_y = df.def_pay

# Split the data into training and test sets (70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.3, random_state=20)

# Constrained tree: at most 5 levels deep and at least 50 samples in every leaf
clf = DecisionTreeClassifier(random_state=42, max_depth=5, min_samples_leaf=50)

# Fit the model to the training data
clf.fit(X_train, y_train)

# Predict on both splits so train and test accuracy can be compared
y_pred = clf.predict(X_test)
y_train_pred = clf.predict(X_train)

# Evaluate the model. scikit-learn metrics expect (y_true, y_pred); the reversed
# order swaps precision with recall and transposes the confusion matrix.
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Print the test accuracy score
print('\nTest Accuracy Score for Decision Tree: ', accuracy_score(y_test, y_pred))

# Print the training accuracy score
print('\nTrain Accuracy Score for Decision Tree: ', accuracy_score(y_train, y_train_pred))

In [None]:
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# Names shown on the split nodes; same columns, in the same order, as the
# feature frame the tree was fitted on in the cell that builds `clf`
features = ['SEX', 'MARRIAGE', 'AGE', 'BILL_AMT1', 'EDUCATION', 'PAY_1']

# Draw the fitted tree (at most 5 levels) on a large, high-resolution canvas,
# with nodes colour-filled and classes labelled by name
plt.figure(figsize=(20, 10), dpi=500)
plot_tree(
    clf,
    max_depth=5,
    feature_names=features,
    class_names=['Not Default', 'Default'],
    filled=True,
)
plt.show()