In [61]:
# Import the modules
import pandas as pd
from pathlib import Path
import hvplot.pandas
import warnings

In [62]:
# Silence every warning category for the rest of the session
warnings.simplefilter('ignore')

## Import the Pandas DataFrame

In [63]:
# Load the credit card data from the Resources folder into a DataFrame
csv_path = Path("../Resources/cc_info_default.csv")
ccinfo_df = pd.read_csv(csv_path)

In [64]:
# Review the last five rows of the DataFrame
ccinfo_df.tail()

Unnamed: 0,limit_bal,education,marriage,age,bill_amt,pay_amt,default
4994,20000,secondary,yes,36,110994,7293,0
4995,180000,other,yes,34,35240,22066,0
4996,200000,secondary,yes,45,691806,21443,1
4997,310000,post-grad,yes,44,1548067,72000,0
4998,160000,primary,no,40,4440,3725,0


In [65]:
# Summarize the columns: names, non-null counts and dtypes
ccinfo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4999 entries, 0 to 4998
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   limit_bal  4999 non-null   int64 
 1   education  4999 non-null   object
 2   marriage   4999 non-null   object
 3   age        4999 non-null   int64 
 4   bill_amt   4999 non-null   int64 
 5   pay_amt    4999 non-null   int64 
 6   default    4999 non-null   int64 
dtypes: int64(5), object(2)
memory usage: 273.5+ KB


## Transform "education" column with get_dummies

In [66]:
# Count how many rows fall into each category of the education column
ccinfo_df['education'].value_counts()

secondary    2267
primary      1862
post-grad     822
other          48
Name: education, dtype: int64

In [67]:
# One-hot encode the education column: one indicator column per category.
# dtype=int keeps the indicators as 0/1 integers on every pandas version
# (pandas 2.x would otherwise return booleans).
edu_dummies = pd.get_dummies(ccinfo_df['education'], dtype=int)

# Display the transformed data
edu_dummies.tail()

Unnamed: 0,other,post-grad,primary,secondary
4994,0,0,0,1
4995,1,0,0,0
4996,0,0,0,1
4997,0,1,0,0
4998,0,0,1,0


In [68]:
# Review the DataFrame before the dummy columns are joined to it
ccinfo_df.head()

Unnamed: 0,limit_bal,education,marriage,age,bill_amt,pay_amt,default
0,20000,secondary,yes,24,7704,689,1
1,120000,secondary,no,26,17077,5000,1
2,90000,secondary,no,34,101653,11018,0
3,50000,secondary,yes,37,231334,8388,0
4,50000,secondary,yes,57,109339,59049,0


In [69]:
# Swap the original education column for its dummy-encoded columns:
# drop the text column, then place the edu_dummies columns alongside the rest
ccinfo_df = pd.concat(
    [ccinfo_df.drop(columns=['education']), edu_dummies],
    axis=1,
)

# Display the DataFrame
ccinfo_df.head()

Unnamed: 0,limit_bal,marriage,age,bill_amt,pay_amt,default,other,post-grad,primary,secondary
0,20000,yes,24,7704,689,1,0,0,0,1
1,120000,no,26,17077,5000,1,0,0,0,1
2,90000,no,34,101653,11018,0,0,0,0,1
3,50000,yes,37,231334,8388,0,0,0,0,1
4,50000,yes,57,109339,59049,0,0,0,0,1


## Transform "marriage" column with encoding function

In [70]:
# Encoding the marriage column using a custom function
def encode_marriage(value):
    """Return 1 when ``value`` equals 'yes'; return 0 for anything else."""
    return 1 if value == 'yes' else 0

# Pass every entry of the marriage column through encode_marriage
ccinfo_df['marriage'] = ccinfo_df['marriage'].map(encode_marriage)

# Review the DataFrame
ccinfo_df.head()

Unnamed: 0,limit_bal,marriage,age,bill_amt,pay_amt,default,other,post-grad,primary,secondary
0,20000,1,24,7704,689,1,0,0,0,1
1,120000,0,26,17077,5000,1,0,0,0,1
2,90000,0,34,101653,11018,0,0,0,0,1
3,50000,1,37,231334,8388,0,0,0,0,1
4,50000,1,57,109339,59049,0,0,0,0,1


## Apply the Standard Scaler to "limit_bal", "bill_amt", "pay_amt"

In [71]:
# Import the module
from sklearn.preprocessing import StandardScaler

In [72]:
# Standardize the three monetary columns with StandardScaler
scaler = StandardScaler()
scaled_data = scaler.fit_transform(
    ccinfo_df[['limit_bal', 'bill_amt', 'pay_amt']]
)

# Review the scaled data
scaled_data

array([[-1.1173411 , -0.66070266, -0.5427793 ],
       [-0.3499424 , -0.63637003, -0.46399421],
       [-0.58016201, -0.41680786, -0.35401308],
       ...,
       [ 0.26397655,  1.1152494 , -0.16349243],
       [ 1.10811512,  3.33813208,  0.76045505],
       [-0.04298292, -0.66917611, -0.4872953 ]])

In [73]:
# Create a DataFrame of the scaled data.
# Reuse ccinfo_df's index so the column assignments below align row-for-row
# even when ccinfo_df does not carry the default 0..n-1 index; with a fresh
# RangeIndex, any mismatch would silently produce NaN values.
scaled_df = pd.DataFrame(
    scaled_data,
    columns=['limit_bal', 'bill_amt', 'pay_amt'],
    index=ccinfo_df.index,
)

# Replace each original column with its scaled counterpart
for column in scaled_df.columns:
    ccinfo_df[column] = scaled_df[column]

# Review the DataFrame
ccinfo_df.head()

Unnamed: 0,limit_bal,marriage,age,bill_amt,pay_amt,default,other,post-grad,primary,secondary
0,-1.117341,1,24,-0.660703,-0.542779,1,0,0,0,1
1,-0.349942,0,26,-0.63637,-0.463994,1,0,0,0,1
2,-0.580162,0,34,-0.416808,-0.354013,0,0,0,0,1
3,-0.887121,1,37,-0.080152,-0.402077,0,0,0,0,1
4,-0.887121,1,57,-0.396855,0.523771,0,0,0,0,1


## Elbow Method to find k

In [74]:
# Import the KMeans module from SKLearn
from sklearn.cluster import KMeans

In [75]:
# Candidate cluster counts (1 through 10) and an empty list for their inertia values
k = [*range(1, 11)]
inertia = list()

In [76]:
# Fit one K-means model per candidate k on ccinfo_df and record the value of
# its `inertia_` attribute.
# The list is reset here so that re-running this cell cannot append a second
# batch of values and leave `inertia` longer than `k`.
# n_init is pinned because scikit-learn 1.4 changed its default from 10 to 'auto'.
# NOTE(review): ccinfo_df still holds the unscaled `age` column and the
# `default` label, so both take part in the clustering -- confirm this is intended.
inertia = []
for i in k:
    model = KMeans(n_clusters=i, random_state=0, n_init=10)
    model.fit(ccinfo_df)
    inertia.append(model.inertia_)

In [77]:
# Pair each k with its inertia and load the pairs into a DataFrame
elbow_data = dict(k=k, inertia=inertia)
elbow_df = pd.DataFrame(data=elbow_data)

# Show the first rows
elbow_df.head()

Unnamed: 0,k,inertia
0,1,449413.376075
1,2,151307.029625
2,3,83260.050502
3,4,58169.904941
4,5,45604.61411


In [80]:
# Draw inertia against k as a line chart, with one x tick per candidate k
elbow_plot_options = dict(x='k', y='inertia', xticks=k, title='Elbow graph')
elbow_df.hvplot.line(**elbow_plot_options)

## K-means algorithm to cluster the data

In [84]:
# Define the model with 3 clusters.
# n_init is pinned because scikit-learn 1.4 changed its default from 10 to 'auto'.
# NOTE(review): ccinfo_df still holds the unscaled `age` column and the
# `default` label, so both take part in the clustering -- confirm this is intended.
model = KMeans(n_clusters=3, random_state=1, n_init=10)
# Fit the model
model.fit(ccinfo_df)
# Assign every row to a cluster. Despite its name, `defaults` holds the
# predicted cluster labels, not the values of the `default` column.
defaults = model.predict(ccinfo_df)
# Create a copy of the preprocessed data
ccinfo_predictions_df = ccinfo_df.copy()
# Add a `segment` column with the cluster labels
ccinfo_predictions_df['segment'] = defaults

In [86]:
# Scatter age (x) against limit_bal (y), grouping the points by cluster segment
cluster_plot_options = dict(
    x='age',
    y='limit_bal',
    by='segment',
    title='Credit profile clusters',
)
ccinfo_predictions_df.hvplot.scatter(**cluster_plot_options)