In [51]:
# Required imports
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans

## Load the Data Into a Pandas DataFrame

In [52]:
# Read in the CSV file as a Pandas Dataframe
ccinfo_default_df = pd.read_csv(
    Path("../Resources/ccinfo_transformed_no_dummies_only_cust_segm.csv")
)

ccinfo_default_df.head()

Unnamed: 0,limit_bal,age,bill_amt,pay_amt,customer_segments
0,-1.117341,24,-0.660703,-0.542779,2
1,-0.349942,26,-0.63637,-0.463994,2
2,-0.580162,34,-0.416808,-0.354013,0
3,-0.887121,37,-0.080152,-0.402077,0
4,-0.887121,57,-0.396855,0.523771,1


In [53]:
# Plot the clusters using the "limit_bal" and "age" columns
ccinfo_default_df.hvplot.scatter(
    x="limit_bal",
    y="age",
    by="customer_segments"
)

In [54]:
# Plot the clusters using the "bill_amt" and "pay_amt" columns
ccinfo_default_df.hvplot.scatter(
    x="bill_amt",
    y="pay_amt",
    by="customer_segments"
)

## Use PCA to reduce the number of factors 

In [55]:
# Remove the Customer Segments variable, add inplace=True
ccinfo_default_df.drop(columns=['customer_segments'],inplace=True)

In [56]:
# Import Stadardscaler from sklearn.preprocessing
from sklearn.preprocessing import StandardScaler

In [57]:
# Use the StandardScaler module and fit_transform function to 
# scale all columns with numerical values
ccinfo_default_list = StandardScaler().fit_transform(ccinfo_default_df)

In [58]:
# Create a DataFrame called with the scaled data
# The column names should match those referenced in the StandardScaler step
ccinfo_default_df_scaled = pd.DataFrame(ccinfo_default_list,columns=["limit_bal", "age", "bill_amt", "pay_amt"])
ccinfo_default_df_scaled.head()

Unnamed: 0,limit_bal,age,bill_amt,pay_amt
0,-1.117341,-1.226792,-0.660703,-0.542779
1,-0.349942,-1.010942,-0.63637,-0.463994
2,-0.580162,-0.147541,-0.416808,-0.354013
3,-0.887121,0.176234,-0.080152,-0.402077
4,-0.887121,2.334734,-0.396855,0.523771


In [59]:
# Import the PCA module
from sklearn.decomposition import PCA

In [60]:
# Instantiate the PCA instance and declare the number of PCA variables
pca = PCA(n_components=2)

In [61]:
# Fit the PCA model on the transformed credit card DataFrame
ccinfo_pca = pca.fit_transform(ccinfo_default_df)
# Review the first 5 rows of list data
ccinfo_pca[:5]

array([[-11.39374532,  -1.11578252],
       [ -9.37955678,  -0.65786695],
       [ -1.38301701,  -0.75048925],
       [  1.61282554,  -0.82015925],
       [ 21.61349307,  -0.82424692]])

## PCA explained variance ratio

In [62]:
# Calculate the PCA explained variance ratio
pca.explained_variance_ratio_

array([0.96665303, 0.01914715])

## Creating the PCA DataFrame

In [64]:
# Create the PCA DataFrame
ccinfo_pca_df = pd.DataFrame(ccinfo_pca,columns=['PCA1','PCA2'])

# Review the PCA DataFrame
ccinfo_pca_df.head()

Unnamed: 0,PCA1,PCA2
0,-11.393745,-1.115783
1,-9.379557,-0.657867
2,-1.383017,-0.750489
3,1.612826,-0.820159
4,21.613493,-0.824247


## Incorporating the PCA DataFrame into the elbow method

In [65]:
# Create a a list to store inertia values and the values of k
inertia = []
k = list(range(1,11))

# Append the value of the computed inertia from the `inertia_` attribute of teh KMeans model instance
for i in k:
    k_model = KMeans(n_clusters=i, random_state=1)
    k_model.fit(ccinfo_pca_df)
    inertia.append(k_model.inertia_)

# Define a DataFrame to hold the values for k and the corresponding inertia
elbow_df = pd.DataFrame({'k':k,'inertia':inertia})

# Review the DataFrame
elbow_df.head()

  super()._check_params_vs_input(X, default_n_init=10)
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Unnamed: 0,k,inertia
0,1,437869.195089
1,2,140019.107029
2,3,72111.650387
3,4,47177.912155
4,5,34740.924849


In [66]:
# Plot the Elbow Curve
elbow_df.hvplot.line(
    x='k',
    y='inertia',
    xticks=k,
    title='KMeans elbow graph'
)

## Segmention of the PCA data with Kmeans 

In [71]:
# Define the model with 3 clusters
model = KMeans(n_clusters=3)
# Fit the model
model.fit(ccinfo_pca_df)
# Make predictions
pca_predictions = model.predict(ccinfo_pca_df)
# Create a copy of the PCA DataFrame
ccinfo_pca_predictions_df = ccinfo_pca_df.copy()
# Add a class column with the labels
ccinfo_pca_predictions_df['segment'] = pca_predictions

  super()._check_params_vs_input(X, default_n_init=10)


In [70]:
# Plot the clusters
ccinfo_pca_predictions_df.hvplot.scatter(
    x='PCA1',
    y='PCA2',
    by='segment',
    title='KMeans PCA scatter'
)