In [156]:
#import libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn import set_config
set_config(display='diagram')
# For plotting
import plotly.io as plt_io
import plotly.graph_objects as go
%matplotlib inline

In [None]:
# Load the dataset from its published Google Sheets CSV export and preview it
csv_url = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vQGG95zRf7Hmos7Gx7VqpJmksOos3bgxr73KYfmc8soEnvk_L4rVcNPcUHDpmNMDnRyof6UPlm-DTEp/pub?gid=1011669702&single=true&output=csv'
df = pd.read_csv(csv_url)
df.head()

# Using PCA

In [106]:
# Encode Gender numerically: Male -> 0, Female -> 1, anything else (including missing) -> 2.
# The result is assigned back to the column rather than calling replace(..., inplace=True)
# on df['Gender']: that chained in-place call operates on a temporary Series and does not
# update df under pandas Copy-on-Write.
# The dtype guard keeps the cell idempotent when it is re-run on already-encoded data.
if not pd.api.types.is_numeric_dtype(df['Gender']):
    df['Gender'] = df['Gender'].map({'Male': 0, 'Female': 1}).fillna(2).astype(int)

In [107]:
# Standardise every column of the dataframe to zero mean and unit variance
# NOTE(review): all columns of df are scaled, including any identifier column — confirm intended
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)

# Reduce the standardised data to its first three principal components
pca = PCA(n_components=3)
components = pca.fit_transform(df_scaled)
components.shape

In [120]:
# Wrap the PCA output in a DataFrame with one labelled column per principal component
pc_labels = ['principal component 1', 'principal component 2', 'principal component 3']
principal = pd.DataFrame(data=components, columns=pc_labels)

In [152]:
# Function for 2D plot
def plot_2d(component1, component2, color=None):
    """Show an interactive 2D scatter plot of two component arrays.

    Parameters
    ----------
    component1, component2 : array-like
        x and y coordinates of the points.
    color : array-like, optional
        Per-point values used to colour the markers. When omitted, the
        notebook-level ``df['cluster']`` column is used if it exists;
        otherwise the markers are drawn without a colour scale.
    """
    # Only fall back to the cluster labels once they have been created, so the
    # function also works in cells that run before the clustering step.
    if color is None and 'cluster' in df.columns:
        color = df['cluster']

    marker = dict(size=10, line_width=1)
    if color is not None:
        # Colour by the supplied values using one of plotly's built-in colorscales
        marker.update(color=color, colorscale='Rainbow', showscale=True)

    fig = go.Figure(data=go.Scatter(
        x=component1,
        y=component2,
        mode='markers',
        marker=marker,
    ))
    fig.update_layout(margin=dict(l=100, r=100, b=100, t=100), width=900, height=500)
    fig.layout.template = 'plotly_dark'

    fig.show()

In [153]:
# Function for 3D plot
def plot_3d(component1, component2, component3, color=None):
    """Show an interactive 3D scatter plot of three component arrays.

    Parameters
    ----------
    component1, component2, component3 : array-like
        x, y and z coordinates of the points.
    color : array-like, optional
        Per-point values used to colour the markers. When omitted, the
        notebook-level ``df['cluster']`` column is used if it exists;
        otherwise the markers are drawn without a colour scale.
    """
    # Only fall back to the cluster labels once they have been created, so the
    # function also works in cells that run before the clustering step.
    if color is None and 'cluster' in df.columns:
        color = df['cluster']

    marker = dict(size=7, opacity=1, line_width=1)
    if color is not None:
        # Colour by the supplied values using one of plotly's built-in colorscales
        marker.update(color=color, colorscale='Rainbow')

    fig = go.Figure(data=[go.Scatter3d(
        x=component1,
        y=component2,
        z=component3,
        mode='markers',
        marker=marker,
    )])
    # Small margins so the 3D scene fills most of the figure
    fig.update_layout(margin=dict(l=50, r=50, b=50, t=50), width=900, height=500)
    fig.layout.template = 'plotly_dark'

    fig.show()

In [123]:
# 2D scatter of the first two principal components
plot_2d(components[:, 0],components[:, 1])

In [124]:
# 3D scatter of the first three principal components
plot_3d(components[:, 0],components[:, 1],components[:, 2])

# PCA after feature extraction



In [None]:
# Inspect column names, dtypes and non-null counts before choosing the clustering features
# (clustering is unsupervised, so there is a feature matrix X but no target y).

df.info()

### Remember:
Do NOT train/test split — clustering is unsupervised, so there is no target variable to hold out and evaluate against.

In [129]:
# Cluster on annual income and spending score only
feature_cols = ['Annual Income (k$)', 'Spending Score (1-100)']
x = df[feature_cols]
x.head()

Unnamed: 0,Annual Income (k$),Spending Score (1-100)
0,15,39
1,15,81
2,16,6
3,16,77
4,17,40


In [130]:
# Standardise the two selected features to zero mean and unit variance
x_scaler = StandardScaler()
x_scaled = x_scaler.fit_transform(x)

In [145]:
# Fit k-means for k = 2..10 and record the silhouette score and inertia of each fit;
# both curves are plotted in the next cell.
ks = range(2,11)

sils = []
inertias = []

for k in ks:
  # Fixed random_state and explicit n_init make the curves reproducible across runs
  # and avoid the n_init default-change warning in scikit-learn 1.2/1.3.
  model = KMeans(n_clusters=k, n_init=10, random_state=42)
  model.fit(x_scaled)
  sils.append(silhouette_score(x_scaled, model.labels_))
  inertias.append(model.inertia_)

### Plot inertia and silhouette score on two different y-axes using a twin-axis plot



In [None]:

# Inertia and silhouette score against k, sharing the x-axis but each on its own y-axis
fig, ax = plt.subplots(figsize=(10, 8))

# Left y-axis: inertia
ax.plot(ks, inertias, color='red', marker='*', label='Inertia')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Inertia', fontsize=12)
ax.legend()

# Right y-axis: silhouette score
ax2 = ax.twinx()
ax2.plot(ks, sils, color='blue', marker='+', label='Silhouette Score')
ax2.set_ylabel('Silhouette Score', fontsize=12)
ax2.legend(loc='upper left')

plt.show()


## Final Model and Clustering the Dataset

In [134]:
# Final model: k-means with k = 5, the value chosen from the elbow plot and silhouette score.
# random_state and n_init are fixed so the cluster labels are reproducible across runs.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
kmeans.fit(x_scaled)
kmeans

In [135]:
# Store each row's cluster label from the fitted k-means model in a new 'cluster' column
df['cluster'] = kmeans.labels_


In [None]:
# Preview the first rows to confirm the 'cluster' column is present
df.head()

## PCA with cluster

In [137]:
# Re-standardise the dataframe, which now also carries the 'cluster' column
cluster_scaler = StandardScaler()
df_scaled = cluster_scaler.fit_transform(df)

In [141]:
# Project the re-standardised data onto three principal components
pca = PCA(n_components=3)
components = pca.fit_transform(df_scaled)


In [142]:
# Wrap the new PCA output in a DataFrame with one labelled column per principal component
pc_labels = ['principal component 1', 'principal component 2', 'principal component 3']
principal = pd.DataFrame(data=components, columns=pc_labels)

In [155]:
# 2D scatter of the first two principal components from the PCA above
plot_2d(components[:, 0],components[:, 1])

In [154]:
# 3D scatter of the first three principal components from the PCA above
plot_3d(components[:, 0],components[:, 1],components[:, 2])