In [4]:
# Initial imports
import pandas as pd
from sklearn.cluster import KMeans
import plotly.express as px
import hvplot.pandas

In [2]:
# Read the iris measurements from disk, then preview the first five rows
df_iris = pd.read_csv(
    'resources/new_iris_data.csv'
)

df_iris.head(n=5)

Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width
0,5.1,1.4,3.5,0.2
1,4.9,1.4,3.0,0.2
2,4.7,1.3,3.2,0.2
3,4.6,1.5,3.1,0.2
4,5.0,1.4,3.6,0.2


## Elbow Curve with known number of clusters
We'll start by looking at the iris data, since we already know the end result should be 3 clusters.

In [3]:
# Candidate cluster counts (1 through 10) and an empty list that will collect
# the inertia measured for each of them

k = [*range(1, 11)]

inertia = list()

In [5]:
# looking for the best K

# Start from an empty list inside this cell: if the cell is re-run, appending
# to the existing list would leave inertia longer than k, and the elbow
# DataFrame built from both would fail on mismatched lengths.
inertia = []

for i in k:
    # Fit one model per candidate cluster count; random_state is fixed so the
    # curve is reproducible between runs
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df_iris)
    # inertia_ is the fitted model's within-cluster sum of squared distances
    inertia.append(km.inertia_)

  "KMeans is known to have a memory leak on Windows "


In [6]:
# Pair every tested k with its inertia in a DataFrame so hvPlot can draw the elbow curve

elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(data=elbow_data)

df_elbow.hvplot.line(
    x="k",
    y="inertia",
    xticks=k,
    title="Elbow Curve",
)

# We're looking for the point that acts as the "elbow", i.e., where the line flattens into a nearly horizontal line after decreasing sharply.
# 3 is our answer here.

## Elbow Curve with unknown number of clusters

Now we'll look at the shopping data to see how this graph looks when we don't know the number of clusters in advance.

In [7]:
# Read the cleaned shopping data from disk, then preview the first five rows
df_shopping = pd.read_csv(
    'resources/shopping_data_cleaned.csv'
)

df_shopping.head(n=5)

Unnamed: 0,CardMember,Age,Annual_Inc,SpendingScore
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [8]:
# Start over for the shopping data: same range of K values, fresh inertia list
k = [*range(1, 11)]
inertia = list()

In [9]:
# Start from an empty list inside this cell: if the cell is re-run, appending
# to the existing list would leave inertia longer than k, and the elbow
# DataFrame built from both would fail on mismatched lengths.
inertia = []

for i in k:
    # Fit one model per candidate cluster count; random_state is fixed so the
    # curve is reproducible between runs
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df_shopping)
    # inertia_ is the fitted model's within-cluster sum of squared distances
    inertia.append(km.inertia_)

  "KMeans is known to have a memory leak on Windows "


In [14]:
# Pair every tested k with its inertia so the shopping elbow curve can be drawn
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(data=elbow_data)

df_elbow.hvplot.line(
    x="k",
    y="inertia",
    title="Elbow Curve",
    xticks=k,
    width=700,
)

# Here the answer isn't immediately obvious: either point 5 or point 6 could be argued to have a strong leveling out after it.
# The elbow curve doesn't always give us the final answer, but now at least we've narrowed our choices down to two: 5 groups or 6.

### Choosing between 5 or 6 clusters

We'll build a DataFrame for each number of clusters, then look at them in 2D and 3D to see which aligns best with our purposes.

In [15]:
def get_clusters(k: int, data: pd.DataFrame) -> pd.DataFrame:
    """Cluster ``data`` with K-Means and return a labelled copy.

    Parameters
    ----------
    k : int
        Number of clusters, passed to ``KMeans`` as ``n_clusters``.
    data : pandas.DataFrame
        Features to cluster. The frame is copied first, so the caller's
        object is left untouched.

    Returns
    -------
    pandas.DataFrame
        The copy of ``data`` with an added ``"class"`` column holding the
        cluster label assigned to each row.
    """
    # Work on a copy so the caller's DataFrame never gains the "class" column
    data = data.copy()

    # random_state is fixed so repeated calls give the same clustering
    model = KMeans(n_clusters=k, random_state=0)
    model.fit(data)

    # fit() already stores each training row's label in labels_, so a separate
    # predict() pass over the same data is unnecessary
    data["class"] = model.labels_

    return data

In [16]:
# Cluster the shopping data twice, once per candidate cluster count (5, then 6)
five_clusters, six_clusters = (
    get_clusters(n_groups, df_shopping) for n_groups in (5, 6)
)

### Five Clusters

In [19]:
# 2D view: annual income against spending score, grouped by cluster label
five_clusters.hvplot.scatter(
    by="class",
    x="Annual_Inc",
    y="SpendingScore",
)

In [20]:
# 3D view of the five-cluster result: age, spending score and annual income,
# with colour and marker symbol both driven by the cluster label
fig = px.scatter_3d(
    five_clusters,
    x="Age", y="SpendingScore", z="Annual_Inc",
    color="class", symbol="class",
    width=800, height=600,
).update_layout(legend={"x": 0, "y": 1})  # update_layout returns the same figure

fig.show()

### Six Clusters

In [21]:
# 2D view: annual income against spending score, grouped by cluster label
six_clusters.hvplot.scatter(
    by="class",
    x="Annual_Inc",
    y="SpendingScore",
)

In [22]:
# 3D view of the six-cluster result: age, spending score and annual income,
# with colour and marker symbol both driven by the cluster label
fig = px.scatter_3d(
    six_clusters,
    x="Age", y="SpendingScore", z="Annual_Inc",
    color="class", symbol="class",
    width=800, height=600,
).update_layout(legend={"x": 0, "y": 1})  # update_layout returns the same figure

fig.show()

## Conclusion

Most likely we'll choose 6 clusters. You can see how it splits the customers with middling spending scores and middling annual income by age, which could help with decisions about marketing to them. But that's not a guaranteed answer. Depending on what we were doing, five clusters could also have been a completely valid choice. It all depends on what you're trying to accomplish.