In [73]:
import pandas as pd
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [74]:
demo_df = pd.read_csv("../Resources/Demographic_Diabetes_Data.csv", index_col = "Diabetes Status")

demo_df.head(10)

Unnamed: 0_level_0,Unnamed: 0,Sex,Age,Education,Income
Diabetes Status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,0,1.0,4.0,6.0,8.0
0.0,1,1.0,12.0,6.0,8.0
0.0,2,1.0,13.0,6.0,8.0
0.0,3,1.0,11.0,6.0,8.0
0.0,4,0.0,8.0,5.0,8.0
0.0,5,0.0,1.0,4.0,7.0
0.0,6,1.0,13.0,5.0,6.0
0.0,7,1.0,6.0,4.0,3.0
0.0,8,0.0,3.0,6.0,8.0
0.0,9,1.0,6.0,4.0,4.0


In [75]:
demo_df.describe()

Unnamed: 0.1,Unnamed: 0,Sex,Age,Education,Income
count,70692.0,70692.0,70692.0,70692.0,70692.0
mean,35345.5,0.456997,8.584055,4.920953,5.698311
std,20407.166952,0.498151,2.852153,1.029081,2.175196
min,0.0,0.0,1.0,1.0,1.0
25%,17672.75,0.0,7.0,4.0,4.0
50%,35345.5,0.0,9.0,5.0,6.0
75%,53018.25,1.0,11.0,6.0,8.0
max,70691.0,1.0,13.0,6.0,8.0


In [76]:
scaled_data = scaled_data = StandardScaler().fit_transform(demo_df)

In [77]:
scaled_df = pd.DataFrame(scaled_data, columns=demo_df.columns)

scaled_df["Diabetes Status"] = demo_df.index

scaled_df = scaled_df.set_index("Diabetes Status")

#Display sample data
scaled_df.head(10)

Unnamed: 0_level_0,Unnamed: 0,Sex,Age,Education,Income
Diabetes Status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,-1.732026,1.090046,-1.607237,1.048562,1.05816
0.0,-1.731977,1.090046,1.197681,1.048562,1.05816
0.0,-1.731928,1.090046,1.548296,1.048562,1.05816
0.0,-1.731879,1.090046,0.847066,1.048562,1.05816
0.0,-1.73183,-0.917392,-0.204778,0.076814,1.05816
0.0,-1.731781,-0.917392,-2.659082,-0.894934,0.598428
0.0,-1.731732,1.090046,1.548296,0.076814,0.138696
0.0,-1.731683,1.090046,-0.906008,-0.894934,-1.2405
0.0,-1.731634,-0.917392,-1.957852,1.048562,1.05816
0.0,-1.731585,1.090046,-0.906008,-0.894934,-0.780768


In [79]:
k = range(1,12)

In [80]:
inertia = []

# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `df_market_data_scaled`
# 3. Append the model.inertia_ to the inertia list
for i in k:
    model = KMeans(n_clusters=i)
    model.fit(scaled_df)
    inertia.append(model.inertia_)



In [81]:
elbow_data = {"k": k, "inertia": inertia}

# Create a DataFrame with the data to plot the Elbow curve
df_elbow = pd.DataFrame(elbow_data)

In [82]:
elbow_curve = df_elbow.hvplot.line(
    x = "k",
    y = "inertia",
    title = "Elbow Curve" 
)
elbow_curve

In [83]:
model = KMeans(n_clusters=3, random_state=1)

In [84]:
model.fit(scaled_df)



In [85]:
clusters = model.predict(scaled_df)

clusters

array([2, 2, 2, ..., 0, 0, 0], dtype=int32)

In [86]:
clusters_df = scaled_df.copy()

In [87]:
clusters_df["predicted clusters"] = clusters

# Display sample data
clusters_df.head(10)

Unnamed: 0_level_0,Unnamed: 0,Sex,Age,Education,Income,predicted clusters
Diabetes Status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,-1.732026,1.090046,-1.607237,1.048562,1.05816,2
0.0,-1.731977,1.090046,1.197681,1.048562,1.05816,2
0.0,-1.731928,1.090046,1.548296,1.048562,1.05816,2
0.0,-1.731879,1.090046,0.847066,1.048562,1.05816,2
0.0,-1.73183,-0.917392,-0.204778,0.076814,1.05816,1
0.0,-1.731781,-0.917392,-2.659082,-0.894934,0.598428,1
0.0,-1.731732,1.090046,1.548296,0.076814,0.138696,2
0.0,-1.731683,1.090046,-0.906008,-0.894934,-1.2405,2
0.0,-1.731634,-0.917392,-1.957852,1.048562,1.05816,1
0.0,-1.731585,1.090046,-0.906008,-0.894934,-0.780768,2


In [88]:
cluster_plot = clusters_df.hvplot.scatter(
    x = "Sex",
    y = "Age",
    hover_cols = "Diabetes Status",
    color = "predicted clusters"
)
cluster_plot