In [30]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering, KMeans
import hvplot.pandas
import plotly.figure_factory as ff

In [20]:
# Load the iris CSV and standardise its feature columns.
# NOTE(review): the location below is an absolute, machine-specific path —
# confirm or adjust it before running this notebook on another machine.
iris_csv_path = '/Users/catherinesmith/Desktop/unc_bootcamp/module_19/UNCBC-HW19-UnsupervisedML/resources/data/iris.csv'

# The 'class' column is removed so that only the remaining columns are scaled.
df = pd.read_csv(iris_csv_path).drop(columns=['class'])

feature_scaler = StandardScaler()
scaled_df = feature_scaler.fit_transform(df)

In [21]:
# Project the scaled data onto its first two principal components.
pca = PCA(n_components=2)
iris_pca = pca.fit_transform(scaled_df)

# Wrap the projected array in a DataFrame with one named column per component.
pca_df = pd.DataFrame(iris_pca, columns=['PC1', 'PC2'])

# Preview the first rows.
pca_df.head()

Unnamed: 0,PC1,PC2
0,-2.264542,0.505704
1,-2.086426,-0.655405
2,-2.36795,-0.318477
3,-2.304197,-0.575368
4,-2.388777,0.674767


In [22]:
# Fraction of the total variance captured by each of the two fitted
# principal components (one entry per component).
pca.explained_variance_ratio_

array([0.72770452, 0.23030523])

In [26]:
# Determine the optimal number of clusters with an elbow curve.

# Candidate cluster counts: 1 through 10.
k = list(range(1, 11))

# Fit one K-means model per candidate count and record its inertia.
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(pca_df).inertia_
    for n_clusters in k
]

# Plot inertia against k; the bend in the curve suggests a suitable k.
elbow_df = pd.DataFrame({'k': k, 'inertia': inertia})
elbow_df.hvplot.line(x='k', y='inertia', xticks=k, title='Elbow Curve')

In [27]:
# Cluster the PCA projection into three groups with K-means.
# Only the component columns are used as features, so re-running this cell
# after "class" has been added does not feed old labels back into the model.
model = KMeans(n_clusters=3, random_state=0)

# Fit and label in a single pass; fit_predict returns the labels that are
# also stored in model.labels_, so no separate predict call is needed.
predictions = model.fit_predict(pca_df[["PC1", "PC2"]])

# Add the predicted class column and preview the result.
pca_df["class"] = predictions
pca_df.head()

Unnamed: 0,PC1,PC2,class
0,-2.264542,0.505704,0
1,-2.086426,-0.655405,0
2,-2.36795,-0.318477,0
3,-2.304197,-0.575368,0
4,-2.388777,0.674767,0


In [29]:
# Plot the points in principal-component space, grouped by their "class" label.
pca_df.hvplot.scatter(x="PC1", y="PC2", by="class", hover_cols=["class"])

In [32]:
# Create the dendrogram from the two principal components only.
# By this point pca_df also holds a "class" column of earlier cluster labels;
# leaving it in would let those labels distort the linkage distances.
dendrogram_features = pca_df[["PC1", "PC2"]].to_numpy()
fig = ff.create_dendrogram(dendrogram_features, color_threshold=5)
fig.update_layout(width=800, height=500)
fig.show()

In [40]:
# Hierarchical (agglomerative) clustering into two groups.
# Fit on the principal components only: pca_df already carries a "class"
# column from the earlier K-means step, which must not be used as a feature.
agg = AgglomerativeClustering(n_clusters=2)
model = agg.fit(pca_df[["PC1", "PC2"]])

In [41]:
# Store the labels from the fitted clustering model in the "class" column,
# replacing whatever labels the column held before, then preview the frame.
pca_df['class']=model.labels_
pca_df.head()

Unnamed: 0,PC1,PC2,class
0,-2.264542,0.505704,1
1,-2.086426,-0.655405,1
2,-2.36795,-0.318477,1
3,-2.304197,-0.575368,1
4,-2.388777,0.674767,1


In [42]:
# Scatter plot of PC1 against PC2, with points grouped by the "class" column.
pca_df.hvplot.scatter(x='PC1', y='PC2', by='class', hover_cols=['class'])