In [16]:
#import dependencies
import pandas
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hvplot.pandas

In [17]:
#create df
iris_df = pandas.read_csv("../resources/new_iris_data.csv")

In [18]:
#inspect data
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [19]:
#Standardize data with StandardScaler
iris_scaled = StandardScaler().fit_transform(iris_df)
print(iris_scaled[0:5])

[[-0.90068117  1.03205722 -1.3412724  -1.31297673]
 [-1.14301691 -0.1249576  -1.3412724  -1.31297673]
 [-1.38535265  0.33784833 -1.39813811 -1.31297673]
 [-1.50652052  0.10644536 -1.2844067  -1.31297673]
 [-1.02184904  1.26346019 -1.3412724  -1.31297673]]


In [20]:
#initialize PCA model
pca = PCA(n_components = 2)

In [21]:
#get two principal components for the iris data
iris_pca = pca.fit_transform(iris_scaled)

In [22]:
#transform PCA data to a DataFrame
iris_pca_df = pandas.DataFrame(
    data = iris_pca,
    columns = ["principal component 1", "principal component 2"]
)
iris_pca_df.head()

Unnamed: 0,principal component 1,principal component 2
0,-2.264542,0.505704
1,-2.086426,-0.655405
2,-2.36795,-0.318477
3,-2.304197,-0.575368
4,-2.388777,0.674767


In [23]:
# fetch the explained variance
pca.explained_variance_ratio_
'''
What this tells us is that the first principal component contains 72.77 % of the variance 
and the second contains 23.03 %. Together they contain 95.80 % of the information
'''

'\nWhat this tells us is that the first principal component contains 72.77 % of the variance \nand the second contains 23.03 %. Together they contain 95.80 % of the information\n'

In [24]:
#find the best value for K

#create k from range function
k = list(range(1,11))

#create inertia with k loop
inertia = [KMeans(n_clusters=i, random_state=0).fit(iris_pca_df).inertia_ for i in k]

#create elbow curve

pandas.DataFrame({
    "k":k,
    "inertia":inertia,
}).hvplot.line(
    x = "k",
    y = "inertia",
    xticks = k,
    title = "Elbow Curve"
)

In [25]:
#initialize the K-means model
model = KMeans(
    n_clusters = 3,
    random_state = 0
)
# fit the model
model.fit(iris_pca_df)

#predict clusters and save as class column
iris_pca_df["class"] = model.predict(iris_pca_df)

#inspect df
iris_pca_df.head()

Unnamed: 0,principal component 1,principal component 2,class
0,-2.264542,0.505704,1
1,-2.086426,-0.655405,1
2,-2.36795,-0.318477,1
3,-2.304197,-0.575368,1
4,-2.388777,0.674767,1


In [26]:
#plot the data
iris_pca_df.hvplot.scatter(
    x = "principal component 1",
    y = "principal component 2",
    hover_cols = ["class"],
    by = "class"
)