In [2]:
# Import the modules
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [3]:
# Read in the CSV file as a Pandas DataFrame, using the "date" column as a
# parsed datetime index.
# NOTE: `infer_datetime_format` is intentionally not passed: it is deprecated
# since pandas 2.0 (format inference is the default behavior there) and
# supplying it only triggers a warning.
spread_df = pd.read_csv(
    Path("../Resources/stock_data.csv"),
    index_col="date",
    parse_dates=True,
)

# Review the DataFrame
spread_df.head()

Unnamed: 0_level_0,close,volume,open,high,low,returns,hi_low_spread
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2009-04-30,3.61,18193730,3.55,3.73,3.53,0.02849,0.2
2009-05-01,3.82,16233940,3.55,3.9,3.55,0.058172,0.35
2009-05-04,4.26,21236940,3.9,4.3,3.83,0.115183,0.47
2009-05-05,4.32,16369170,4.36,4.39,4.11,0.014085,0.28
2009-05-06,4.31,15075630,4.45,4.45,4.12,-0.002315,0.33


In [32]:
# Standardize every column so that features on very different scales
# (e.g. volume in the tens of millions vs. hi_low_spread below 1) contribute
# comparably to the distance-based K-means clustering that follows.
scale_volume = StandardScaler()
spread_df_scaled = pd.DataFrame(
    scale_volume.fit_transform(spread_df),
    columns=spread_df.columns,
    # Keep the date index; without it the scaled rows fall back to 0..n-1
    # and can no longer be aligned with spread_df by label.
    index=spread_df.index,
)

# Scatter of the two scaled features that the clustering cells use
spread_df_scaled.hvplot.scatter(
    x="hi_low_spread",
    y="volume",
)



In [33]:
# Candidate cluster counts to evaluate: 1 through 10 inclusive
k_list = [k for k in range(1, 11)]

# Accumulator for the inertia measured at each candidate k
inertia = list()


In [34]:
# Evaluate K-means for every candidate k and record each model's inertia.
# The list is rebuilt from scratch here so that re-running this cell cannot
# append a second batch of values and leave `inertia` longer than `k_list`.
inertia = []
for i in k_list:
    kmeans = KMeans(n_clusters=i, random_state=1)
    # Fit the model on the two scaled clustering features
    kmeans.fit(spread_df_scaled[["volume", "hi_low_spread"]])
    # Append the value of the computed inertia from the `inertia_` attribute of the KMeans model instance
    inertia.append(kmeans.inertia_)

print(k_list)
print(inertia)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
[5032.0, 2020.3543286086697, 1222.6420334307827, 905.3804981315091, 723.9503400932938, 619.6821614933701, 539.1943051988112, 483.0703874207922, 426.0142842175709, 386.3792225771971]


In [35]:
# Pair each candidate k with the inertia measured for it
inertia_dict = dict(k=k_list, inertia=inertia)

# Turn the inertia_dict dictionary into a two-column DataFrame
inertia_df = pd.DataFrame(data=inertia_dict)

# Display the table
inertia_df

Unnamed: 0,k,inertia
0,1,5032.0
1,2,2020.354329
2,3,1222.642033
3,4,905.380498
4,5,723.95034
5,6,619.682161
6,7,539.194305
7,8,483.070387
8,9,426.014284
9,10,386.379223


In [36]:
# Plot inertia against k (the elbow curve). A title makes the figure
# self-explanatory, and explicit ticks keep the x-axis on the integer k
# values that were actually evaluated.
inertia_df.hvplot.line(
    x="k",
    y="inertia",
    title="Elbow Curve",
    xticks=inertia_df["k"].tolist(),
)

## Perform the following tasks for each of the two most likely values of `k`:

* Define a K-means model using `k` to define the clusters, fit the model, make predictions, and add the prediction values to a copy of the scaled DataFrame and call it `spread_df_predictions`.

* Plot the clusters. The x-axis should reflect the "hi_low_spread", and the y-axis should reflect the "close" price.

In [37]:
# Lower candidate value of k: three clusters, with random_state=1
kmeans = KMeans(random_state=1, n_clusters=3)

# The model is trained on, and then applied to, the same two scaled features
cluster_features = spread_df_scaled[["volume","hi_low_spread"]]
kmeans.fit(cluster_features)
predictions = kmeans.predict(cluster_features)

# Work on a copy of spread_df so the source frame is left untouched,
# then store the cluster label of each row in a new column
spread_df_predictions = spread_df.copy()
spread_df_predictions["ClassPredict"] = predictions


In [38]:
# Scatter of hi_low_spread vs. volume, with points grouped by their ClassPredict label
spread_df_predictions.hvplot.scatter(x="hi_low_spread", y="volume", by="ClassPredict")

In [8]:
# Plot the lower-k clusters against the closing price, as the task description
# asks (x-axis: "hi_low_spread", y-axis: "close"). This cell used to repeat the
# preceding volume plot verbatim, so the requested view was missing for this k.
spread_df_predictions.hvplot.scatter(
    x="hi_low_spread",
    y="close",
    by="ClassPredict"
)

In [9]:
# Define the model with the higher value of k clusters
# Use a random_state of 1 to generate the model
model = KMeans(n_clusters=4, random_state=1)

# Fit on the same scaled features used for the elbow curve and the lower-k
# model. Fitting on the raw, unscaled frame would let `volume` (values in the
# tens of millions) dominate every distance and would make these clusters
# incomparable with the lower-k ones.
model.fit(spread_df_scaled[["volume", "hi_low_spread"]])

# Make predictions
k_higher = model.predict(spread_df_scaled[["volume", "hi_low_spread"]])

# Add a class column with the labels to the spread_df_predictions DataFrame
spread_df_predictions['clusters_higher'] = k_higher

In [10]:
# Scatter of hi_low_spread vs. close, grouped by the clusters_higher label
higher_k_scatter = spread_df_predictions.hvplot.scatter(
    by="clusters_higher",
    x="hi_low_spread",
    y="close",
)

# Render the y-axis tick labels without decimals and display the figure
higher_k_scatter.opts(yformatter="%.0f")

## Answer the following question

* Considering the plot, what’s the best number of clusters to choose, or value of k? 

From the scatter plots, it's a little hard to tell given the variability and quantity of the data (zooming in helps), but it appears that the optimal value for k, the number of clusters, is 3.