In [1]:
# Import required libraries and dependencies
import pandas as pd
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the data into a Pandas DataFrame
df_market_data = pd.read_csv(
    "Resources/crypto_market_data.csv",
    index_col="coin_id")

# Display sample data
df_market_data.head(10)

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,1.08388,7.60278,6.57509,7.67258,-3.25185,83.5184,37.51761
ethereum,0.22392,10.38134,4.80849,0.13169,-12.8889,186.77418,101.96023
tether,-0.21173,0.04935,0.0064,-0.04237,0.28037,-0.00542,0.01954
ripple,-0.37819,-0.60926,2.24984,0.23455,-17.55245,39.53888,-16.60193
bitcoin-cash,2.90585,17.09717,14.75334,15.74903,-13.71793,21.66042,14.49384
binancecoin,2.10423,12.85511,6.80688,0.05865,36.33486,155.61937,69.69195
chainlink,-0.23935,20.69459,9.30098,-11.21747,-43.69522,403.22917,325.13186
cardano,0.00322,13.99302,5.55476,10.10553,-22.84776,264.51418,156.09756
litecoin,-0.06341,6.60221,7.28931,1.21662,-17.2396,27.49919,-12.66408
bitcoin-cash-sv,0.9253,3.29641,-1.86656,2.88926,-24.87434,7.42562,93.73082


In [3]:
# Generate summary statistics
df_market_data.describe()

Unnamed: 0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
count,41.0,41.0,41.0,41.0,41.0,41.0,41.0
mean,-0.269686,4.497147,0.185787,1.545693,-0.094119,236.537432,347.667956
std,2.694793,6.375218,8.376939,26.344218,47.365803,435.225304,1247.842884
min,-13.52786,-6.09456,-18.1589,-34.70548,-44.82248,-0.3921,-17.56753
25%,-0.60897,0.04726,-5.02662,-10.43847,-25.90799,21.66042,0.40617
50%,-0.06341,3.29641,0.10974,-0.04237,-7.54455,83.9052,69.69195
75%,0.61209,7.60278,5.51074,4.57813,0.65726,216.17761,168.37251
max,4.84033,20.69459,24.23919,140.7957,223.06437,2227.92782,7852.0897


In [11]:
# Plot your data to see what's in your DataFrame
df_market_data.hvplot.line(
    width=800,
    height=600,
    rot=90
)

---

### Prepare the Data

In [5]:
# Use the `StandardScaler()` module from scikit-learn to normalize the data from the CSV file
scaler = StandardScaler()
# Standardize the data extracted from the CSV file using the scaler
scaled_data = scaler.fit_transform(df_market_data)

In [6]:
# Create a DataFrame with the scaled data
scaled_data_df = pd.DataFrame(scaled_data, columns=df_market_data.columns, index=df_market_data.index)

# Copy the crypto names from the original data
scaled_data_df['coin_id'] = df_market_data.index

# Set the coinid column as index
scaled_data_df.set_index('coin_id', inplace=True)

# Display sample data
scaled_data_df.head()

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,0.508529,0.493193,0.7722,0.23546,-0.067495,-0.355953,-0.251637
ethereum,0.185446,0.934445,0.558692,-0.054341,-0.273483,-0.115759,-0.199352
tether,0.021774,-0.706337,-0.02168,-0.06103,0.008005,-0.550247,-0.282061
ripple,-0.040764,-0.810928,0.249458,-0.050388,-0.373164,-0.458259,-0.295546
bitcoin-cash,1.193036,2.000959,1.76061,0.545842,-0.291203,-0.499848,-0.270317


---

### Find the Best Value for k Using the Original Data.

In [7]:
# Create a list with the number of k-values from 1 to 11
k_values = list(range(1, 12))
print(k_values)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]


In [8]:
# Create an empty list to store the inertia values


# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `df_market_data_scaled`
# 3. Append the model.inertia_ to the inertia list

inertia_values = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(scaled_data_df)
    inertia_values.append(kmeans.inertia_)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [9]:
# Create a dictionary with the data to plot the Elbow curve
elbow_data = {
    'k': k_values,
    'inertia': inertia_values
}

# Create a DataFrame with the data to plot the Elbow curve
df_elbow = pd.DataFrame(elbow_data)

# Display the DataFrame
df_elbow.head()


Unnamed: 0,k,inertia
0,1,287.0
1,2,195.820218
2,3,123.190482
3,4,79.022435
4,5,65.302379


In [12]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
inertia_plot = pd.DataFrame(inertia_values, columns=['inertia'], index=k_values).hvplot.line(
    x='index',
    y='inertia',
    xlabel='Number of clusters (k)',
    ylabel='Inertia',
    title='Elbow Curve',
    width=800,
    height=600,
    grid=True
)
inertia_plot

#### Answer the following question: 

**Question:** What is the best value for `k`?

**Answer:** 

Based on the analysis using the Elbow Method, it is suggested that the dataset is optimally segmented into 4 clusters.

Utilizing the Elbow Method entails plotting the inertia values against the number of clusters. Throughout this examination, inertia steadily decreased with an increase in the number of clusters. However, beyond k=4, this decline in inertia slowed, resulting in a more gradual decrease. This pivotal point, where the rate of decrease alters notably, is commonly referred to as the "elbow."

The selection of k=4 signifies a compromise between model complexity (additional clusters) and model performance (lower inertia). Post k=4, the diminishing returns in inertia reduction imply that additional clusters may not significantly enhance the clustering accuracy. Hence, k=4 marks a balance where the clustering effectiveness is reasonably optimized without overly complexifying the model.

---

### Cluster Cryptocurrencies with K-means Using the Original Data

In [13]:
# Initialize the K-Means model using the best value for k
kmeans = KMeans(n_clusters=4, random_state=42)

In [14]:
# Fit the K-Means model using the scaled data
kmeans.fit(scaled_data_df)

  super()._check_params_vs_input(X, default_n_init=10)


In [16]:
# Predict the clusters to group the cryptocurrencies using the scaled data
clusters = kmeans.predict(scaled_data_df)

# Print the resulting array of cluster values.
print(clusters)

[0 0 2 2 0 0 0 0 0 2 2 2 2 0 2 0 2 2 0 2 2 0 2 2 2 2 2 2 0 2 2 2 3 0 2 2 1
 2 2 2 2]


In [21]:
# Create a copy of the DataFrame
clustered_df = scaled_data_df.copy()

In [22]:
# Add a new column to the DataFrame with the predicted clusters
clustered_df['cluster'] = kmeans.labels_

# Display sample data
clustered_df.head()

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y,cluster
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
bitcoin,0.508529,0.493193,0.7722,0.23546,-0.067495,-0.355953,-0.251637,0
ethereum,0.185446,0.934445,0.558692,-0.054341,-0.273483,-0.115759,-0.199352,0
tether,0.021774,-0.706337,-0.02168,-0.06103,0.008005,-0.550247,-0.282061,2
ripple,-0.040764,-0.810928,0.249458,-0.050388,-0.373164,-0.458259,-0.295546,2
bitcoin-cash,1.193036,2.000959,1.76061,0.545842,-0.291203,-0.499848,-0.270317,0


In [23]:
# Create a scatter plot using hvPlot by setting 
# `x="price_change_percentage_24h"` and `y="price_change_percentage_7d"`. 
# Color the graph points with the labels found using K-Means and 
# add the crypto name in the `hover_cols` parameter to identify 
# the cryptocurrency represented by each data point.

clustered_df.hvplot.scatter(
    x="price_change_percentage_24h",
    y="price_change_percentage_7d",
    by="cluster",
    hover_cols=["coin_id"],
    width=800,
    height=600
)

---

### Optimize Clusters with Principal Component Analysis.

In [25]:
# Create a PCA model instance and set `n_components=3`.
pca_model = PCA(n_components=3)

In [27]:
# Use the PCA model with `fit_transform` to reduce to 
# three principal components.
principal_components = pca_model.fit_transform(scaled_data_df)

# View the first five rows of the DataFrame. 
principal_df = pd.DataFrame(data = principal_components, columns = ['Principal Component 1', 'Principal Component 2', 'Principal Component 3'], index=scaled_data_df.index)
principal_df.head()

Unnamed: 0_level_0,Principal Component 1,Principal Component 2,Principal Component 3
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bitcoin,-0.600667,0.84276,0.461595
ethereum,-0.458261,0.458466,0.952877
tether,-0.43307,-0.168126,-0.641752
ripple,-0.471835,-0.22266,-0.479053
bitcoin-cash,-1.1578,2.041209,1.859715


In [31]:
# Retrieve the explained variance to determine how much information 
# can be attributed to each principal component.
variance_explained = pca_model.explained_variance_ratio_

variance_explained_df = pd.DataFrame(
    data=variance_explained,
    columns=['Variance Explained'],
    index=['Principal Component 1', 'Principal Component 2', 'Principal Component 3']
)

# Display the explained variance DataFrame
variance_explained_df

Unnamed: 0,Variance Explained
Principal Component 1,0.371986
Principal Component 2,0.347008
Principal Component 3,0.176038


#### Answer the following question: 

**Question:** What is the total explained variance of the three principal components?

**Answer:** 

The cumulative explained variance of the three principal components aggregates the individual contributions of each component. Adding up the explained variances of Principal Component1, Principal Component2, and Principal Component3 yields the total explained variance. For instance, if we substitute the specific values from our analysis:

Total Explained Variance = Explained Variance of Principal Component1 + Explained Variance of Principal Component2 + Explained Variance of Principal Component3

By plugging in the numbers:
Total Explained Variance = 0.371986 + 0.347008 + 0.176038

We find:
Total Explained Variance ≈ 0.895

Hence, the total explained variance of approximately 0.895 denotes that these three principal components jointly clarify about 89.5% of the overall variance in the dataset.

Expressed differently, a total explained variance nearing 0.895 suggests that Principal Component1, Principal Component2, and Principal Component3 collectively encapsulate a significant portion of the dataset's variability.

This high cumulative explained variance indicates that these principal components effectively condense and encapsulate the dataset's information into a lower-dimensional representation. Such compression facilitates easier data comprehension and visualization while retaining a substantial amount of the original variance.

In [34]:
# Create a new DataFrame with the PCA data.
pca_df = pd.DataFrame(principal_components, columns=['Principal Component 1', 'Principal Component 2', 'Principal Component 3'], index=scaled_data_df.index)

# Copy the crypto names from the original data
pca_df['coin_id'] = scaled_data_df.index

# Set the coinid column as index
pca_df.set_index('coin_id', inplace=True)

# Display sample data
pca_df.head()

Unnamed: 0_level_0,Principal Component 1,Principal Component 2,Principal Component 3
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bitcoin,-0.600667,0.84276,0.461595
ethereum,-0.458261,0.458466,0.952877
tether,-0.43307,-0.168126,-0.641752
ripple,-0.471835,-0.22266,-0.479053
bitcoin-cash,-1.1578,2.041209,1.859715


---

### Find the Best Value for k Using the PCA Data

In [21]:
# Create a list with the number of k-values from 1 to 11
k_values = list(range(1, 12))

In [35]:
# Create an empty list to store the inertia values
inertia_values_pca = []

# Create a for loop to compute the inertia with each possible value of k
for k in k_values:
# Inside the loop:
    # 1. Create a KMeans model using the loop counter for the n_clusters
    kmeans_pca = KMeans(n_clusters=k, random_state=42)
    # 2. Fit the model to the data using `df_market_data_pca`
    kmeans_pca.fit(pca_df)
    # 3. Append the model.inertia_ to the inertia list
    inertia_values_pca.append(kmeans_pca.inertia_)

# Display the inertia values
inertia_values_pca

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


[256.87408556789256,
 165.9019940203601,
 93.77462568057295,
 49.66549665179736,
 37.83946598681242,
 30.77774614144119,
 21.134056037473606,
 17.091636643864742,
 13.681139692992751,
 10.630647573870965,
 8.128620320213491]

In [36]:
# Create a dictionary with the data to plot the Elbow curve
elbow_data_pca = {
    'k': k_values,
    'inertia': inertia_values_pca
}

# Create a DataFrame with the data to plot the Elbow curve
df_elbow_pca = pd.DataFrame(elbow_data_pca)

# Display the DataFrame
df_elbow_pca

Unnamed: 0,k,inertia
0,1,256.874086
1,2,165.901994
2,3,93.774626
3,4,49.665497
4,5,37.839466
5,6,30.777746
6,7,21.134056
7,8,17.091637
8,9,13.68114
9,10,10.630648


In [37]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.

inertia_plot_pca = df_elbow_pca.hvplot.line(
    x='k',
    y='inertia',
    xlabel='Number of Principal Components (k)',
    ylabel='Inertia',
    title='Inertia (PCA)',
    width=800,
    height=600,
    grid=True
)

# Display the plot
inertia_plot_pca

#### Answer the following questions: 

* **Question:** What is the best value for `k` when using the PCA data?

  * **Answer:**
    The optimal number of clusters (k) when utilizing the PCA data is also 4. This determination was made by analyzing the inertia values computed for various values of k using the Elbow Method. By plotting the inertia values against the number of clusters, the Elbow Method helps identify the point where adding more clusters does not significantly improve the clustering quality. In this case, the inertia values showed a prominent decrease up to k=4, suggesting that four clusters provide a suitable balance between model complexity and clustering performance.

* **Question:** Does it differ from the best k value found using the original data?

  * **Answer:** 
    No, the optimal k value obtained using the PCA data (4) is not different from the best k value obtained using the original dataset. Both analyses, conducted using the Elbow Method, converged on k=4 as the optimal number of clusters for this dataset. This consistency indicates that the underlying clustering structure of the data, whether represented by the original features or the principal components obtained through PCA, is best characterized by four clusters.

### Cluster Cryptocurrencies with K-means Using the PCA Data

In [38]:
# Initialize the K-Means model using the best value for k
kmeans_pca = KMeans(n_clusters=4, random_state=42)

In [39]:
# Fit the K-Means model using the PCA data
kmeans_pca.fit(pca_df)

  super()._check_params_vs_input(X, default_n_init=10)


In [40]:
# Predict the clusters to group the cryptocurrencies using the PCA data
clusters_pca = kmeans_pca.predict(pca_df)

# Print the resulting array of cluster values.
print(clusters_pca)

[0 0 2 2 0 0 0 0 0 2 2 2 2 0 2 0 2 2 0 2 2 0 2 2 2 2 2 2 0 2 2 2 3 0 2 2 1
 2 2 2 2]


In [43]:
# Create a copy of the DataFrame with the PCA data
pca_df_clustered = pca_df.copy()

# Add a new column to the DataFrame with the predicted clusters
pca_df_clustered['cluster'] = clusters_pca

# Display sample data
pca_df_clustered.head()

Unnamed: 0_level_0,Principal Component 1,Principal Component 2,Principal Component 3,cluster
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bitcoin,-0.600667,0.84276,0.461595,0
ethereum,-0.458261,0.458466,0.952877,0
tether,-0.43307,-0.168126,-0.641752,2
ripple,-0.471835,-0.22266,-0.479053,2
bitcoin-cash,-1.1578,2.041209,1.859715,0


In [46]:
# Create a scatter plot using hvPlot by setting 
# `x="PC1"` and `y="PC2"`. 
# Color the graph points with the labels found using K-Means and 
# add the crypto name in the `hover_cols` parameter to identify 
# the cryptocurrency represented by each data point.

# Create a scatter plot using hvPlot
scatter_plot_pca = pca_df_clustered.hvplot.scatter(
    x='Principal Component 1',
    y='Principal Component 2',
    by='cluster',
    hover_cols=['coin_id'],
    title='Clustered Cryptocurrencies (PCA Data)',
    xlabel='Principal Component 1',
    ylabel='Principal Component 2',
    width=800,
    height=600
)

# Display the plot
scatter_plot_pca

### Visualize and Compare the Results

In this section, you will visually analyze the cluster analysis results by contrasting the outcome with and without using the optimization techniques.

In [47]:
# Composite plot to contrast the Elbow curves
elbow_plot_composite = inertia_plot + inertia_plot_pca
elbow_plot_composite

In [50]:
# Composite plot to contrast the clusters
# YOUR CODE HERE!

# Create a scatter plot using hvPlot for the original data
scatter_plot_original = clustered_df.hvplot.scatter(
    x="price_change_percentage_24h",
    y="price_change_percentage_7d",
    by="cluster",
    hover_cols=["coin_id"],
    width=800,
    height=500
)
# Create a scatter plot using hvPlot for the PCA data
scatter_plot_pca = pca_df_clustered.hvplot.scatter(
    x='Principal Component 1',
    y='Principal Component 2',
    by='cluster',
    hover_cols=['coin_id'],
    title='Clustered Cryptocurrencies (PCA Data)',
    xlabel='Principal Component 1',
    ylabel='Principal Component 2',
    width=800,
    height=500
)

# Composite plot to contrast the clusters
cluster_plot_composite = scatter_plot_original + scatter_plot_pca

# Display the composite plot
cluster_plot_composite

#### Answer the following question: 

  * **Question:** After visually analyzing the cluster analysis results, what is the impact of using fewer features to cluster the data using K-Means?


* **Answer:** 

Upon visual inspection of the cluster analysis outcomes for both the initial dataset and the PCA-transformed data, several noteworthy observations emerge regarding the ramifications of utilizing fewer features for K-Means clustering:

Cluster Dispersion: In the primary dataset, the clusters exhibit a more extensive dispersion, suggesting a broader array of cryptocurrencies based on their percentage changes in price over 24 hours and 7 days. This implies that directly employing these two features leads to more dispersed clusters.

Cluster Compactness: The PCA-transformed data displays higher cluster compactness, particularly for clusters 2 and 0, indicating that cryptocurrencies within these clusters share greater similarity in their price change behaviors. This implies that PCA successfully captures the underlying data patterns, resulting in more tightly-knit clusters.

Visualization Clarity: The PCA-transformed data offers a clearer visualization of the clusters, featuring distinct groupings that are more easily discernible compared to the original dataset. This indicates that utilizing PCA to reduce data dimensionality can yield more interpretable and actionable insights.

Dimensionality Reduction: Overall, employing fewer features (specifically, principal components) for K-Means clustering simplifies the analysis while retaining much of the underlying data structure. This can be advantageous for tasks prioritizing interpretability and simplicity.

In summary, the consequences of using fewer features (principal components) for K-Means clustering manifest in the creation of more compact and interpretable clusters, as evident in the PCA-transformed data vis-a-vis the original dataset. This underscores the value of dimensionality reduction techniques like PCA in simplifying intricate datasets for clustering analyses.