In [70]:
# Import required libraries and dependencies
import pandas as pd
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [71]:
# Load the data into a Pandas DataFrame
df_market_data = pd.read_csv(
    "Resources/crypto_market_data.csv",
    index_col="coin_id")
df_market_data.reset_index(inplace=True)

# Display sample data
df_market_data.head(10)

Unnamed: 0,coin_id,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
0,bitcoin,1.08388,7.60278,6.57509,7.67258,-3.25185,83.5184,37.51761
1,ethereum,0.22392,10.38134,4.80849,0.13169,-12.8889,186.77418,101.96023
2,tether,-0.21173,0.04935,0.0064,-0.04237,0.28037,-0.00542,0.01954
3,ripple,-0.37819,-0.60926,2.24984,0.23455,-17.55245,39.53888,-16.60193
4,bitcoin-cash,2.90585,17.09717,14.75334,15.74903,-13.71793,21.66042,14.49384
5,binancecoin,2.10423,12.85511,6.80688,0.05865,36.33486,155.61937,69.69195
6,chainlink,-0.23935,20.69459,9.30098,-11.21747,-43.69522,403.22917,325.13186
7,cardano,0.00322,13.99302,5.55476,10.10553,-22.84776,264.51418,156.09756
8,litecoin,-0.06341,6.60221,7.28931,1.21662,-17.2396,27.49919,-12.66408
9,bitcoin-cash-sv,0.9253,3.29641,-1.86656,2.88926,-24.87434,7.42562,93.73082


In [72]:
print(df_market_data.columns)

Index(['coin_id', 'price_change_percentage_24h', 'price_change_percentage_7d',
       'price_change_percentage_14d', 'price_change_percentage_30d',
       'price_change_percentage_60d', 'price_change_percentage_200d',
       'price_change_percentage_1y'],
      dtype='object')


In [73]:
# Generate summary statistics
df_market_data.describe()

Unnamed: 0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
count,41.0,41.0,41.0,41.0,41.0,41.0,41.0
mean,-0.269686,4.497147,0.185787,1.545693,-0.094119,236.537432,347.667956
std,2.694793,6.375218,8.376939,26.344218,47.365803,435.225304,1247.842884
min,-13.52786,-6.09456,-18.1589,-34.70548,-44.82248,-0.3921,-17.56753
25%,-0.60897,0.04726,-5.02662,-10.43847,-25.90799,21.66042,0.40617
50%,-0.06341,3.29641,0.10974,-0.04237,-7.54455,83.9052,69.69195
75%,0.61209,7.60278,5.51074,4.57813,0.65726,216.17761,168.37251
max,4.84033,20.69459,24.23919,140.7957,223.06437,2227.92782,7852.0897


In [74]:
# Plot your data to see what's in your DataFrame
df_market_data.hvplot.line(
    width=800,
    height=400,
    rot=90
)

---

### Prepare the Data

In [75]:
# Use the `StandardScaler()` module from scikit-learn to normalize the data from the CSV file
scaler = StandardScaler()

In [76]:
# Create a DataFrame with the scaled data
df_scaled = pd.DataFrame(df_market_data, columns=df_market_data.columns)

# Copy the crypto names from the original DataFrame
df_scaled['coin_id'] = df_market_data.index

# Set the coin_id column as index
#df_scaled.set_index('coin_id', inplace=True)

# Display the scaled DataFrame
print(df_scaled.head())

   coin_id  price_change_percentage_24h  price_change_percentage_7d  \
0        0                      1.08388                     7.60278   
1        1                      0.22392                    10.38134   
2        2                     -0.21173                     0.04935   
3        3                     -0.37819                    -0.60926   
4        4                      2.90585                    17.09717   

   price_change_percentage_14d  price_change_percentage_30d  \
0                      6.57509                      7.67258   
1                      4.80849                      0.13169   
2                      0.00640                     -0.04237   
3                      2.24984                      0.23455   
4                     14.75334                     15.74903   

   price_change_percentage_60d  price_change_percentage_200d  \
0                     -3.25185                      83.51840   
1                    -12.88890                     186.77418   
2 

In [77]:
print(df_scaled.columns)

Index(['coin_id', 'price_change_percentage_24h', 'price_change_percentage_7d',
       'price_change_percentage_14d', 'price_change_percentage_30d',
       'price_change_percentage_60d', 'price_change_percentage_200d',
       'price_change_percentage_1y'],
      dtype='object')


---

### Find the Best Value for k Using the Original Scaled DataFrame.

In [78]:
# Create a list with the number of k-values from 1 to 11
k_values = list(range(1, 12))


In [79]:
# Create an empty list to store the inertia values
inertia = []

# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
for i in k_values:

    k_model = KMeans(n_clusters=i, random_state=0)
    # 2. Fit the model to the data using `df_market_data_scaled`
    k_model.fit(df_scaled)
    # 3. Append the model.inertia_ to the inertia list
    inertia.append(k_model.inertia_)
print(inertia)

[69989281.80285287, 8198795.993539452, 2598026.779989077, 840027.7674758405, 470515.727957122, 347817.03348777705, 217243.79480309473, 179810.33400848915, 135002.7763754148, 110216.82615983953, 84417.61261301738]


In [80]:
# Create a dictionary with the data to plot the Elbow curve
elbow_data = {"k": k_values, "inertia": inertia}

# Create a DataFrame with the data to plot the Elbow curve
df_elbow_1 = pd.DataFrame(elbow_data)

In [81]:
# Plot a line chart with all the inertia values computed with
# the different values of k to visually identify the optimal value for k.
elbow_curve_plot = df_elbow_1.hvplot.line(
    x="k",
    y="inertia",
    title="Elbow Curve",
    xticks=list(range(1, 12)),  # Adjust this based on your k values
    xlabel='Number of Clusters (k)',
    ylabel='Inertia',
    line_color='blue'
)
elbow_curve_plot

#### Answer the following question: 

**Question:** What is the best value for `k`?

**Answer:** The best value for k is 2.


---

### Cluster Cryptocurrencies with K-means Using the Original Scaled DataFrame

In [82]:
# Initialize the K-Means model using the best value for k
best_k = 2
kmeans_model = KMeans(n_clusters=best_k, random_state=0)

In [83]:
# Fit the K-Means model using the scaled DataFrame
kmeans_model.fit(df_scaled)

In [84]:
# Predict the clusters to group the cryptocurrencies using the scaled DataFrame
cluster_predictions = kmeans_model.predict(df_scaled)

# Print the resulting array of cluster values.
print(cluster_predictions)


[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0]


In [85]:
# Create a copy of the scaled DataFrame
df_scaled_copy = df_scaled.copy()

In [86]:
# Add a new column to the copy of the scaled DataFrame with the predicted clusters
df_scaled_copy['predicted_clusters'] = cluster_predictions

# Display the copy of the scaled DataFrame
print(df_scaled_copy)

    coin_id  price_change_percentage_24h  price_change_percentage_7d  \
0         0                      1.08388                     7.60278   
1         1                      0.22392                    10.38134   
2         2                     -0.21173                     0.04935   
3         3                     -0.37819                    -0.60926   
4         4                      2.90585                    17.09717   
5         5                      2.10423                    12.85511   
6         6                     -0.23935                    20.69459   
7         7                      0.00322                    13.99302   
8         8                     -0.06341                     6.60221   
9         9                      0.92530                     3.29641   
10       10                      0.61209                    -5.67151   
11       11                     -0.17825                    -0.11871   
12       12                      0.14477                    -1.3

In [87]:
# Create a scatter plot using hvPlot by setting
# `x="price_change_percentage_24h"` and `y="price_change_percentage_7d"`.
# Color the graph points with the labels found using K-Means and
# add the crypto name in the `hover_cols` parameter to identify
# the cryptocurrency represented by each data point.
scatter_plot = df_scaled_copy.hvplot.scatter(
    x="price_change_percentage_24h",
    y="price_change_percentage_7d",
    by="predicted_clusters",  # Color by the predicted clusters
    hover_cols=["coin_id"],  # Replace "coin_id" with the actual column name for the cryptocurrency name
    title="Cryptocurrency Clusters",
    xlabel="Price Change Percentage (24h)",
    ylabel="Price Change Percentage (7d)"
)
scatter_plot

---

### Optimize Clusters with Principal Component Analysis.

In [88]:
# Create a PCA model instance and set `n_components=3`.
pca_model = PCA(n_components=3)

In [89]:
# Use the PCA model with `fit_transform` to reduce the original scaled DataFrame
pca_data = pca_model.fit_transform(df_scaled)
# down to three principal components.
df_pca = pd.DataFrame(data=pca_data, columns=['PC1', 'PC2', 'PC3'])

# View the scaled PCA data
print(df_pca.head())

          PC1         PC2        PC3
0 -341.845510  -51.676202  13.090751
1 -249.462913   23.811240 -13.639942
2 -402.654634 -118.975739  25.280594
3 -406.790035  -79.740764   2.018074
4 -382.465363 -103.667370  17.187842


In [90]:
# Retrieve the explained variance to determine how much information
explained_variance = pca_model.explained_variance_ratio_
# can be attributed to each principal component.
print("Explained Variance Ratio for each Principal Component:")
print(explained_variance)


Explained Variance Ratio for each Principal Component:
[9.75962290e-01 2.30326229e-02 7.48875501e-04]


In [91]:
total_explained_variance = explained_variance.sum()
print(total_explained_variance)

0.9997437882101878


#### Answer the following question: 

**Question:** What is the total explained variance of the three principal components?

**Answer:** Calculating the sum of the total explained variance we end with 0.99.

In [92]:
print(df_scaled.columns)

Index(['coin_id', 'price_change_percentage_24h', 'price_change_percentage_7d',
       'price_change_percentage_14d', 'price_change_percentage_30d',
       'price_change_percentage_60d', 'price_change_percentage_200d',
       'price_change_percentage_1y'],
      dtype='object')


In [93]:
# Create a new DataFrame with the PCA data.
df_pca = pd.DataFrame(data=pca_data, columns=['PC1', 'PC2', 'PC3'])

# Copy the crypto names from the original scaled DataFrame
df_pca['coin_id'] = df_scaled['coin_id'].values

# Set the coin_id column as index
df_pca.set_index('coin_id', inplace=True)

# Display the scaled PCA DataFrame
print(df_pca.head())

                PC1         PC2        PC3
coin_id                                   
0       -341.845510  -51.676202  13.090751
1       -249.462913   23.811240 -13.639942
2       -402.654634 -118.975739  25.280594
3       -406.790035  -79.740764   2.018074
4       -382.465363 -103.667370  17.187842


---

### Find the Best Value for k Using the Scaled PCA DataFrame

In [94]:
df_market_data_pca = df_pca

In [95]:
print(df_market_data_pca.columns)

Index(['PC1', 'PC2', 'PC3'], dtype='object')


In [96]:
# Create a list with the number of k-values from 1 to 11
k_model = list(range(1, 12))

In [97]:
# Create an empty list to store the inertia values
inertia = []
# Create a for loop to compute the inertia with each possible value of k
for i in k_model:
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
    kmeans_model = KMeans(n_clusters=i, random_state=1)
# 2. Fit the model to the data using `df_market_data_pca`
    kmeans_model.fit(df_market_data_pca)
# 3. Append the model.inertia_ to the inertia list
    inertia.append(kmeans_model.inertia_)

In [98]:
# Create a dictionary with the data to plot the Elbow curve
elbow_data = {
    "k": k_model,          
    "inertia": inertia     
}
# Create a DataFrame with the data to plot the Elbow curve
df_elbow = pd.DataFrame(elbow_data)
print(df_elbow)

     k       inertia
0    1  6.997135e+07
1    2  8.180875e+06
2    3  2.780022e+06
3    4  8.236452e+05
4    5  4.264697e+05
5    6  2.615912e+05
6    7  2.027870e+05
7    8  1.635801e+05
8    9  1.254756e+05
9   10  8.932893e+04
10  11  6.621262e+04


In [99]:
# Plot a line chart with all the inertia values computed with
# the different values of k to visually identify the optimal value for k.
line_plot = df_elbow.hvplot.line(
    x='k',
    y='inertia',
    title='Elbow Curve',
    xlabel='Number of Clusters (k)',
    ylabel='Inertia',  
)
line_plot

#### Answer the following questions: 

* **Question:** What is the best value for `k` when using the PCA data?

  * **Answer:** The best value for 'k' is 2 when using the PCA data.


* **Question:** Does it differ from the best k value found using the original data?

  * **Answer:** It does not differ from the original data.

### Cluster Cryptocurrencies with K-means Using the Scaled PCA DataFrame

In [100]:
# Initialize the K-Means model using the best value for k
optimal_k = 2
kmeans_model = KMeans(n_clusters=optimal_k, random_state=1)

In [101]:
# Fit the K-Means model using the PCA data
kmeans_model.fit(df_market_data_pca)

In [102]:
# Predict the clusters to group the cryptocurrencies using the scaled PCA DataFrame
predicted_clusters = kmeans_model.predict(df_market_data_pca)

# Print the resulting array of cluster values.
print(predicted_clusters)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0]


In [103]:
# Create a copy of the scaled PCA DataFrame
df_pca_with_clusters = df_market_data_pca.copy()

# Add a new column to the copy of the PCA DataFrame with the predicted clusters
df_pca_with_clusters['cluster'] = predicted_clusters

# Display the copy of the scaled PCA DataFrame
print(df_pca_with_clusters.head())

                PC1         PC2        PC3  cluster
coin_id                                            
0       -341.845510  -51.676202  13.090751        0
1       -249.462913   23.811240 -13.639942        0
2       -402.654634 -118.975739  25.280594        0
3       -406.790035  -79.740764   2.018074        0
4       -382.465363 -103.667370  17.187842        0


In [104]:
print(df_pca_with_clusters.columns)

Index(['PC1', 'PC2', 'PC3', 'cluster'], dtype='object')


In [105]:
# Create a scatter plot using hvPlot by setting
# `x="PC1"` and `y="PC2"`.
# Color the graph points with the labels found using K-Means and
# add the crypto name in the `hover_cols` parameter to identify
# the cryptocurrency represented by each data point.
scatter_plot = df_pca_with_clusters.hvplot.scatter(
    x='PC1',
    y='PC2',
    c='cluster',
    title='Cryptocurrency Clusters',
    xlabel='Principal Component 1 (PC1)',
    ylabel='Principal Component 2 (PC2)',
    hover_cols=['coin_id']  
)
scatter_plot

### Visualize and Compare the Results

In this section, you will visually analyze the cluster analysis results by contrasting the outcome with and without using the optimization techniques.

In [106]:
# Composite plot to contrast the Elbow curves
# YOUR CODE HERE!
original_elbow_plot = df_elbow_1.hvplot.line(
    x='k',
    y='inertia',
    title='Elbow Method Comparison',
    xlabel='Number of Clusters (k)',
    ylabel='Inertia',
    color='blue',
    label='Original Scaled Data'
)

# Create the plot for the PCA scaled DataFrame
pca_elbow_plot = df_elbow.hvplot.line(
    x='k',
    y='inertia',
    color='red',
    label='PCA Scaled Data'
)

# Combine the two plots into a composite plot
composite_plot = original_elbow_plot + pca_elbow_plot
composite_plot

In [107]:
# Composite plot to contrast the clusters
# YOUR CODE HERE!
original_clusters_plot = df_pca.hvplot.scatter(
    x='PC1',
    y='PC2',
    c='cluster',
    title='Cluster Comparison: Original vs PCA',
    xlabel='Principal Component 1 (PC1)',
    ylabel='Principal Component 2 (PC2)',
    color='blue',
    label='Original Clusters',
    hover_cols=['coin_id']  
)

# Create the plot for the PCA scaled DataFrame clusters
pca_clusters_plot = df_pca_with_clusters.hvplot.scatter(
    x='PC1',
    y='PC2',
    c='cluster',
    color='red',
    label='PCA Clusters',
    hover_cols=['coin_id']  
)
pca_clusters_plot
# Combine the two plots into a composite plot
composite_clusters_plot = original_clusters_plot + pca_clusters_plot

# Display the composite plot
composite_clusters_plot

#### Answer the following question: 

  * **Question:** After visually analyzing the cluster analysis results, what is the impact of using fewer features to cluster the data using K-Means?

  * **Answer:** With fewer features, K-means produce more coherent and distinct clusters.