In [None]:
# Import required libraries and dependencies
import pandas as pd
import hvplot.pandas
import holoviews as hv
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the crypto market CSV, using the `coin_id` column as the row index
df_market_data = pd.read_csv("Resources/crypto_market_data.csv", index_col="coin_id")

# Preview the first ten rows
df_market_data.head(10)

In [None]:
# Generate summary statistics for the raw (unscaled) market data
df_market_data.describe()

In [None]:
# Line plot of the raw DataFrame; x tick labels are rotated 90 degrees
df_market_data.hvplot.line(width=800, height=400, rot=90)

---

### Prepare the Data

In [None]:
# Keep the original column labels so the scaled data can reuse them
market_columns = df_market_data.columns

In [None]:
# Normalize the CSV data with scikit-learn's `StandardScaler`
market_scaled = StandardScaler().fit_transform(df_market_data[market_columns])

# Peek at the first five scaled rows
market_scaled[:5]

In [None]:
# Wrap the scaled array in a DataFrame that reuses the original column labels
# and is indexed by the original `coin_id` index
market_scaled_df = pd.DataFrame(
    market_scaled,
    columns=market_columns,
    index=df_market_data.index,
)

# Display sample data
market_scaled_df.head()

---

### Find the Best Value for k Using the Original Data.

In [None]:
# Candidate k-values: 1 through 10 (the upper bound of `range` is exclusive)
k = list(range(1, 11))

In [None]:
# Inertia of a K-Means fit on the scaled data, one entry per candidate k
inertia = []

for i in k:
    # Build the model for this cluster count and fit it to `market_scaled_df`;
    # `random_state` and `n_init` are fixed so every run uses the same settings
    k_model = KMeans(n_clusters=i, random_state=0, n_init=10).fit(market_scaled_df)
    # Record the fitted model's inertia
    inertia.append(k_model.inertia_)

In [None]:
# Pair each k with its inertia so the Elbow curve can be plotted
elbow_d = dict(k=k, inertia=inertia)

# Tabulate the Elbow-curve data and show the first rows
elbow_d_df = pd.DataFrame(elbow_d)
elbow_d_df.head()

In [None]:
# Elbow curve for the scaled data: inertia against k, with a tick at every
# candidate k, used to pick the optimal number of clusters visually.
elbow_d_df.hvplot.line(x="k", y="inertia", xticks=k)

#### Answer the following question: 

**Question:** What is the best value for `k`?

**Answer:** The elbow of the curve appears at `k = 4`.

---

### Cluster Cryptocurrencies with K-means Using the Original Data

In [None]:
# K-Means model for the chosen value of k (4)
model = KMeans(n_clusters=4, random_state=0, n_init=10)

# Neighbouring values of k (5 and 3), built with the same settings
model5 = KMeans(n_clusters=5, random_state=0, n_init=10)
model3 = KMeans(n_clusters=3, random_state=0, n_init=10)

In [None]:
# Fit all three K-Means models (k = 4, 5, 3) on the same scaled data.
# Only the last call's return value is shown as the cell output.
model.fit(market_scaled_df)
model5.fit(market_scaled_df)
model3.fit(market_scaled_df)

In [None]:
# Assign every cryptocurrency to a cluster with each fitted model
k_4 = model.predict(market_scaled_df)
k_5 = model5.predict(market_scaled_df)
k_3 = model3.predict(market_scaled_df)

# Print the three label arrays, one after another
print(k_4, k_5, k_3, sep="\n")

In [None]:
# Three independent copies of the scaled data, one per cluster count (4, 5, 3)
market_predictions_df, market_predictions5_df, market_predictions3_df = (
    market_scaled_df.copy() for _ in range(3)
)

In [None]:
# Attach each model's predicted cluster labels to its own copy of the data
market_predictions_df["market_clusters"] = k_4
market_predictions5_df["market_clusters"] = k_5
market_predictions3_df["market_clusters"] = k_3

# Display sample data for the k=4 result
market_predictions_df.head()
# market_predictions5_df.head()
# market_predictions3_df.head()

In [None]:
# Scatter plot of the 24h price change (x) against the 7d price change (y).
# Points are grouped by their K-Means cluster label, and `coin_id` is added
# to the hover tooltip so each point can be identified.
market_predictions_df.hvplot.scatter(
    x="price_change_percentage_24h",
    y="price_change_percentage_7d",
    hover_cols=["coin_id"],
    by="market_clusters",
)

In [None]:
# Visual of 5 clusters
# market_predictions5_df.hvplot.scatter(
# x = 'price_change_percentage_24h',
# y = 'price_change_percentage_7d',
# hover_cols = ['coin_id'],
# by = 'market_clusters'  
# )

In [None]:
# Visual of 3 clusters
# market_predictions3_df.hvplot.scatter(
# x = 'price_change_percentage_24h',
# y = 'price_change_percentage_7d',
# hover_cols = ['coin_id'],
# by = 'market_clusters'  
# )

---

### Optimize Clusters with Principal Component Analysis.

In [None]:
# Create a PCA model instance that keeps three principal components
# (`n_components=3`).
pca = PCA(n_components=3)

In [None]:
# Use the PCA model with `fit_transform` to reduce the scaled features to
# three principal components.
# Fit on `market_scaled_df` (features only): `market_predictions_df` also
# carries the `market_clusters` label column produced by K-Means, and feeding
# those labels to PCA would leak the earlier clustering into the components.
market_pca_data = pca.fit_transform(market_scaled_df)

# View the first five rows of the transformed array.
market_pca_data[:5]

In [None]:
# Total share of the variance captured by the three principal components,
# expressed as a percentage rounded to two decimals.
total_explained_pct = pca.explained_variance_ratio_.sum() * 100
round(total_explained_pct, 2)

#### Answer the following question: 

**Question:** What is the total explained variance of the three principal components?

**Answer:** With a PCA model of three principal components, we retain about 89% of the variance in our data.

In [None]:
# Wrap the PCA output in a DataFrame with one column per component,
# indexed by the original `coin_id` index
market_pca_df = pd.DataFrame(
    market_pca_data,
    columns=["PCA1", "PCA2", "PCA3"],
    index=df_market_data.index,
)

# Display sample data
market_pca_df.head()

---

### Find the Best Value for k Using the PCA Data

In [None]:
# Candidate k-values: 1 through 10 (the upper bound of `range` is exclusive)
k = list(range(1, 11))

In [None]:
# Inertia of a K-Means fit on the PCA data, one entry per candidate k
inertia_pca = []

for i in k:
    # Build the model for this cluster count and fit it to `market_pca_df`;
    # `random_state` and `n_init` are fixed so every run uses the same settings
    k_model = KMeans(n_clusters=i, random_state=0, n_init=10).fit(market_pca_df)
    # Record the fitted model's inertia
    inertia_pca.append(k_model.inertia_)


In [None]:
# Pair each k with its PCA-based inertia (this rebinds `elbow_d`)
elbow_d = dict(k=k, inertia=inertia_pca)

# Tabulate the Elbow-curve data for the PCA run
elbow_df = pd.DataFrame(elbow_d)

In [None]:
# Elbow curve for the PCA data: inertia against k, with a tick at every
# candidate k, used to pick the optimal number of clusters visually.
elbow_df.hvplot.line(x="k", y="inertia", xticks=k)

#### Answer the following questions: 

* **Question:** What is the best value for `k` when using the PCA data?

  * **Answer:** 4


* **Question:** Does it differ from the best k value found using the original data?

  * **Answer:** No, it is still 4, but the elbow is noticeably clearer on the PCA curve.

### Cluster Cryptocurrencies with K-means Using the PCA Data

In [None]:
# K-Means model for the best k found on the PCA data (4); this rebinds `model`
model = KMeans(n_clusters=4, random_state=0, n_init=10)

In [None]:
# Fit the K-Means model using the PCA data (the three-component DataFrame)
model.fit(market_pca_df)

In [None]:
# Predict the clusters to group the cryptocurrencies using the PCA data.
# Note: this rebinds `k_4`, which previously held the labels predicted from
# the scaled (non-PCA) data.
k_4 = model.predict(market_pca_df)
# Print the resulting array of cluster values.
print(k_4)

In [None]:
# New DataFrame: the PCA data plus a `market_clusters` column holding the
# predicted labels (`assign` returns a copy, so `market_pca_df` is untouched)
market_predictions_pca = market_pca_df.assign(market_clusters=k_4)

# Display sample data
market_predictions_pca.head()

In [None]:
# Scatter plot of the first two principal components (`x="PCA1"`, `y="PCA2"`).
# Points are grouped by their K-Means cluster label, and `coin_id` is added
# to the hover tooltip so each point can be identified.
market_predictions_pca.hvplot.scatter(
    x="PCA1",
    y="PCA2",
    hover_cols=["coin_id"],
    by="market_clusters",
)

### Visualize and Compare the Results

In this section, you will visually analyze the cluster analysis results by contrasting the outcome with and without using the optimization techniques.

In [None]:
# Composite plot to contrast the Elbow curves.
# Each curve gets its own title so the two stacked panels can be told apart.
plot1 = elbow_d_df.hvplot.line(
    x="k",
    y="inertia",
    xticks=k,
    title="Elbow curve - scaled original data",
)
plot2 = elbow_df.hvplot.line(
    x="k",
    y="inertia",
    xticks=k,
    title="Elbow curve - PCA data",
)

# Stack the two panels in a single column
hv.Layout(plot1 + plot2).cols(1)

In [None]:
# Composite plot to contrast the clusters.
# Each scatter gets its own title so the two stacked panels can be told apart.
plot3 = market_predictions_df.hvplot.scatter(
    x="price_change_percentage_24h",
    y="price_change_percentage_7d",
    hover_cols=["coin_id"],
    by="market_clusters",
    title="K-Means clusters - scaled original data",
)
plot4 = market_predictions_pca.hvplot.scatter(
    x="PCA1",
    y="PCA2",
    hover_cols=["coin_id"],
    by="market_clusters",
    title="K-Means clusters - PCA data",
)

# Stack the two panels in a single column
hv.Layout(plot3 + plot4).cols(1)

#### Answer the following question: 

  * **Question:** After visually analyzing the cluster analysis results, what is the impact of using fewer features to cluster the data using K-Means?

  * **Answer:** Reducing the data to fewer dimensions seemed, in this case, to overcomplicate the process. The most prominent clusters, 0 and 1, were already fairly obvious before we condensed the data. The lower-dimensional model made it easier to determine the best value of k for K-Means, but it was definitely not as useful when clustering the data.