# Clustering Crypto

In [7]:
# Initial imports
import requests
import pandas as pd
import matplotlib.pyplot as plt
import hvplot.pandas
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from pathlib import Path

### Fetching Cryptocurrency Data

In [3]:
# Endpoint to fetch the coin list as JSON data.
# NOTE(review): no later cell visible in this notebook reads `url`; the data is
# loaded from the bundled CSV file instead.
url = "https://min-api.cryptocompare.com/data/all/coinlist"

In [5]:
# Create a DataFrame
# HINT: You will need to use the 'Data' key from the json response, then transpose the DataFrame.
#
# Sketch of the live fetch described by the hint. It is kept commented out so the
# notebook does not depend on network access; the endpoint serves JSON, so
# pd.read_csv is not the right reader for it.
# cryptocurrency_df = pd.DataFrame(requests.get(url).json()["Data"]).T

In [8]:
# Alternatively, use the provided csv file.
file_path = Path("Resources/crypto_data.csv")

# Create a DataFrame from the CSV file
cryptocurrency_df = pd.read_csv(filepath_or_buffer=file_path)

### Data Preprocessing

In [9]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'
necessary_columns = [
    "CoinName",
    "Algorithm",
    "IsTrading",
    "ProofType",
    "TotalCoinsMined",
    "TotalCoinSupply",
]

# The CSV's first column has no header, so pandas names it "Unnamed: 0";
# use it as the row index, then select the needed columns explicitly so any
# extra column in the file is dropped (and a missing one fails loudly here).
cryptocurrency_df = cryptocurrency_df.set_index("Unnamed: 0")[necessary_columns]

In [10]:
# Keep only cryptocurrencies that are trading
is_trading_mask = cryptocurrency_df["IsTrading"].eq(True)
cryptocurrency_df = cryptocurrency_df[is_trading_mask]


In [11]:
# Keep only cryptocurrencies with a working algorithm,
# i.e. discard rows whose Algorithm is the literal text "N/A".
has_algorithm = cryptocurrency_df["Algorithm"].ne("N/A")
cryptocurrency_df = cryptocurrency_df[has_algorithm]

cryptocurrency_df


Unnamed: 0_level_0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
SERO,Super Zero,Ethash,True,PoW,,1000000000
UOS,UOS,SHA-256,True,DPoI,,1000000000
BDX,Beldex,CryptoNight,True,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,True,PoW,7.296538e+06,21000000


In [12]:
# Remove the "IsTrading" column
# (the rows were already filtered on IsTrading == True in an earlier cell).
cryptocurrency_df = cryptocurrency_df.drop("IsTrading", axis="columns")

cryptocurrency_df


Unnamed: 0_level_0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
365,365Coin,X11,PoW/PoS,,2300000000
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,PoW,,611000
808,808,SHA-256,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...
SERO,Super Zero,Ethash,PoW,,1000000000
UOS,UOS,SHA-256,DPoI,,1000000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [13]:
# Remove rows with at least 1 null value
cryptocurrency_df = cryptocurrency_df.dropna(axis="index", how="any")


In [14]:
# Remove rows with cryptocurrencies having no coins mined
# (only an exact zero is excluded by this comparison).
mined_is_nonzero = cryptocurrency_df["TotalCoinsMined"].ne(0)
cryptocurrency_df = cryptocurrency_df[mined_is_nonzero]

cryptocurrency_df


Unnamed: 0_level_0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethereum,Ethash,PoW,1.076842e+08,0
...,...,...,...,...,...
ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [15]:
# Drop rows where there are 'N/A' text values.
# Indexing a DataFrame with a boolean DataFrame (df[df != "N/A"]) only masks the
# matching cells to NaN and keeps every row, so build a per-row mask instead:
# a row is kept only when none of its cells equals the text "N/A".
row_has_no_na_text = (cryptocurrency_df != "N/A").all(axis=1)
cryptocurrency_df = cryptocurrency_df[row_has_no_na_text]

cryptocurrency_df


Unnamed: 0_level_0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethereum,Ethash,PoW,1.076842e+08,0
...,...,...,...,...,...
ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [16]:
# Store the 'CoinName' column in its own DataFrame prior to dropping it from cryptocurrency_df.
# The double brackets select a one-column DataFrame that keeps the original index.
coinname_df = cryptocurrency_df[["CoinName"]].copy()

coinname_df

Unnamed: 0_level_0,CoinName
Unnamed: 0,Unnamed: 1_level_1
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum
...,...
ZEPH,ZEPHYR
GAP,Gapcoin
BDX,Beldex
ZEN,Horizen


In [17]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm
cryptocurrency_df = cryptocurrency_df.drop(labels="CoinName", axis="columns")

cryptocurrency_df


Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,Scrypt,PoW/PoS,4.199995e+01,42
404,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,X13,PoW/PoS,2.927942e+10,314159265359
BTC,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethash,PoW,1.076842e+08,0
...,...,...,...,...
ZEPH,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Equihash,PoW,7.296538e+06,21000000


In [18]:
# Create dummy variables for text features.
# drop_first=True omits the first level of each encoded column.
# NOTE(review): the rendered output lists "ProofType_PoW/PoS" twice, which suggests
# near-duplicate ProofType labels (e.g. stray whitespace) in the data - confirm.
text_features = ["Algorithm", "ProofType"]
dummy_variables = pd.get_dummies(
    data=cryptocurrency_df,
    columns=text_features,
    drop_first=True,
)

dummy_variables


Unnamed: 0_level_0,TotalCoinsMined,TotalCoinSupply,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,Algorithm_Cloverhash,...,ProofType_PoW/PoS,ProofType_PoW/PoS,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
42,4.199995e+01,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1.055185e+09,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,2.927942e+10,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,1.792718e+07,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,1.076842e+08,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZEPH,2.000000e+09,2000000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GAP,1.493105e+07,250000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BDX,9.802226e+08,1400222610,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZEN,7.296538e+06,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Standardize data: learn the per-column scaling from the encoded features,
# then apply it to those same features.
cryptocurrency_scale = StandardScaler().fit(dummy_variables).transform(dummy_variables)

cryptocurrency_scale


array([[-0.11674788, -0.15286468, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       [-0.09358885, -0.14499604, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       [ 0.52587231,  4.4937636 , -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       ...,
       [-0.09523411, -0.13215444, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       [-0.11658774, -0.15255408, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       [-0.11674507, -0.15284989, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ]])

### Reducing Dimensions Using PCA

In [None]:
# Use PCA to reduce dimensions to 3 principal components.
# random_state is fixed so the projection is reproducible in case scikit-learn
# selects a randomized SVD solver for this input size.
principal_components = PCA(n_components=3, random_state=0)

# fit_transform must be called on the configured instance; calling it on the PCA
# class itself passes the data as `self` and raises a TypeError.
cryptocurrency_pca = principal_components.fit_transform(cryptocurrency_scale)

In [None]:
# Create a DataFrame with the principal components data,
# indexed like cryptocurrency_df so the rows can be joined back later.
pc_labels = ["PC1", "PC2", "PC3"]
principal_components_df = pd.DataFrame(
    data=cryptocurrency_pca,
    index=cryptocurrency_df.index,
    columns=pc_labels,
)

principal_components_df

### Clustering Crytocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [None]:
# Candidate numbers of clusters: 1 through 10
k = list(range(1, 11))

# Calculate the inertia for the range of k values:
# fit one K-Means model per candidate and record its inertia.
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(cryptocurrency_pca).inertia_
    for n_clusters in k
]

# Create the Elbow Curve using hvPlot
elbow_curve = {"k": k, "inertia": inertia}
elbow_curve_df = pd.DataFrame(data=elbow_curve)
elbow_curve_df.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")


Running K-Means with `k=<your best value for k here>`

In [None]:
# Initialize the K-Means model (5 clusters, fixed seed)
k_means_model = KMeans(n_clusters=5, random_state=1)

# Fit the model and predict clusters in one step, using only the three
# principal-component columns. Selecting them explicitly keeps this cell
# re-runnable: once the "class" column has been added below, fitting on the whole
# DataFrame would feed the previous labels back in as a fourth feature.
pca_features = principal_components_df[["PC1", "PC2", "PC3"]]
predicted_model = k_means_model.fit_predict(pca_features)

# Add the predicted cluster of every cryptocurrency next to its principal components
principal_components_df["class"] = predicted_model

principal_components_df


### Visualizing Results

#### Scatter Plot with Tradable Cryptocurrencies

In [None]:
# Combine the cryptocurrency features, the coin names, and the principal
# components with their cluster labels into a single DataFrame (joined on the index).
combined_df = pd.concat([cryptocurrency_df, coinname_df, principal_components_df], axis=1)

# 3-D scatter of the principal components, colored by cluster.
# matplotlib.pyplot has no `scatter_3d` function (that name belongs to the
# plotly-express API), so the plot is drawn on matplotlib 3-D axes instead.
# This is a static figure: it has no hover tooltips for CoinName / Algorithm.
scale_data = plt.figure(figsize=(8, 6))
ax = scale_data.add_subplot(projection="3d")
points = ax.scatter(
    combined_df["PC1"],
    combined_df["PC2"],
    combined_df["PC3"],
    c=combined_df["class"],
    cmap="viridis",
)
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_zlabel("PC3")
ax.set_title("Cryptocurrency clusters (principal components)")
scale_data.colorbar(points, ax=ax, label="class")

plt.show()


In [None]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"

# Rescale both columns to the [0, 1] range so they share a comparable axis scale.
new_scale = MinMaxScaler()

# The supply column is named "TotalCoinSupply" (no "s" after "Coin");
# the previous spelling "TotalCoinsSupply" raised a KeyError.
new_columns = ["TotalCoinsMined", "TotalCoinSupply"]
# astype(float) is a no-op for numeric columns and converts numeric text otherwise.
x = combined_df[new_columns].astype(float).values
x_scale = new_scale.fit_transform(x)

# Rebuild a DataFrame on the same index and bring along cluster label and coin name.
scale_df = pd.DataFrame(x_scale, columns=new_columns, index=combined_df.index)
scale_df["class"] = combined_df["class"]
scale_df["CoinName"] = combined_df["CoinName"]

# hvPlot takes extra tooltip columns through `hover_cols`; `hover` is only an on/off flag.
new_plot = scale_df.hvplot(
    kind="scatter",
    x="TotalCoinsMined",
    y="TotalCoinSupply",
    c="class",
    colormap="viridis",
    hover_cols=["CoinName"],
)

new_plot


#### Table of Tradable Cryptocurrencies

In [None]:
# Table with tradable cryptos.
# The cluster column is stored as lowercase "class" in combined_df, so it has to be
# requested with that exact name ("Class" does not exist).
new_columns_df = ["CoinName", "Algorithm", "ProofType", "TotalCoinSupply", "TotalCoinsMined", "class"]

combined_df.hvplot.table(columns=new_columns_df)


In [None]:
# Print the total number of tradable cryptocurrencies
# (one row of combined_df per cryptocurrency).
tradable_count = len(combined_df)
print(f"The total number of tradable cryptocurrencies is {tradable_count}")