# Clustering Crypto

In [1]:
# Initial imports
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import hvplot.pandas
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

import altair as alt
import seaborn as sns
sns.set()

from pathlib import Path

### Fetching Cryptocurrency Data

In [359]:
# Fetch the full coin list (JSON) from the CryptoCompare endpoint
url = "https://min-api.cryptocompare.com/data/all/coinlist"

# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() surfaces HTTP errors here rather than as a confusing
# JSON/KeyError failure in a later cell.
r = requests.get(url, timeout=30)
r.raise_for_status()

data_online = r.json()

In [407]:
# Build a frame from the 'Data' section of the API payload. The payload is
# transposed so that each entry of data_online['Data'] becomes one row, then
# narrowed to the columns of interest, with 'MaxSupply' exposed under the
# name 'TotalCoinSupply'.
online_columns = ['CoinName', 'Algorithm', 'IsTrading', 'ProofType', 'TotalCoinsMined', 'MaxSupply']

crypto_df_o = (
    pd.DataFrame(data_online['Data'])
    .T
    .loc[:, online_columns]
    .rename(columns={"MaxSupply": "TotalCoinSupply"})
)
crypto_df_o.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.999952,42.0
300,300 token,,True,,300.0,300.0
365,365Coin,X11,True,PoW/PoS,0.0,-1.0
404,404Coin,Scrypt,True,PoW/PoS,0.0,-1.0
433,433 Token,,False,,,


In [335]:
# Alternative data source: read the CSV file shipped in the Resources folder
file_path = Path("Resources") / "crypto_data.csv"

# Load the file into the working DataFrame
crypto_df = pd.read_csv(file_path)

In [336]:
# Data cleaning
# Trim stray whitespace around the coin names.
crypto_df['CoinName'] = crypto_df['CoinName'].str.strip()

# Remove embedded spaces from the supply values and coerce anything that is
# not a number to NaN. Casting to str first keeps this cell re-runnable: after
# the first run the column is float64 and a bare `.str` accessor would raise.
# `regex=False` pins literal matching regardless of the pandas version's default.
crypto_df['TotalCoinSupply'] = pd.to_numeric(
    crypto_df['TotalCoinSupply'].astype(str).str.replace(' ', '', regex=False),
    errors='coerce',
)

# Normalise the spelling of proof types. `.replace(...)` swaps a value only
# when the WHOLE cell equals the given text, while `.str.replace(...)` rewrites
# a substring wherever it occurs; the order below is the original order.
crypto_df['ProofType'] = (
    crypto_df['ProofType']
    .str.strip()
    .replace('PoS/PoW', 'PoW/PoS')
    .replace('PoW and PoS', 'PoW/PoS')
    .replace('Proof of Authority', 'PoA')
    .str.replace('Proof of Trust', 'PoT', regex=False)
    .replace('PoW/PoW', 'PoW/PoS')
    .replace('dPoS', 'DPoS')
    .str.replace('Pos', 'PoS', regex=False)
    .replace('dPoW/PoW', 'DPoW')
)

# Same idea for the algorithm names (whole-value replacements only).
crypto_df['Algorithm'] = (
    crypto_df['Algorithm']
    .str.strip()
    .replace('Proof-of-Authority', 'PoA')
    .replace('Leased POS', 'PoS')
)

print(f"Currently we have {crypto_df.shape[0]} crypto currencies\n")
print(crypto_df.dtypes)

Currently we have 1252 crypto currencies

Unnamed: 0          object
CoinName            object
Algorithm           object
IsTrading             bool
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply    float64
dtype: object


### Data Preprocessing

In [337]:
# Restrict the frame to the columns used by the rest of the notebook
columns_to_keep = [
    'CoinName',
    'Algorithm',
    'IsTrading',
    'ProofType',
    'TotalCoinsMined',
    'TotalCoinSupply',
]
crypto_df = crypto_df.loc[:, columns_to_keep]


In [338]:
# Keep the coins flagged as trading, then discard the flag column itself
# since every remaining row carries the same value.
trading_mask = crypto_df['IsTrading'] == True
crypto_df = crypto_df[trading_mask].drop(columns='IsTrading')

In [339]:
# Keep only cryptocurrencies with a working algorithm.
# It is not known here which algorithms are non-working, so the list below
# only demonstrates how such algorithms would be removed.
list_bad_alg = ['Dagger-Hashimoto']

# True for every row whose algorithm appears in the exclusion list
idx_bad_alg = crypto_df['Algorithm'].isin(list_bad_alg)
crypto_df = crypto_df[~idx_bad_alg]

In [340]:
# Remove the "IsTrading" column
# already removed

In [341]:
# Remove rows with at least 1 null value
crypto_df = crypto_df.dropna()

# Keep only coins with a non-negative total supply and a strictly positive
# number of mined coins. Note that this is a validity filter, not a
# standard-deviation based outlier trim: no spread statistic is involved.
valid_supply = crypto_df['TotalCoinSupply'] >= 0
has_mined_coins = crypto_df['TotalCoinsMined'] > 0
crypto_df = crypto_df[valid_supply & has_mined_coins]

In [342]:
# Remove rows with cryptocurrencies having no coins mined
# already done

In [343]:
# Drop rows where there are 'N/A' text values
# A row is removed when any of its cells holds the literal string 'N/A'.
has_na_text = crypto_df.isin(['N/A']).any(axis=1)
crypto_df = crypto_df.loc[~has_na_text]

# Per-column count of remaining 'N/A' strings; all zeros confirms the drop.
crypto_df.isin(['N/A']).sum()

CoinName           0
Algorithm          0
ProofType          0
TotalCoinsMined    0
TotalCoinSupply    0
dtype: int64

In [344]:
# Keep the coin names in a one-column DataFrame of their own before the
# column is moved out of crypto_df's feature columns.
CoinName = crypto_df.loc[:, ['CoinName']]

In [345]:
# Take 'CoinName' out of the feature columns by making it the row index, so
# it is not fed to the clustering algorithm but still labels each row.
crypto_df = crypto_df.set_index('CoinName')

# Preview the resulting frame
crypto_df.head()

Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
CoinName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42 Coin,Scrypt,PoW/PoS,41.99995,42.0
404Coin,Scrypt,PoW/PoS,1055185000.0,532000000.0
EliteCoin,X13,PoW/PoS,29279420000.0,314159300000.0
Bitcoin,SHA-256,PoW,17927180.0,21000000.0
Ethereum,Ethash,PoW,107684200.0,0.0


In [346]:
# One-hot encode the text features ('Algorithm', 'ProofType'); the numeric
# columns pass through unchanged.
X = pd.get_dummies(data=crypto_df)
X

Unnamed: 0_level_0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoS/LPoS,ProofType_PoS/PoW/PoT,ProofType_PoST,ProofType_PoT,ProofType_PoW,ProofType_PoW + Hive,ProofType_PoW/PoS,ProofType_PoW/nPoS,ProofType_TPoS,ProofType_Zero-Knowledge Proof
CoinName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
42 Coin,4.199995e+01,4.200000e+01,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
404Coin,1.055185e+09,5.320000e+08,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
EliteCoin,2.927942e+10,3.141593e+11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
Bitcoin,1.792718e+07,2.100000e+07,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
Ethereum,1.076842e+08,0.000000e+00,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZEPHYR,2.000000e+09,2.000000e+09,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Gapcoin,1.493105e+07,2.500000e+08,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
Beldex,9.802226e+08,1.400223e+09,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
Horizen,7.296538e+06,2.100000e+07,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [347]:
# Standardize every column of the encoded feature matrix.
# The fitted scaler is kept in X_scaler; the transformed array in X_scaled.
X_scaler = StandardScaler()
X_scaled = X_scaler.fit(X).transform(X)

### Reducing Dimensions Using PCA

In [348]:
# Use PCA to reduce dimensions to 3 principal components.
# random_state is pinned because the default 'auto' solver may select the
# randomized SVD for inputs of this size, which would otherwise make the
# components (and every downstream cluster) vary between runs.
pca = PCA(n_components=3, random_state=42)

In [351]:
# Project the standardized features onto the principal components
pca_data = pca.fit_transform(X_scaled)

# Wrap the first three components in a DataFrame whose rows are labelled
# with the coin names saved earlier in `CoinName`.
pca_df = pd.DataFrame(
    pca_data[:, :3],
    columns=['PCA1', 'PCA2', 'PCA3'],
    index=CoinName.to_numpy()[:, 0],
)

# Review the PCA DataFrame
pca_df.head()

Unnamed: 0,PCA1,PCA2,PCA3
42 Coin,-0.343527,1.085607,-0.531086
404Coin,-0.326855,1.085898,-0.531704
EliteCoin,2.290806,1.702049,-0.583824
Bitcoin,-0.139584,-1.366379,0.140103
Ethereum,-0.149851,-2.07294,0.364715


### Clustering Cryptocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [352]:
# Inertia (within-cluster sum of squares) for each candidate number of clusters
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values.
# n_init is set explicitly: its default differs between scikit-learn releases
# (10 restarts historically, 'auto' in newer versions), which would change the
# curve depending on the installed version.
for i in k:
    model = KMeans(n_clusters=i, n_init=10, random_state=42)
    model.fit(pca_df)
    inertia.append(model.inertia_)

# Create the DataFrame from the elbow data
elbow_df = pd.DataFrame({'k': k, 'inertia': inertia}).set_index('k')

# Plot inertia against k; the "elbow" of this line suggests the value of k
alt.Chart(elbow_df.reset_index()).mark_line().encode(
    x='k',
    y='inertia'
).interactive().properties(
    title="Inertia for kMean",
    width=600,
    height=300,
)

Running K-Means with `k=4`

In [353]:
# Initialize the K-Means model.
# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps the cluster assignment stable across versions.
best_model = KMeans(n_clusters=4, n_init=10, random_state=42)

# Fit the model
best_model.fit(pca_df)

# Predict clusters
cluster_prediction = best_model.predict(pca_df)

# Create a new DataFrame including predicted clusters and cryptocurrencies features
clustered_df = pd.concat([crypto_df, pca_df.copy()], axis=1)

# Look the standardized columns up by name in X rather than assuming they sit
# at positions 0 and 1 of the scaled array.
clustered_df['TotalCoinsMined_scaled'] = X_scaled[:, X.columns.get_loc('TotalCoinsMined')]
clustered_df['TotalCoinSupply_scaled'] = X_scaled[:, X.columns.get_loc('TotalCoinSupply')]

# Add a class column with the labels
clustered_df['Class'] = cluster_prediction
print(f'There are {len(clustered_df)} coins left')

# Flat copy for plotting: the coin names become a regular 'CoinName' column
# (the rename covers the case where the index came back unnamed as 'index').
df_scatter = clustered_df.reset_index()
df_scatter = df_scatter.rename(columns={'index': 'CoinName'})
clustered_df.head(10)

There are 531 coins left


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PCA1,PCA2,PCA3,TotalCoinsMined_scaled,TotalCoinSupply_scaled,Class
42 Coin,Scrypt,PoW/PoS,41.99995,42.0,-0.343527,1.085607,-0.531086,-0.11722,-0.153017,1
404Coin,Scrypt,PoW/PoS,1055185000.0,532000000.0,-0.326855,1.085898,-0.531704,-0.094103,-0.145163,1
EliteCoin,X13,PoW/PoS,29279420000.0,314159300000.0,2.290806,1.702049,-0.583824,0.524239,4.485014,1
Bitcoin,SHA-256,PoW,17927180.0,21000000.0,-0.139584,-1.366379,0.140103,-0.116827,-0.152707,0
Ethereum,Ethash,PoW,107684200.0,0.0,-0.149851,-2.07294,0.364715,-0.114861,-0.153017,0
Litecoin,Scrypt,PoW,63039240.0,84000000.0,-0.159171,-1.098255,-0.054437,-0.115839,-0.151777,0
Dash,X11,PoW/PoS,9031294.0,22000000.0,-0.409767,1.236282,-0.418075,-0.117022,-0.152693,1
Monero,CryptoNight-V7,PoW,17201140.0,0.0,-0.133203,-2.22091,0.330681,-0.116843,-0.153017,0
Ethereum Classic,Ethash,PoW,113359700.0,210000000.0,-0.148293,-2.073025,0.364681,-0.114736,-0.149917,0
ZCash,Equihash,PoW,7383056.0,21000000.0,-0.155396,-2.197578,0.259436,-0.117058,-0.152707,0


### Visualizing Results

#### Scatter Plot with Tradable Cryptocurrencies

In [354]:
# Pairwise scatter matrix of the two standardized amount columns, coloured
# by predicted cluster; hovering a point shows the coin's raw details.
scaled_features = ['TotalCoinsMined_scaled', 'TotalCoinSupply_scaled']

alt.Chart(df_scatter).mark_circle().encode(
    x=alt.X(alt.repeat("column"), type='quantitative'),
    y=alt.Y(alt.repeat("row"), type='quantitative'),
    color='Class:N',
    tooltip=['CoinName', 'Algorithm', 'TotalCoinsMined', 'TotalCoinSupply'],
).properties(width=150, height=150).repeat(
    row=scaled_features,
    column=scaled_features,
).interactive()

In [355]:
# Pairwise scatter matrix of the three principal components, coloured by
# predicted cluster; hovering a point shows the coin's raw details.
pca_features = ['PCA1', 'PCA2', 'PCA3']

alt.Chart(df_scatter).mark_circle().encode(
    x=alt.X(alt.repeat("column"), type='quantitative'),
    y=alt.Y(alt.repeat("row"), type='quantitative'),
    color='Class:N',
    tooltip=['CoinName', 'Algorithm', 'TotalCoinsMined', 'TotalCoinSupply'],
).properties(width=150, height=150).repeat(
    row=pca_features,
    column=pca_features,
).interactive()

In [357]:
# Single scatter plot: standardized coins mined (x) against standardized
# total supply (y), one point per coin, coloured by predicted cluster.
alt.Chart(df_scatter).mark_circle(size=60).encode(
    x=alt.X('TotalCoinsMined_scaled:Q'),
    y=alt.Y('TotalCoinSupply_scaled:Q'),
    color='Class:N',
    tooltip=['CoinName', 'Algorithm', 'TotalCoinsMined', 'TotalCoinSupply'],
).interactive()

#### Table of Tradable Cryptocurrencies

In [306]:
# Table with tradable cryptos: rich display of the clustered frame (original
# features, the three principal components, the two standardized amount
# columns and the predicted 'Class' label).
display(clustered_df)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PCA1,PCA2,PCA3,TotalCoinsMined_scaled,TotalCoinSupply_scaled,Class
42 Coin,Scrypt,PoW/PoS,4.199995e+01,4.200000e+01,-1.000835,0.406062,-0.465766,-0.236344,-0.151501,0
404Coin,Scrypt,PoW/PoS,1.055185e+09,5.320000e+08,-0.980800,0.427372,-0.467427,-0.148794,-0.141195,0
EliteCoin,X13,PoW/PoS,2.927942e+10,3.141593e+11,0.128926,1.999892,-0.666223,2.193008,5.934541,0
Bitcoin,SHA-256,PoW,1.792718e+07,2.100000e+07,1.028471,-0.788157,0.114605,-0.234857,-0.151095,1
Ethereum,Ethash,PoW,1.076842e+08,0.000000e+00,1.652497,-1.129891,0.250073,-0.227409,-0.151501,1
...,...,...,...,...,...,...,...,...,...,...
ZEPHYR,SHA-256,DPoS,2.000000e+09,2.000000e+09,-0.937802,-0.036963,0.604038,-0.070401,-0.112756,0
Gapcoin,Scrypt,PoW/PoS,1.493105e+07,2.500000e+08,-0.999596,0.406874,-0.465818,-0.235105,-0.146658,0
Beldex,CryptoNight,PoW,9.802226e+08,1.400223e+09,2.247225,-0.974159,0.273065,-0.155014,-0.124376,1
Horizen,Equihash,PoW,7.296538e+06,2.100000e+07,1.614475,-1.121933,0.158134,-0.235739,-0.151095,1


In [305]:
# Report how many cryptocurrencies remain in the clustered table
print(f'There are {clustered_df.shape[0]} of tradable cryptocurrencies')

There are 528 of tradable cryptocurrencies


In [311]:
# import hvplot.pandas
# clustered_df.hvplot.table(sortable=True, selectable=True)