# Clustering Crypto

In [35]:
# Initial imports
import requests
import pandas as pd
import matplotlib.pyplot as plt
import hvplot.pandas
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


### Fetching Cryptocurrency Data

In [36]:
# Use the following endpoint to fetch json data
# NOTE(review): none of the cells shown here issue a request to this URL; the data
# is loaded from a local CSV file instead.
url = "https://min-api.cryptocompare.com/data/all/coinlist"


In [38]:
# Use the standard library's pathlib rather than the third-party `path` package;
# pandas.read_csv accepts pathlib.Path objects directly.
from pathlib import Path

In [39]:
# Alternative to the API endpoint: read the CSV copy that ships with the notebook.
file_path = Path("Resources/crypto_data.csv")

# Build the DataFrame, using the file's "Unnamed: 0" column as the row index.
df_crypto = pd.read_csv(filepath_or_buffer=file_path, index_col="Unnamed: 0")
df_crypto.head(n=10)

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0
1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
2015,2015 coin,X11,True,PoW/PoS,,0
BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,True,PoW,107684200.0,0
LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


### Data Preprocessing

In [40]:
# Keep only necessary columns.
# Select them explicitly so the frame's schema is guaranteed to match this list;
# previously this cell only previewed the frame without selecting anything.
df_crypto = df_crypto[
    [
        "CoinName",
        "Algorithm",
        "IsTrading",
        "ProofType",
        "TotalCoinsMined",
        "TotalCoinSupply",
    ]
]

df_crypto.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0


In [41]:
# Retain only the rows whose 'IsTrading' flag is True

df_crypto = df_crypto.loc[df_crypto["IsTrading"].eq(True)]

In [42]:
# Ad-hoc check: look up any row whose CoinName is exactly '300'
# (the recorded output below is an empty frame).
df_crypto[df_crypto['CoinName'] == '300']

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply


In [43]:
# Keep only cryptocurrencies with a working algorithm.
# pandas.read_csv parses the text 'N/A' as a missing value (NaN) by default, and
# NaN != 'N/A' evaluates to True, so comparing against the string alone keeps those
# rows. Exclude missing algorithms explicitly as well as any literal 'N/A' text.

has_algorithm = df_crypto["Algorithm"].notna() & (df_crypto["Algorithm"] != "N/A")
df_crypto = df_crypto[has_algorithm]

In [44]:
# Show how many remaining cryptocurrencies use each algorithm
display(df_crypto['Algorithm'].value_counts())

Scrypt                394
X11                   182
SHA-256               121
X13                    54
PoS                    42
                     ... 
Proof-of-Authority      1
YescryptR16             1
ECC 256K1               1
QUAIT                   1
Cryptonight-GPU         1
Name: Algorithm, Length: 89, dtype: int64

In [45]:
# Remove the 'IsTrading' column.
# Rebind the name instead of dropping in place: df_crypto was produced by boolean
# filtering, and mutating such a result in place can raise SettingWithCopyWarning.

df_crypto = df_crypto.drop(columns="IsTrading")

In [46]:
# Remove rows with at least 1 null value.
# Rebind the name instead of using inplace=True: df_crypto is derived from earlier
# boolean filtering, and mutating it in place can raise SettingWithCopyWarning.

df_crypto = df_crypto.dropna()

In [47]:
# Discard cryptocurrencies for which no coins have been mined (TotalCoinsMined <= 0)

df_crypto = df_crypto.loc[df_crypto["TotalCoinsMined"].gt(0)]

In [48]:
# Drop rows where there are 'N/A' text values.
# Indexing a DataFrame with a boolean DataFrame only masks the non-matching cells to
# NaN and keeps every row, so collapse the comparison to one boolean per row first
# and use that Series to select the rows to keep.

rows_without_na_text = (df_crypto != "N/A").all(axis=1)
df_crypto = df_crypto[rows_without_na_text]

In [49]:
# Store the 'CoinName' column in its own object prior to dropping it from df_crypto.
# Single-bracket selection yields a pandas Series (not a DataFrame), as the recorded
# output below shows; a later cell reads it through df_new.values.

df_new = df_crypto["CoinName"]
df_new.head()

42        42 Coin
404       404Coin
1337    EliteCoin
BTC       Bitcoin
ETH      Ethereum
Name: CoinName, dtype: object

In [50]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm.
# The result is rebound rather than dropped with inplace=True, so the step is a plain
# expression and does not mutate a frame that other objects may still reference.

df_crypto = df_crypto.drop(columns="CoinName")

df_crypto.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,41.99995,42
404,Scrypt,PoW/PoS,1055185000.0,532000000
1337,X13,PoW/PoS,29279420000.0,314159265359
BTC,SHA-256,PoW,17927180.0,21000000
ETH,Ethash,PoW,107684200.0,0


In [51]:
# One-hot encode the two text feature columns; the numeric columns pass through as-is
text_feature_columns = ["Algorithm", "ProofType"]
X = pd.get_dummies(data=df_crypto, columns=text_feature_columns)

X.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,41.99995,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1055185000.0,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,29279420000.0,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,17927180.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,107684200.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [52]:
# Standardize the feature matrix: learn the per-column scaling, then apply it

feature_scaler = StandardScaler()
X = feature_scaler.fit(X).transform(X)

# Peek at the first two standardized rows
X[:2]

array([[-0.11710817, -0.1528703 , -0.0433963 , -0.0433963 , -0.0433963 ,
        -0.06142951, -0.07530656, -0.0433963 , -0.06142951, -0.06142951,
        -0.0433963 , -0.0433963 , -0.19245009, -0.06142951, -0.09740465,
        -0.0433963 , -0.11547005, -0.07530656, -0.0433963 , -0.0433963 ,
        -0.15191091, -0.0433963 , -0.13118084, -0.0433963 , -0.0433963 ,
        -0.08703883, -0.0433963 , -0.0433963 , -0.0433963 , -0.0433963 ,
        -0.06142951, -0.0433963 , -0.08703883, -0.08703883, -0.08703883,
        -0.0433963 , -0.13118084, -0.13840913, -0.13840913, -0.0433963 ,
        -0.06142951, -0.0433963 , -0.07530656, -0.18168574, -0.0433963 ,
        -0.0433963 , -0.0433963 , -0.07530656, -0.15826614, -0.31491833,
        -0.0433963 , -0.08703883, -0.07530656, -0.06142951,  1.38675049,
        -0.0433963 , -0.0433963 , -0.06142951, -0.0433963 , -0.0433963 ,
        -0.0433963 , -0.0433963 , -0.0433963 , -0.0433963 , -0.0433963 ,
        -0.0433963 , -0.39879994, -0.0433963 , -0.1

### Reducing Dimensions Using PCA

In [53]:
# Reduce the standardized features to 3 principal components

# Configure the PCA model
n_principal_components = 3
pca = PCA(n_components=n_principal_components)

# Fit the model on X and project X onto the fitted components in one step
X_pca = pca.fit_transform(X)

In [54]:
# Wrap the principal components in a DataFrame that shares df_crypto's row index

pc_columns = ["pc1", "pc2", "pc3"]
pca_df = pd.DataFrame(X_pca, index=df_crypto.index, columns=pc_columns)
pca_df.head()

Unnamed: 0,pc1,pc2,pc3
42,-0.336265,1.026418,-0.580594
404,-0.319652,1.02642,-0.581025
1337,2.312178,1.680129,-0.690679
BTC,-0.12872,-1.308284,0.192351
ETH,-0.141467,-1.986917,0.358439


### Clustering Cryptocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [55]:
# Candidate cluster counts: 1 through 10
k = list(range(1, 11))

# Fit one K-Means model per candidate value and record its inertia
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(pca_df).inertia_
    for n_clusters in k
]

# Create the Elbow Curve using hvPlot
df_elbow = pd.DataFrame({"k": k, "inertia": inertia})
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)


Running K-Means with `k=4`

In [67]:
# Build a 4-cluster K-Means model and fit it on the principal components
# (fit returns the fitted estimator itself, so the two steps chain)

model = KMeans(n_clusters=4, random_state=5).fit(pca_df)

# Predict clusters for the same rows the model was fitted on
# (kept for inspection; the frame below takes its labels from model.labels_)

predictions = model.predict(pca_df)

# Combine the cryptocurrency features, the principal components, the cluster labels
# and the coin names into one frame. Labels and names are attached positionally.

df_clustered = pd.concat([df_crypto, pca_df], axis=1).assign(
    pred_clusters=model.labels_,
    CoinName=df_new.values,
)
df_clustered.head()


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,pc1,pc2,pc3,pred_clusters,CoinName
42,Scrypt,PoW/PoS,41.99995,42,-0.336265,1.026418,-0.580594,0,42 Coin
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.319652,1.02642,-0.581025,0,404Coin
1337,X13,PoW/PoS,29279420000.0,314159265359,2.312178,1.680129,-0.690679,0,EliteCoin
BTC,SHA-256,PoW,17927180.0,21000000,-0.12872,-1.308284,0.192351,1,Bitcoin
ETH,Ethash,PoW,107684200.0,0,-0.141467,-1.986917,0.358439,1,Ethereum


In [58]:
# Sanity check: number of cluster labels produced by the fitted model
len (model.labels_)

532

In [59]:
# Sanity check: (rows, columns) of the feature frame; compare the row count with the
# number of cluster labels
df_crypto.shape

(532, 4)

In [60]:
# Sanity check: (rows, columns) of the principal-components frame
pca_df.shape

(532, 3)

In [69]:
# NOTE(review): this repeats the plotly.express import from the first cell. The
# recorded output shows it failing with "AttributeError: module 'pandas' has no
# attribute 'Panel'" — presumably a library version mismatch; confirm the environment.
import plotly.express as px

AttributeError: module 'pandas' has no attribute 'Panel'

### Visualizing Results

#### 3D-Scatter with Clusters

In [68]:
# Create a 3D-Scatter with the PCA data and the clusters.
# Plot from df_clustered rather than pca_df: pca_df only holds pc1/pc2/pc3, so the
# previous color="class" / symbol="class" referred to a column that does not exist.
# The cluster labels live in df_clustered["pred_clusters"]; they are cast to strings
# so Plotly treats them as discrete categories rather than a continuous color scale.

scatter_3d_data = df_clustered.assign(
    pred_clusters=df_clustered["pred_clusters"].astype(str)
)

fig = px.scatter_3d(
    scatter_3d_data,
    x="pc3",
    y="pc2",
    z="pc1",
    color="pred_clusters",
    symbol="pred_clusters",
    hover_name="CoinName",    # show the coin's name when hovering over a point
    hover_data=["Algorithm"],
    width=800,
)
# Anchor the legend at the top-left corner of the figure
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

NameError: name 'px' is not defined

#### Table of Tradable Cryptocurrencies

In [76]:
# Table with tradable cryptos.
# 'CoinName' is included so each row of the table can be identified by its coin.

table_columns = [
    "CoinName",
    "Algorithm",
    "ProofType",
    "TotalCoinsMined",
    "TotalCoinSupply",
    "pred_clusters",
]
df_clustered[table_columns].hvplot.table()

In [82]:
# Total number of tradable cryptocurrencies = number of rows in the clustered frame

len(df_clustered)

532

In [80]:
# Scratch check of tuple indexing. The variable is deliberately not called `tuple`:
# that name would shadow the built-in type for every cell run afterwards in the kernel.
example_pair = (1, 5)
example_pair[1]

5

#### Scatter Plot with Tradable Cryptocurrencies

In [98]:
# Min-max scale the two supply columns for the scatter plot

supply_columns = ["TotalCoinsMined", "TotalCoinSupply"]

scaler = MinMaxScaler()
scaled_data = pd.DataFrame(
    scaler.fit_transform(df_clustered[supply_columns]),
    columns=supply_columns,
    index=df_clustered.index,
)

# Carry over the coin names and cluster labels (aligned on the shared index)
for label_column in ("CoinName", "pred_clusters"):
    scaled_data[label_column] = df_clustered[label_column]

In [99]:
# Scatter of scaled mined coins (x) against scaled total supply (y),
# grouped by predicted cluster, with the coin name shown on hover

scatter_options = {
    "x": "TotalCoinsMined",
    "y": "TotalCoinSupply",
    "hover_cols": ["CoinName"],
    "by": "pred_clusters",
}
scaled_data.hvplot.scatter(**scatter_options)

In [97]:
# Preview the first rows of the min-max scaled frame.
# NOTE(review): the recorded output below has no CoinName column and this cell's
# execution count (97) precedes the cell that builds scaled_data (98), so the output
# looks stale — re-run to confirm.
scaled_data.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,pred_clusters
42,0.0,4.2e-11,0
404,0.001066,0.000532,0
1337,0.029576,0.3141593,0
BTC,1.8e-05,2.1e-05,1
ETH,0.000109,0.0,1


In [94]:
# Inspect the cluster label assigned to each cryptocurrency
df_clustered["pred_clusters"]

42      0
404     0
1337    0
BTC     1
ETH     1
       ..
ZEPH    0
GAP     0
BDX     1
ZEN     1
XBC     0
Name: pred_clusters, Length: 532, dtype: int32