# Clustering Crypto

In [123]:
# Initial imports
import pandas as pd
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


### Deliverable 1: Preprocessing the Data for PCA

In [124]:
# Load the crypto_data.csv dataset.
file = "crypto_data.csv"
df = pd.read_csv(file)
df = df.rename(columns= {
    "Unnamed: 0" : "Index"
})
df.head(10)


Unnamed: 0,Index,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0
5,1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
6,2015,2015 coin,X11,True,PoW/PoS,,0
7,BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
8,ETH,Ethereum,Ethash,True,PoW,107684200.0,0
9,LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


In [125]:
# Keep all the cryptocurrencies that are being traded.
def change_string(IsTrading):
    if IsTrading == "True":
        return 1
    else:
        return 0
df["IsTrading"] = df["IsTrading"].drop(0)

In [126]:
# Remove the "IsTrading" column. 
df = df.drop(columns = "IsTrading")

In [127]:
# Remove rows that have at least 1 null value.
df = df.dropna()

In [128]:
# Keep the rows where coins are mined.
df = df[df["TotalCoinsMined"] > 0]
df.head()

Unnamed: 0,Index,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
5,1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
8,ETH,Ethereum,Ethash,PoW,107684200.0,0


In [129]:
# Create a new DataFrame that holds only the cryptocurrencies names.
crypto_names = df[["Index","CoinName"]]
crypto_names = crypto_names.set_index("Index")
crypto_names.head(5)

Unnamed: 0_level_0,CoinName
Index,Unnamed: 1_level_1
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum


In [130]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm.
df = df.drop(columns='CoinName')
df = df.set_index("Index")

In [131]:
df.head(10)

Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,Scrypt,PoW/PoS,41.99995,42
404,Scrypt,PoW/PoS,1055185000.0,532000000
1337,X13,PoW/PoS,29279420000.0,314159265359
BTC,SHA-256,PoW,17927180.0,21000000
ETH,Ethash,PoW,107684200.0,0
LTC,Scrypt,PoW,63039240.0,84000000
DASH,X11,PoW/PoS,9031294.0,22000000
XMR,CryptoNight-V7,PoW,17201140.0,0
ETC,Ethash,PoW,113359700.0,210000000
ZEC,Equihash,PoW,7383056.0,21000000


In [132]:
# Use get_dummies() to create variables for text features.
X = pd.get_dummies(df)

In [133]:
# Standardize the data with StandardScaler().
crypto_scaled = StandardScaler().fit_transform(X)


### Deliverable 2: Reducing Data Dimensions Using PCA

In [134]:
# Using PCA to reduce dimension to three principal components.
pca = PCA(n_components=3)
crypto_pca = pca.fit_transform(crypto_scaled)

In [135]:
# Create a DataFrame with the three principal components.
pcs_df = pd.DataFrame(data= crypto_pca, 
                    index= df.index.copy(), 
                    columns=["PC1", "PC2", "PC3"])
pcs_df.head(10)

Unnamed: 0_level_0,PC1,PC2,PC3
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
42,-0.391932,-0.171467,-0.057348
404,-0.376648,-0.170864,-0.050955
1337,0.284219,-0.06937,-0.015439
BTC,-0.085008,-0.10951,-0.017121
ETH,-0.144078,-0.279677,-0.020277
LTC,-0.300093,-0.266344,-0.037263
DASH,-0.240579,0.013209,-0.036938
XMR,-0.053695,-0.089941,0.004753
ETC,-0.295175,-0.315749,0.00438
ZEC,-0.109788,-0.126977,-0.033259


### Deliverable 3: Clustering Crytocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [136]:
# Create an elbow curve to find the best value for K.
inertia = []
k = list(range(1, 11))
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve of PCA'd Crypto DF")


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3.



Running K-Means with `k=4`

In [145]:
# Initialize the K-Means model.
model = KMeans(n_clusters=8, random_state=0)
# Fit the model
model.fit(pcs_df)
# Predict clusters
predictions = model.predict(pcs_df)
pcs_df["Class"] = model.labels_

In [146]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features.
# Concatentate the crypto_df and pcs_df DataFrames on the same columns.
clustered_df = pd.concat([df, pcs_df.reindex(df.index)], axis= 1, join = "inner")
#  Add a new column, "CoinName" to the clustered_df DataFrame that holds the names of the cryptocurrencies. 
clustered_df = pd.concat([clustered_df, crypto_names["CoinName"]], axis= 1,)
#  Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.
# See above step!
# Print the shape of the clustered_df
print(clustered_df.shape)
clustered_df.head(10)

(577, 9)


Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC1,PC2,PC3,Class,CoinName
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
42,Scrypt,PoW/PoS,41.99995,42,-0.391932,-0.171467,-0.057348,0,42 Coin
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.376648,-0.170864,-0.050955,0,404Coin
1337,X13,PoW/PoS,29279420000.0,314159265359,0.284219,-0.06937,-0.015439,0,EliteCoin
BTC,SHA-256,PoW,17927180.0,21000000,-0.085008,-0.10951,-0.017121,0,Bitcoin
ETH,Ethash,PoW,107684200.0,0,-0.144078,-0.279677,-0.020277,0,Ethereum
LTC,Scrypt,PoW,63039240.0,84000000,-0.300093,-0.266344,-0.037263,0,Litecoin
DASH,X11,PoW/PoS,9031294.0,22000000,-0.240579,0.013209,-0.036938,0,Dash
XMR,CryptoNight-V7,PoW,17201140.0,0,-0.053695,-0.089941,0.004753,0,Monero
ETC,Ethash,PoW,113359700.0,210000000,-0.295175,-0.315749,0.00438,0,Ethereum Classic
ZEC,Equihash,PoW,7383056.0,21000000,-0.109788,-0.126977,-0.033259,0,ZCash


### Deliverable 4: Visualizing Cryptocurrencies Results

#### 3D-Scatter with Clusters

In [147]:
# Creating a 3D-Scatter with the PCA data and the clusters
fig = px.scatter_3d(
    clustered_df,
    x="PC1",
    y = "PC2",
    z = "PC3",
    color="Class",
    symbol="Class",
    hover_name = "CoinName",
    hover_data = ["Algorithm"],
    width = 800,
    height = 800,
    template = "plotly_dark"
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [148]:
# Create a table with tradable cryptocurrencies.
clustered_df.hvplot.table(columns=["CoinName", "Algorithm", "ProofType", "TotalCoinSupply", "TotalCoinsMined"], sortable=True,)

In [149]:
# Print the total number of tradable cryptocurrencies.
len(clustered_df.index)

577

In [150]:
# Scaling data to create the scatter plot with tradable cryptocurrencies.
from sklearn.preprocessing import MinMaxScaler as mms

coins = mms().fit_transform(clustered_df[["TotalCoinSupply", "TotalCoinsMined"]])
print(coins)


[[4.20000000e-11 0.00000000e+00]
 [5.32000000e-04 1.06585544e-03]
 [3.14159265e-01 2.95755135e-02]
 ...
 [2.10000000e-05 7.37028150e-06]
 [1.00000000e-06 1.29582282e-07]
 [1.00000000e-04 2.17085015e-05]]


In [151]:
# Create a new DataFrame that has the scaled data with the clustered_df DataFrame index.
coin_df = pd.DataFrame(coins, index= clustered_df.index.copy(), columns= ["TotalCoinSupply","TotalCoinsMined"])
# Add the "CoinName" column from the clustered_df DataFrame to the new DataFrame.
coin_df = pd.concat([coin_df, clustered_df["CoinName"]], axis= 1,)
# Add the "Class" column from the clustered_df DataFrame to the new DataFrame. 
plot_df = pd.concat([coin_df, clustered_df["Class"]], axis= 1,)
plot_df.head(10)

Unnamed: 0_level_0,TotalCoinSupply,TotalCoinsMined,CoinName,Class
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,4.2e-11,0.0,42 Coin,0
404,0.000532,0.001066,404Coin,0
1337,0.3141593,0.029576,EliteCoin,0
BTC,2.1e-05,1.8e-05,Bitcoin,0
ETH,0.0,0.000109,Ethereum,0
LTC,8.4e-05,6.4e-05,Litecoin,0
DASH,2.2e-05,9e-06,Dash,0
XMR,0.0,1.7e-05,Monero,0
ETC,0.00021,0.000115,Ethereum Classic,0
ZEC,2.1e-05,7e-06,ZCash,0


In [152]:
# Create a hvplot.scatter plot using x="TotalCoinsMined" and y="TotalCoinSupply".
plot_df.hvplot.scatter(x="TotalCoinsMined", y="TotalCoinSupply", color="Class", title="Total Coins Mined & Total Coin Supply", xlabel = "Total Coins Mined", ylabel = "Total Coin Supply")