<a href="https://colab.research.google.com/github/skinnytwinvale/Cryptocurrencies/blob/main/crypto_clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Clustering Crypto

In [16]:
# Initial imports
import pandas as pd
import hvplot.pandas
import holoviews as hv
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


In [14]:
!pip install holoviews

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [15]:
!pip install plotly

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [12]:
!pip install pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [13]:
!pip install path

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting path
  Downloading path-16.4.0-py3-none-any.whl (26 kB)
Installing collected packages: path
Successfully installed path-16.4.0


In [10]:
!pip install hvplot

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting hvplot
  Downloading hvplot-0.8.0-py2.py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 10.2 MB/s 
Installing collected packages: hvplot
Successfully installed hvplot-0.8.0


### Deliverable 1: Preprocessing the Data for PCA

In [26]:
# Load the crypto_data.csv dataset.
from google.colab import files
uploaded = files.upload()

Saving crypto_data.csv to crypto_data (1).csv


In [27]:
import io

In [28]:
file_path = Path('/Users/bigv/Desktop/Rutgers/Cryptocurrencies/crypto_data.csv')

In [37]:
df = pd.read_csv(io.BytesIO(uploaded['crypto_data.csv']))[:-2]

In [38]:
print(df)

     Unnamed: 0     CoinName    Algorithm  IsTrading ProofType  \
0            42      42 Coin       Scrypt       True   PoW/PoS   
1           365      365Coin          X11       True   PoW/PoS   
2           404      404Coin       Scrypt       True   PoW/PoS   
3           611    SixEleven      SHA-256       True       PoW   
4           808          808      SHA-256       True   PoW/PoS   
...         ...          ...          ...        ...       ...   
1245        BDX       Beldex  CryptoNight       True       PoW   
1246        ZEN      Horizen     Equihash       True       PoW   
1247        XBC  BitcoinPlus       Scrypt       True       PoS   
1248       DVTC   DivotyCoin       Scrypt      False   PoW/PoS   
1249       GIOT  Giotto Coin       Scrypt      False   PoW/PoS   

      TotalCoinsMined TotalCoinSupply  
0        4.199995e+01              42  
1                 NaN      2300000000  
2        1.055185e+09       532000000  
3                 NaN          611000  
4      

In [40]:
# Keep all the cryptocurrencies that are being traded.
df1 = df.loc[df['IsTrading'] == True]
df1.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [44]:
# Keep all the cryptocurrencies that have a working algorithm.
df2 = df1.dropna(axis=0, subset=['Algorithm'])
df2.info()

df0 = df1.sort_values(by='Algorithm', ascending=False)
df0.tail()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1144 entries, 0 to 1247
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       1144 non-null   object 
 1   CoinName         1144 non-null   object 
 2   Algorithm        1144 non-null   object 
 3   IsTrading        1144 non-null   bool   
 4   ProofType        1144 non-null   object 
 5   TotalCoinsMined  685 non-null    float64
 6   TotalCoinSupply  1144 non-null   object 
dtypes: bool(1), float64(1), object(5)
memory usage: 63.7+ KB


Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
1112,AQUA,Aquachain,Argon2,True,PoW,0.0,42000000
644,OPES,Opes,Argon2,True,PoW,,52000000
697,BOAT,Doubloon,536,True,PoW/PoS,,500000000
748,ESP,Espers,536,True,PoW/PoS,22801880000.0,50000000000
336,HODL,HOdlcoin,1GB AES Pattern Search,True,PoW,11448950.0,81962100


In [46]:
# Remove the "IsTrading" column. 
df3 = df2.drop(['IsTrading'], axis=1)
df3.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
1,365,365Coin,X11,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,PoW,,611000
4,808,808,SHA-256,PoW/PoS,0.0,0


In [47]:
# Remove rows that have at least 1 null value.
df4 = df3.dropna()
df4.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
4,808,808,SHA-256,PoW/PoS,0.0,0
5,1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000


In [48]:
# Keep the rows where coins are mined.
df5 = df4.loc[df4['TotalCoinsMined'] > 0]
df5.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
5,1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
8,ETH,Ethereum,Ethash,PoW,107684200.0,0


In [49]:
# Create a new DataFrame that holds only the cryptocurrencies names.
cc_names_df = df5[["CoinName"]]
cc_names_df.head()

Unnamed: 0,CoinName
0,42 Coin
2,404Coin
5,EliteCoin
7,Bitcoin
8,Ethereum


In [50]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm.
crypto_df = df5.drop(['CoinName'], axis=1)
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,Scrypt,PoW/PoS,41.99995,42
2,404,Scrypt,PoW/PoS,1055185000.0,532000000
5,1337,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,SHA-256,PoW,17927180.0,21000000
8,ETH,Ethash,PoW,107684200.0,0


In [60]:
# Use get_dummies() to create variables for text features.
X = pd.get_dummies(crypto_df, columns=['Algorithm', 'ProofType', 'Unnamed: 0'])
X.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,Unnamed: 0_ZEC,Unnamed: 0_ZEIT,Unnamed: 0_ZEN,Unnamed: 0_ZEPH,Unnamed: 0_ZER,Unnamed: 0_ZET,Unnamed: 0_ZNE,Unnamed: 0_ZNY,Unnamed: 0_ZOI,Unnamed: 0_ZYD
0,41.99995,42,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1055185000.0,532000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,29279420000.0,314159265359,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,17927180.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,107684200.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [61]:
# Standardize the data with StandardScaler().
X_scaled = StandardScaler().fit_transform(X)
print(X_scaled[0:5])

[[-0.11710817 -0.1528703  -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [-0.09396955 -0.145009   -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [ 0.52494561  4.48942416 -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [-0.11671506 -0.15255998 -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [-0.11474682 -0.1528703  -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]]


### Deliverable 2: Reducing Data Dimensions Using PCA

In [62]:
# Using PCA to reduce dimension to three principal components.
pca = PCA(n_components=3)
pca

PCA(n_components=3)

In [63]:
# Create a DataFrame with the three principal components.
X_pca = pca.fit_transform(X_scaled)
X_pca

array([[-0.3972203 ,  1.10909272, -0.63471847],
       [-0.37703469,  1.1107411 , -0.63276045],
       [ 2.70220376,  1.99252462, -0.76505793],
       ...,
       [ 0.41421584, -2.78907539,  0.53201421],
       [-0.1159447 , -2.44129434,  0.54227355],
       [-0.38507313,  1.09965596, -0.29527015]])

In [64]:
index_values = (X.index.tolist())
index_values

[0,
 2,
 5,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 19,
 20,
 24,
 27,
 37,
 39,
 42,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 54,
 61,
 62,
 63,
 73,
 74,
 76,
 77,
 80,
 81,
 82,
 84,
 85,
 86,
 94,
 95,
 98,
 99,
 100,
 101,
 107,
 110,
 112,
 113,
 114,
 120,
 122,
 124,
 129,
 130,
 131,
 133,
 135,
 137,
 139,
 140,
 144,
 146,
 147,
 148,
 149,
 151,
 153,
 155,
 157,
 162,
 168,
 174,
 175,
 176,
 179,
 180,
 181,
 182,
 183,
 186,
 187,
 188,
 189,
 190,
 193,
 195,
 196,
 197,
 200,
 202,
 203,
 205,
 208,
 210,
 212,
 214,
 215,
 218,
 220,
 224,
 225,
 226,
 227,
 228,
 229,
 231,
 232,
 234,
 235,
 239,
 244,
 246,
 247,
 248,
 251,
 254,
 256,
 263,
 267,
 273,
 276,
 281,
 286,
 289,
 294,
 301,
 302,
 303,
 307,
 315,
 320,
 321,
 322,
 324,
 325,
 327,
 330,
 331,
 332,
 333,
 335,
 336,
 337,
 338,
 339,
 341,
 343,
 345,
 347,
 352,
 355,
 358,
 359,
 362,
 363,
 369,
 371,
 372,
 373,
 375,
 385,
 391,
 397,
 403,
 404,
 413,
 420,
 422,
 426,
 428,
 43

In [65]:
pcs_df = pd.DataFrame(data = X_pca, columns=["PC 1", "PC 2", "PC 3"], index = index_values)
pcs_df.head()

Unnamed: 0,PC 1,PC 2,PC 3
0,-0.39722,1.109093,-0.634718
2,-0.377035,1.110741,-0.63276
5,2.702204,1.992525,-0.765058
7,-0.141643,-1.49969,0.15411
8,-0.214176,-2.335362,0.42004


### Deliverable 3: Clustering Crytocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [66]:
# Create an elbow curve to find the best value for K.
inertia = []
k = list(range(1, 11))

In [67]:
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)

In [68]:
# Plot the elbow curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

Running K-Means with `k=4`

In [70]:
# Initialize the K-Means model.
model = KMeans(n_clusters=4, random_state=0)

# Fit the model
model.fit(pcs_df)

# Predict clusters
predictions = model.predict(pcs_df)
print(predictions)
pcs_df["Class"] = model.labels_

[3 3 3 0 0 0 3 0 0 0 3 0 3 3 0 3 0 0 3 3 0 0 0 0 0 3 0 0 0 3 0 3 0 0 3 3 0
 0 0 0 0 0 3 3 0 0 0 0 0 3 3 0 3 0 0 0 0 3 0 0 3 0 3 3 3 0 0 0 3 3 3 3 3 0
 0 0 3 3 0 3 0 3 3 0 0 0 0 3 3 0 3 0 0 3 3 0 3 3 0 0 3 3 0 3 3 0 3 0 3 0 3
 0 3 3 0 0 3 0 0 0 3 0 0 0 0 0 3 0 0 0 0 3 0 3 0 0 3 0 3 0 3 3 0 0 3 0 0 3
 3 0 3 0 3 3 3 0 0 0 0 3 3 3 3 3 0 0 3 3 3 3 3 0 3 3 3 3 3 0 3 0 3 3 0 3 0
 3 3 0 3 0 3 0 0 0 3 3 3 3 0 3 3 3 3 3 0 0 3 3 0 0 3 3 3 3 3 0 3 3 3 3 3 3
 3 3 0 3 3 3 3 3 3 0 0 0 3 3 3 3 0 3 0 3 3 0 3 0 0 3 0 0 3 0 3 3 3 0 3 3 0
 3 3 3 3 3 3 3 0 3 0 3 3 3 3 0 3 0 3 0 0 0 0 3 0 3 3 0 3 0 0 0 3 0 3 0 0 0
 3 0 3 0 3 3 3 0 3 0 0 0 0 0 3 3 0 3 3 3 0 3 0 3 0 3 0 3 3 3 3 0 3 3 0 3 3
 3 0 0 0 0 3 3 3 3 0 3 0 0 0 3 3 0 0 3 3 0 3 0 0 0 3 0 0 3 3 3 0 0 0 3 3 3
 0 0 3 0 0 0 0 3 1 1 0 0 0 3 1 3 3 3 3 0 0 0 0 3 3 3 0 3 0 3 3 3 3 0 3 3 0
 3 3 0 0 3 0 3 0 0 0 0 3 3 0 3 0 3 3 3 3 3 3 0 0 0 3 3 3 3 3 3 0 3 0 0 0 0
 3 3 3 3 0 3 3 0 3 3 0 1 0 3 0 0 3 3 0 3 0 0 3 0 0 3 0 3 0 3 3 0 3 3 3 3 3
 0 0 0 3 3 3 0 3 0 3 0 3 

In [69]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features.
# Concatentate the crypto_df and pcs_df DataFrames on the same columns.
clustered_df = crypto_df.join(pcs_df, how='inner')
clustered_df.head()

#  Add a new column, "CoinName" to the clustered_df DataFrame that holds the names of the cryptocurrencies. 
clustered_df = clustered_df.join(cc_names_df, how='inner')
clustered_df.head()

#  Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.
#print(clustered_df.shape)
#clustered_df.head(10)

# Print the shape of the clustered_df
print(clustered_df.shape)
clustered_df.head(10)

(532, 9)


Unnamed: 0.1,Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName
0,42,Scrypt,PoW/PoS,41.99995,42,-0.39722,1.109093,-0.634718,42 Coin
2,404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.377035,1.110741,-0.63276,404Coin
5,1337,X13,PoW/PoS,29279420000.0,314159265359,2.702204,1.992525,-0.765058,EliteCoin
7,BTC,SHA-256,PoW,17927180.0,21000000,-0.141643,-1.49969,0.15411,Bitcoin
8,ETH,Ethash,PoW,107684200.0,0,-0.214176,-2.335362,0.42004,Ethereum
9,LTC,Scrypt,PoW,63039240.0,84000000,-0.21443,-1.363056,-0.013804,Litecoin
10,DASH,X11,PoW/PoS,9031294.0,22000000,-0.467514,1.324996,-0.629233,Dash
11,XMR,CryptoNight-V7,PoW,17201140.0,0,-0.179758,-2.902356,0.472059,Monero
12,ETC,Ethash,PoW,113359700.0,210000000,-0.212432,-2.333306,0.419613,Ethereum Classic
13,ZEC,Equihash,PoW,7383056.0,21000000,-0.11473,-2.43808,0.541101,ZCash


### Deliverable 4: Visualizing Cryptocurrencies Results

#### 3D-Scatter with Clusters

In [73]:
# Creating a 3D-Scatter with the PCA data and the clusters
fig = px.scatter_3d(
    clustered_df, 
    x="PC 1", 
    y="PC 2", 
    z="PC 3", 
    color="CoinName", 
    symbol="CoinName", 
    hover_name="CoinName", 
    hover_data=["Algorithm", "TotalCoinsMined", "TotalCoinSupply"])
fig.update_layout(legend=dict(x=0, y=1))
fig.show()


In [75]:
# Create a table with tradable cryptocurrencies.
clustered_df.hvplot.table(columns=['CoinName', 'Algorithm', 'ProofType', 'TotalCoinsMined', 'TotalCoinSupply'], sortable=True, selectable=True)

In [76]:
# Print the total number of tradable cryptocurrencies.
clustered_df['CoinName'].count()

532

In [77]:
# Scaling data to create the scatter plot with tradable cryptocurrencies.
cluster_df = clustered_df[['TotalCoinSupply', 'TotalCoinsMined']]
X_minmax = MinMaxScaler().fit_transform(cluster_df)
X_minmax

array([[4.20000000e-11, 0.00000000e+00],
       [5.32000000e-04, 1.06585544e-03],
       [3.14159265e-01, 2.95755135e-02],
       ...,
       [1.40022261e-03, 9.90135079e-04],
       [2.10000000e-05, 7.37028150e-06],
       [1.00000000e-06, 1.29582282e-07]])

In [82]:
# Create a new DataFrame that has the scaled data with the clustered_df DataFrame index.
index_values = (clustered_df.index.tolist())
plot_df = pd.DataFrame(
    data = X_minmax, columns=["TotalCoinSupply_scaled", "TotalCoinsMined_scaled"], index = index_values)

# Add the "CoinName" column from the clustered_df DataFrame to the new DataFrame.
plot_df = plot_df.join(cc_names_df, how='inner')

# Add the "Class" column from the clustered_df DataFrame to the new DataFrame. 
coinname_df = clustered_df['CoinName']
plot_df = plot_df.join(coinname_df, how='inner', lsuffix='_left', rsuffix='_right')

plot_df.head(10)

Unnamed: 0,TotalCoinSupply_scaled,TotalCoinsMined_scaled,CoinName_left,CoinName_right
0,4.2e-11,0.0,42 Coin,42 Coin
2,0.000532,0.001066,404Coin,404Coin
5,0.3141593,0.029576,EliteCoin,EliteCoin
7,2.1e-05,1.8e-05,Bitcoin,Bitcoin
8,0.0,0.000109,Ethereum,Ethereum
9,8.4e-05,6.4e-05,Litecoin,Litecoin
10,2.2e-05,9e-06,Dash,Dash
11,0.0,1.7e-05,Monero,Monero
12,0.00021,0.000115,Ethereum Classic,Ethereum Classic
13,2.1e-05,7e-06,ZCash,ZCash


In [84]:
plot_df.head()

Unnamed: 0,TotalCoinSupply_scaled,TotalCoinsMined_scaled,CoinName_left,CoinName_right
0,4.2e-11,0.0,42 Coin,42 Coin
2,0.000532,0.001066,404Coin,404Coin
5,0.3141593,0.029576,EliteCoin,EliteCoin
7,2.1e-05,1.8e-05,Bitcoin,Bitcoin
8,0.0,0.000109,Ethereum,Ethereum


In [86]:
# Create a hvplot.scatter plot using x="TotalCoinsMined" and y="TotalCoinSupply".
plot_df.hvplot.scatter(x="TotalCoinsMined_scaled", y="TotalCoinSupply_scaled", by="CoinName_left",
                          xlabel="Total Cryptocurrency Coins Mined",
                          ylabel="Total Cryptocurrency Coin Supply",
                          )