# Clustering Crypto

In [16]:
# Initial imports
import requests
import pandas as pd
import matplotlib.pyplot as plt
import hvplot.pandas
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

### Fetching Cryptocurrency Data

In [17]:
# Use the following endpoint to fetch json data
url = "https://min-api.cryptocompare.com/data/all/coinlist"

response = requests.get(url).json()

In [18]:
# Create a DataFrame 
# HINT: You will need to use the 'Data' key from the json response, then transpose the DataFrame.
crypto_df = pd.DataFrame(response['Data']).T
crypto_df.head()

Unnamed: 0,Id,Url,ImageUrl,ContentCreatedOn,Name,Symbol,CoinName,FullName,Description,AssetTokenStatus,...,MaxSupply,MktCapPenalty,IsUsedInDefi,IsUsedInNft,PlatformType,AlgorithmType,Difficulty,BuiltOn,SmartContractAddress,DecimalPoints
42,4321,/coins/42/overview,/media/35650717/42.jpg,1427211129,42,42,42 Coin,42 Coin (42),Everything about 42 coin is 42 - apart from th...,,...,42.0,0.0,0.0,0.0,blockchain,scrypt,0.000244,,,
300,749869,/coins/300/overview,/media/27010595/300.png,1517935016,300,300,300 token,300 token (300),300 token is an ERC20 token. This Token was cr...,,...,300.0,0.0,0.0,0.0,token,,,ETH,0xaec98a708810414878c3bcdf46aad31ded4a4557,18.0
365,33639,/coins/365/overview,/media/352070/365.png,1480032918,365,365,365Coin,365Coin (365),365Coin is a Proof of Work and Proof of Stake ...,,...,-1.0,0.0,0.0,0.0,blockchain,,,,,
404,21227,/coins/404/overview,/media/35650851/404-300x300.jpg,1466100361,404,404,404Coin,404Coin (404),404 is a PoW/PoS hybrid cryptocurrency that al...,,...,-1.0,0.0,0.0,0.0,blockchain,,,,,
433,926547,/coins/433/overview,/media/34836095/433.png,1541597321,433,433,433 Token,433 Token (433),433 Token is a decentralised soccer platform t...,Finished,...,,,,,,,,,,


In [19]:
# Alternatively, use the provided csv file:
# file_path = Path("Resources/crypto_data.csv")

# Create a DataFrame

### Data Preprocessing

In [26]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'
crypto_df=crypto_df[['CoinName', 'Algorithm', 'IsTrading', 'ProofType', 'TotalCoinsMined', 'MaxSupply']]
crypto_df.head(10)

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.999952,42
365,365Coin,X11,True,PoW/PoS,0.0,-1
404,404Coin,Scrypt,True,PoW/PoS,0.0,-1
611,SixEleven,SHA-256,True,PoW,0.0,0
808,808,SHA-256,True,PoW/PoS,0.0,0
1337,EliteCoin,X13,True,PoW/PoS,0.0,0
2015,2015 coin,X11,True,PoW/PoS,0.0,0
XPD,PetroDollar,SHA-256D,True,,0.0,-1
ACOIN,ACoin,SHA-256,True,PoW,0.0,0
XMY,MyriadCoin,Multiple,True,PoW,0.0,2000000000


In [27]:
# Keep only cryptocurrencies that are trading
crypto_df=crypto_df[crypto_df["IsTrading"] == True]
print(crypto_df.shape)
crypto_df.head(10)

(698, 6)


Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.999952,42
365,365Coin,X11,True,PoW/PoS,0.0,-1
404,404Coin,Scrypt,True,PoW/PoS,0.0,-1
611,SixEleven,SHA-256,True,PoW,0.0,0
808,808,SHA-256,True,PoW/PoS,0.0,0
1337,EliteCoin,X13,True,PoW/PoS,0.0,0
2015,2015 coin,X11,True,PoW/PoS,0.0,0
XPD,PetroDollar,SHA-256D,True,,0.0,-1
ACOIN,ACoin,SHA-256,True,PoW,0.0,0
XMY,MyriadCoin,Multiple,True,PoW,0.0,2000000000


In [28]:
# Keep only cryptocurrencies with a working algorithm
crypto_df=crypto_df[crypto_df["Algorithm"] != "N/A"]
print(crypto_df.shape)
crypto_df.head(10)

(698, 6)


Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.999952,42
365,365Coin,X11,True,PoW/PoS,0.0,-1
404,404Coin,Scrypt,True,PoW/PoS,0.0,-1
611,SixEleven,SHA-256,True,PoW,0.0,0
808,808,SHA-256,True,PoW/PoS,0.0,0
1337,EliteCoin,X13,True,PoW/PoS,0.0,0
2015,2015 coin,X11,True,PoW/PoS,0.0,0
XPD,PetroDollar,SHA-256D,True,,0.0,-1
ACOIN,ACoin,SHA-256,True,PoW,0.0,0
XMY,MyriadCoin,Multiple,True,PoW,0.0,2000000000


In [29]:
# Remove the "IsTrading" column
crypto_df.drop("IsTrading", axis=1)
print(crypto_df.shape)
crypto_df.head(10)

(698, 6)


Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.999952,42
365,365Coin,X11,True,PoW/PoS,0.0,-1
404,404Coin,Scrypt,True,PoW/PoS,0.0,-1
611,SixEleven,SHA-256,True,PoW,0.0,0
808,808,SHA-256,True,PoW/PoS,0.0,0
1337,EliteCoin,X13,True,PoW/PoS,0.0,0
2015,2015 coin,X11,True,PoW/PoS,0.0,0
XPD,PetroDollar,SHA-256D,True,,0.0,-1
ACOIN,ACoin,SHA-256,True,PoW,0.0,0
XMY,MyriadCoin,Multiple,True,PoW,0.0,2000000000


In [31]:
# Remove rows with at least 1 null value
crypto_df = crypto_df.dropna(axis=0, how="any")
print(crypto_df.shape)
crypto_df.head(10)

(698, 6)


Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.999952,42
365,365Coin,X11,True,PoW/PoS,0.0,-1
404,404Coin,Scrypt,True,PoW/PoS,0.0,-1
611,SixEleven,SHA-256,True,PoW,0.0,0
808,808,SHA-256,True,PoW/PoS,0.0,0
1337,EliteCoin,X13,True,PoW/PoS,0.0,0
2015,2015 coin,X11,True,PoW/PoS,0.0,0
XPD,PetroDollar,SHA-256D,True,,0.0,-1
ACOIN,ACoin,SHA-256,True,PoW,0.0,0
XMY,MyriadCoin,Multiple,True,PoW,0.0,2000000000


In [33]:
# Remove rows with cryptocurrencies having no coins mined
crypto_df = crypto_df[crypto_df["TotalCoinsMined"] >0]
print(crypto_df.shape)
crypto_df.head(10)

(307, 6)


Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.999952,42
NSR,NuShares,PoS,True,PoS,6174061457.8311,0
TRI,Triangles Coin,X13,True,PoW/PoS,191620.848067,0
CMTC,CometCoin,Scrypt,True,PoW,872830.0,0
CHAT,OpenChat,Scrypt,True,PoW/PoS,1000000000.0,-1
QRL,Quantum Resistant Ledger,RandomX,True,PoW,76056603.447898,105000000
PURA,Pura,X11,True,PoW,188358976.839698,-1
ADK,Aidos Kuneen,IMesh,True,PoW,25000000.0,0
DAPS,DAPS Coin,Dagger,True,PoW/PoS/PoA,62319462900.0,70000000000
FOIN,Foin,SHA-256,True,,92631000.8161,100000000


In [34]:
# Drop rows where there are 'N/A' text values
crypto_df = crypto_df[crypto_df.iloc[:] != 'N/A'].dropna()
crypto_df.head(10)

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.999952,42
NSR,NuShares,PoS,True,PoS,6174061457.8311,0
TRI,Triangles Coin,X13,True,PoW/PoS,191620.848067,0
CMTC,CometCoin,Scrypt,True,PoW,872830.0,0
CHAT,OpenChat,Scrypt,True,PoW/PoS,1000000000.0,-1
QRL,Quantum Resistant Ledger,RandomX,True,PoW,76056603.447898,105000000
PURA,Pura,X11,True,PoW,188358976.839698,-1
ADK,Aidos Kuneen,IMesh,True,PoW,25000000.0,0
DAPS,DAPS Coin,Dagger,True,PoW/PoS/PoA,62319462900.0,70000000000
ZANO,Zano,ProgPowZ,True,PoW/PoS,13095648.382489,-1


In [36]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from crypto_df
coins_name = pd.DataFrame(crypto_df["CoinName"], index=crypto_df.index)
print(coins_name.shape)
coins_name.head(10)

(133, 1)


Unnamed: 0,CoinName
42,42 Coin
NSR,NuShares
TRI,Triangles Coin
CMTC,CometCoin
CHAT,OpenChat
QRL,Quantum Resistant Ledger
PURA,Pura
ADK,Aidos Kuneen
DAPS,DAPS Coin
ZANO,Zano


In [37]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm
crypto_df = crypto_df.drop("CoinName", axis=1)
print(crypto_df.shape)
crypto_df.head(10)

(133, 5)


Unnamed: 0,Algorithm,IsTrading,ProofType,TotalCoinsMined,MaxSupply
42,Scrypt,True,PoW/PoS,41.999952,42
NSR,PoS,True,PoS,6174061457.8311,0
TRI,X13,True,PoW/PoS,191620.848067,0
CMTC,Scrypt,True,PoW,872830.0,0
CHAT,Scrypt,True,PoW/PoS,1000000000.0,-1
QRL,RandomX,True,PoW,76056603.447898,105000000
PURA,X11,True,PoW,188358976.839698,-1
ADK,IMesh,True,PoW,25000000.0,0
DAPS,Dagger,True,PoW/PoS/PoA,62319462900.0,70000000000
ZANO,ProgPowZ,True,PoW/PoS,13095648.382489,-1


In [38]:
# Create dummy variables for text features
X = pd.get_dummies(data=crypto_df, columns=["Algorithm", "ProofType"])
print(X.shape)
X.head(10)

(133, 83)


Unnamed: 0,IsTrading,TotalCoinsMined,MaxSupply,Algorithm_Autolykos,Algorithm_BEP-2,Algorithm_BEP-20 Token,Algorithm_BLAKE256,Algorithm_BMW512 / Echo512,Algorithm_Blake2B + SHA3,Algorithm_Blake2b,...,ProofType_PoW/PoSe,ProofType_PoW/nPoS,ProofType_ProgPoW/PoS,ProofType_Proof of Authority,ProofType_Proof-of-Work,ProofType_SPoS,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW,ProofType_dPoW/PoW
42,True,41.999952,42,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NSR,True,6174061457.8311,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TRI,True,191620.848067,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CMTC,True,872830.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CHAT,True,1000000000.0,-1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
QRL,True,76056603.447898,105000000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PURA,True,188358976.839698,-1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ADK,True,25000000.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DAPS,True,62319462900.0,70000000000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZANO,True,13095648.382489,-1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
# Standardize data
X = StandardScaler().fit_transform(X)
X[:5]

array([[ 0.        , -0.08885976, -0.09325275, -0.08703883, -0.08703883,
        -0.08703883, -0.12356041, -0.08703883, -0.08703883, -0.12356041,
        -0.12356041, -0.15191091, -0.08703883, -0.08703883, -0.08703883,
        -0.23570226, -0.12356041, -0.08703883, -0.08703883, -0.08703883,
        -0.3002731 , -0.08703883, -0.08703883, -0.23570226, -0.08703883,
        -0.08703883, -0.12356041, -0.08703883, -0.08703883, -0.08703883,
        -0.08703883, -0.08703883, -0.08703883, -0.15191091, -0.08703883,
        -0.08703883, -0.12356041, -0.19764235, -0.08703883, -0.08703883,
        -0.12356041, -0.12356041, -0.3002731 , -0.12356041, -0.08703883,
        -0.08703883, -0.08703883,  2.18691762, -0.08703883, -0.08703883,
        -0.08703883, -0.08703883, -0.17609018, -0.08703883, -0.19764235,
        -0.12356041, -0.08703883, -0.08703883, -0.08703883, -0.08703883,
        -0.08703883, -0.25298221, -0.08703883, -0.08703883, -0.12356041,
        -0.12356041, -0.08703883, -0.31491833, -0.0

### Reducing Dimensions Using PCA

In [40]:
# Use PCA to reduce dimensions to 3 principal components
n_comp = 3
pca = PCA(n_components=n_comp)
principal_components = pca.fit_transform(X)
principal_components

array([[ 1.96758295e-01, -1.24561080e+00, -1.32022885e+00],
       [ 7.27993314e-01, -1.26380312e+00, -4.59097220e-01],
       [ 6.33460608e-01, -1.93655266e+00, -1.66606769e+00],
       [-8.38309323e-01,  4.93257512e-01, -3.10246894e-01],
       [ 1.96763120e-01, -1.24560705e+00, -1.32022886e+00],
       [-1.16887464e+00,  1.23903866e+00,  2.72030372e-01],
       [-6.40496508e-01,  3.01340064e-01, -2.17566014e-01],
       [-8.83977816e-01,  8.60563617e-01,  3.02043999e-01],
       [ 8.69317861e-01, -2.17464777e+00,  7.32883217e+00],
       [ 5.93062118e-01, -1.92689185e+00, -1.73532018e+00],
       [ 5.93043042e-01, -1.92686139e+00, -1.73531287e+00],
       [-1.16887044e+00,  1.23903186e+00,  2.72028806e-01],
       [-1.16991725e+00,  1.24071109e+00,  2.72416601e-01],
       [-8.38304669e-01,  4.93261135e-01, -3.10246903e-01],
       [ 5.93062084e-01, -1.92689188e+00, -1.73532018e+00],
       [-1.24785244e+00,  1.33808223e+00,  2.27958748e-01],
       [ 9.12317695e-01, -1.50761368e+00

In [41]:
# Create a DataFrame with the principal components data
col_names = [f"PC {i}" for i in range(1, n_comp + 1)]
pcs_df = pd.DataFrame(principal_components, columns=col_names, index=crypto_df.index)
print(pcs_df.shape)
pcs_df.head(10)

(133, 3)


Unnamed: 0,PC 1,PC 2,PC 3
42,0.196758,-1.245611,-1.320229
NSR,0.727993,-1.263803,-0.459097
TRI,0.633461,-1.936553,-1.666068
CMTC,-0.838309,0.493258,-0.310247
CHAT,0.196763,-1.245607,-1.320229
QRL,-1.168875,1.239039,0.27203
PURA,-0.640497,0.30134,-0.217566
ADK,-0.883978,0.860564,0.302044
DAPS,0.869318,-2.174648,7.328832
ZANO,0.593062,-1.926892,-1.73532


### Clustering Crytocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [42]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)

# Create the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

  "KMeans is known to have a memory leak on Windows "


Running K-Means with `k=<your best value for k here>`

In [43]:
# Initialize the K-Means model
model = KMeans(n_clusters=4, random_state=0)

# Fit the model
model.fit(pcs_df)

# Predict clusters
predictions = model.predict(pcs_df)

# Create a new DataFrame including predicted clusters and cryptocurrencies features
clustered_df = pd.concat([crypto_df, pcs_df], axis=1, sort=False)
clustered_df["CoinName"] = coins_name["CoinName"]
clustered_df["Class"] = model.labels_
print(clustered_df.shape)
clustered_df.head(10)

(133, 10)


Unnamed: 0,Algorithm,IsTrading,ProofType,TotalCoinsMined,MaxSupply,PC 1,PC 2,PC 3,CoinName,Class
42,Scrypt,True,PoW/PoS,41.999952,42,0.196758,-1.245611,-1.320229,42 Coin,3
NSR,PoS,True,PoS,6174061457.8311,0,0.727993,-1.263803,-0.459097,NuShares,3
TRI,X13,True,PoW/PoS,191620.848067,0,0.633461,-1.936553,-1.666068,Triangles Coin,3
CMTC,Scrypt,True,PoW,872830.0,0,-0.838309,0.493258,-0.310247,CometCoin,1
CHAT,Scrypt,True,PoW/PoS,1000000000.0,-1,0.196763,-1.245607,-1.320229,OpenChat,3
QRL,RandomX,True,PoW,76056603.447898,105000000,-1.168875,1.239039,0.27203,Quantum Resistant Ledger,1
PURA,X11,True,PoW,188358976.839698,-1,-0.640497,0.30134,-0.217566,Pura,1
ADK,IMesh,True,PoW,25000000.0,0,-0.883978,0.860564,0.302044,Aidos Kuneen,1
DAPS,Dagger,True,PoW/PoS/PoA,62319462900.0,70000000000,0.869318,-2.174648,7.328832,DAPS Coin,2
ZANO,ProgPowZ,True,PoW/PoS,13095648.382489,-1,0.593062,-1.926892,-1.73532,Zano,3


### Visualizing Results

#### Scatter Plot with Tradable Cryptocurrencies

In [45]:
# Scale data to create the scatter plot
mm_scaler = MinMaxScaler()
plot_data = mm_scaler.fit_transform(
    clustered_df[["MaxSupply", "TotalCoinsMined"]]
)
plot_df = pd.DataFrame(
    plot_data, columns=["MaxSupply", "TotalCoinsMined"], index=clustered_df.index
)
plot_df["CoinName"] = clustered_df["CoinName"]
plot_df["Class"] = clustered_df["Class"]
plot_df.head()

Unnamed: 0,MaxSupply,TotalCoinsMined,CoinName,Class
42,2.047619e-12,0.0,42 Coin,3
NSR,4.761905e-14,6.236426e-06,NuShares,3
TRI,4.761905e-14,1.93514e-10,Triangles Coin,3
CMTC,4.761905e-14,8.81604e-10,CometCoin,1
CHAT,0.0,1.010101e-06,OpenChat,3


In [46]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"
plot_df.hvplot.scatter(
    x="TotalCoinsMined", y="MaxSupply", hover_cols=["CoinName"], by="Class"
)

#### Table of Tradable Cryptocurrencies

In [47]:
# Table with tradable cryptos
clustered_df[
    [
        "CoinName",
        "Algorithm",
        "ProofType",
        "MaxSupply",
        "TotalCoinsMined",
        "Class",
    ]
].hvplot.table()

In [48]:
# Print the total number of tradable cryptocurrencies
print(f"There are {clustered_df.shape[0]} tradable cryptocurrencies.")

There are 133 tradable cryptocurrencies.
