In [102]:
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering, KMeans
import hvplot.pandas
import plotly.express as px

In [103]:
# Load data
file = "Resources/crypto_data.csv"
crypto_df = pd.read_csv(file)
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


# Data Preprocessing

In [104]:
# Remove all cryptocurrencies that aren't trading
crypto_df = crypto_df[crypto_df["IsTrading"] == True]
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [105]:
# Remove all cryptocurrencies that don’t have an algorithm defined
crypto_df['Algorithm'].isnull().sum()

0

In [106]:
# Remove the IsTrading column
crypto_df = crypto_df.drop("IsTrading", axis=1)

In [107]:
# Remove all cryptocurrencies with at least one null value
crypto_df.isnull().sum()

Unnamed: 0           0
CoinName             0
Algorithm            0
ProofType            0
TotalCoinsMined    459
TotalCoinSupply      0
dtype: int64

In [108]:
# Remove all cryptocurrencies without coins mined
crypto_df = crypto_df.dropna()
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
4,808,808,SHA-256,PoW/PoS,0.0,0
5,1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000


In [68]:
# Store the names of all cryptocurrencies on a DataFramed named coins_name,
# and use the crypto_df.index as the index for this new DataFrame
coins_name = pd.DataFrame(crypto_df[["Unnamed: 0", "CoinName"]])
coins_name.set_index("Unnamed: 0", drop = True, inplace = True)
coins_name.head()

Unnamed: 0_level_0,CoinName
Unnamed: 0,Unnamed: 1_level_1
42,42 Coin
404,404Coin
808,808
1337,EliteCoin
BTC,Bitcoin


In [69]:
# Remove the CoinName column
cleaned_crypto_df = crypto_df.drop(columns = ["CoinName"])
cleaned_crypto_df.head()

Unnamed: 0.1,Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,Scrypt,PoW/PoS,41.99995,42
2,404,Scrypt,PoW/PoS,1055185000.0,532000000
4,808,SHA-256,PoW/PoS,0.0,0
5,1337,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,SHA-256,PoW,17927180.0,21000000


In [70]:
cleaned_crypto_df.dtypes

Unnamed: 0          object
Algorithm           object
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [71]:
cleaned_crypto_df["TotalCoinSupply"] = cleaned_crypto_df["TotalCoinSupply"].astype('float')

In [72]:
# Create dummies variables for all of the text features, and store the 
# resulting data on a DataFrame
X = pd.get_dummies(cleaned_crypto_df[["Algorithm","ProofType"]])

In [73]:
# Standardize all of the data from the X DataFrame
scale_model = StandardScaler()
scaled_X = scale_model.fit_transform(X)

# Reducing Data Dimensions Using PCA

In [85]:
# Reducing X DataFrame Dimensions Using PCA to 3 features
pca = PCA(n_components=3, random_state = 1)
X_pca = pca.fit_transform(scaled_X)
print(f'PCA ratio: {pca.explained_variance_ratio_}')

PCA ratio: [0.01956545 0.01875421 0.01871681]


In [75]:
pca.explained_variance_

array([2.09656331, 2.00963455, 2.00562694])

In [89]:
pcs_df = pd.DataFrame(X_pca, index=cleaned_crypto_df["Unnamed: 0"], columns=["PC 1","PC 2","PC 3"])
pcs_df.head(10)

Unnamed: 0_level_0,PC 1,PC 2,PC 3
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
42,-1.303547,-0.406755,-0.003655
404,-1.303547,-0.406755,-0.003655
808,-0.776747,-0.249798,0.001913
1337,-1.928004,-0.4627,0.001467
BTC,1.5075,0.115764,0.007744
ETH,2.152076,0.189653,-0.005396
LTC,0.980701,-0.041193,0.002177
DASH,-1.353609,-0.384371,-0.006037
XMR,2.14739,0.287987,0.02076
ETC,2.152076,0.189653,-0.005396


# Clustering Cryptocurrencies Using K-means

In [90]:
# Create an elbow curve to find the best value for K, and use the pcs_df DataFrame
inertia_list = list()
k_value = list(range(1,11))

for k in k_value:
    k_model = KMeans(n_clusters=k, random_state=1)
    k_model.fit(pcs_df)
    inertia_list.append(k_model.inertia_)

elbow_df = pd.DataFrame({"K": k_value, "Inertia": inertia_list})

In [91]:
# Elbow Curve
elbow_df.hvplot.line(x = "K", y = "Inertia", xticks = k_value)

Looking at the elbow curve, the line shifts to a horizontal line at point 4, thus K=4 is the best estimate number of cluster for the KMeans model.

In [92]:
# Run the K-means algorithm to predict the K clusters for the cryptocurrencies’ data
model = KMeans(n_clusters=4, random_state=1)
predictions = model.fit_predict(pcs_df)

In [93]:
# Create a new DataFrame named “clustered_df,” that includes the following columns: 
# Algorithm, ProofType, TotalCoinsMined, TotalCoinSupply, PC 1, PC 2, PC 3, CoinName, and Class
clustered_df = cleaned_crypto_df.merge(pcs_df, on = "Unnamed: 0")
clustered_df = clustered_df.merge(coins_name, on = "Unnamed: 0")

clustered_df["Class"] = model.labels_

clustered_df.set_index("Unnamed: 0", drop = True, inplace = True)
clustered_df.head(10)

Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
42,Scrypt,PoW/PoS,41.99995,42.0,-1.303547,-0.406755,-0.003655,42 Coin,1
404,Scrypt,PoW/PoS,1055185000.0,532000000.0,-1.303547,-0.406755,-0.003655,404Coin,1
808,SHA-256,PoW/PoS,0.0,0.0,-0.776747,-0.249798,0.001913,808,1
1337,X13,PoW/PoS,29279420000.0,314159300000.0,-1.928004,-0.4627,0.001467,EliteCoin,1
BTC,SHA-256,PoW,17927180.0,21000000.0,1.5075,0.115764,0.007744,Bitcoin,0
ETH,Ethash,PoW,107684200.0,0.0,2.152076,0.189653,-0.005396,Ethereum,0
LTC,Scrypt,PoW,63039240.0,84000000.0,0.980701,-0.041193,0.002177,Litecoin,0
DASH,X11,PoW/PoS,9031294.0,22000000.0,-1.353609,-0.384371,-0.006037,Dash,1
XMR,CryptoNight-V7,PoW,17201140.0,0.0,2.14739,0.287987,0.02076,Monero,0
ETC,Ethash,PoW,113359700.0,210000000.0,2.152076,0.189653,-0.005396,Ethereum Classic,0


# Visualizing Results

In [97]:
# Create 3D scatter plot to plot the clusters using the clustered_df DataFrame
fig = px.scatter_3d(clustered_df, x= "PC 1", y="PC 2",z="PC 3",
                    color="Class", symbol="Class", hover_name="CoinName",
                    hover_data=["Algorithm"])
fig.update_layout(legend = {"x":0,"y":1})
fig.show()

In [98]:
# Create a hvplot table for all the current tradable cryptocurrencies
obj_table = clustered_df.hvplot.table(columns = ["CoinName", "Algorithm", 
                                    "ProofType", "TotalCoinSupply", 
                                    "TotalCoinsMined", "Class"], width =500)

hvplot.show(obj_table)

Launching server at http://localhost:64261


In [100]:
# Create a scatter plot to present the clustered data about cryptocurrencies 
clustered_df.hvplot.scatter(x="TotalCoinsMined", y="TotalCoinSupply",
                                by = "Class", hover_cols = ["CoinName"])