# Clustering Crypto

In [25]:
# Initial imports
import pandas as pd
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


### Deliverable 1: Preprocessing the Data for PCA

In [62]:
# Load the crypto_data.csv dataset.
filepath = "Resources/crypto_data.csv"
crypto_df = pd.read_csv(filepath)
#set index as the Unnamed: 0 column
crypto_df = crypto_df.set_index('Unnamed: 0')
crypto_df

Unnamed: 0_level_0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
XBC,BitcoinPlus,Scrypt,True,PoS,1.283270e+05,1000000
DVTC,DivotyCoin,Scrypt,False,PoW/PoS,2.149121e+07,100000000
GIOT,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
OPSC,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000


In [63]:
# Keep all the cryptocurrencies that are being traded.
crypto_df = crypto_df[crypto_df.IsTrading == True]
crypto_df


Unnamed: 0_level_0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
SERO,Super Zero,Ethash,True,PoW,,1000000000
UOS,UOS,SHA-256,True,DPoI,,1000000000
BDX,Beldex,CryptoNight,True,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,True,PoW,7.296538e+06,21000000


In [64]:
# Keep all the cryptocurrencies that have a working algorithm.
crypto_df = crypto_df[crypto_df.Algorithm != "NaN"]
crypto_df

Unnamed: 0_level_0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
SERO,Super Zero,Ethash,True,PoW,,1000000000
UOS,UOS,SHA-256,True,DPoI,,1000000000
BDX,Beldex,CryptoNight,True,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,True,PoW,7.296538e+06,21000000


In [65]:
# Remove the "IsTrading" column. 
crypto_df = crypto_df.drop(['IsTrading'], axis=1)
crypto_df

Unnamed: 0_level_0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
365,365Coin,X11,PoW/PoS,,2300000000
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,PoW,,611000
808,808,SHA-256,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...
SERO,Super Zero,Ethash,PoW,,1000000000
UOS,UOS,SHA-256,DPoI,,1000000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [66]:
#find null values
for column in crypto_df.columns:
    #print what columns have null values
    print(f"Column {column} has {crypto_df[column].isnull().sum()} null values")

Column CoinName has 0 null values
Column Algorithm has 0 null values
Column ProofType has 0 null values
Column TotalCoinsMined has 459 null values
Column TotalCoinSupply has 0 null values


In [67]:
# Remove rows that have at least 1 null value.
crypto_df = crypto_df.dropna()
#print to check the 459 null values from prev step have been dropped
crypto_df

Unnamed: 0_level_0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
808,808,SHA-256,PoW/PoS,0.000000e+00,0
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
...,...,...,...,...,...
ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [68]:
# Keep the rows where coins are mined.
crypto_df = crypto_df[crypto_df['TotalCoinsMined'] > 1]
crypto_df

Unnamed: 0_level_0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethereum,Ethash,PoW,1.076842e+08,0
...,...,...,...,...,...
ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [69]:
# Create a new DataFrame that holds only the cryptocurrencies names.
#make a copy of the original df
crypto_names_df = crypto_df.copy()
#make new df only contain the coin name column
crypto_names_df = crypto_names_df[['CoinName']]
crypto_names_df

Unnamed: 0_level_0,CoinName
Unnamed: 0,Unnamed: 1_level_1
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum
...,...
ZEPH,ZEPHYR
GAP,Gapcoin
BDX,Beldex
ZEN,Horizen


In [70]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm.
crypto_df = crypto_df.drop(['CoinName'], axis=1)
#also dropping alternative/shortened coin name
crypto_df = crypto_df.drop(['Unnamed: 0'], axis=1)

KeyError: "['Unnamed: 0'] not found in axis"

In [None]:
# Use get_dummies() to create variables for text features.
X = pd.get_dummies(crypto_df, columns=['Algorithm','ProofType'])

In [71]:
# Standardize the data with StandardScaler().
scaler = StandardScaler()

#fit the scaler
X_scaled = scaler.fit_transform(X)


### Deliverable 2: Reducing Data Dimensions Using PCA

In [79]:
# Using PCA to reduce dimension to three principal components.
pca = PCA(n_components=3)
#transform the scaled data
crypto_pca = pca.fit_transform(X_scaled)
crypto_pca

array([[-0.33474894,  1.01449442, -0.4431323 ],
       [-0.31796679,  1.01473581, -0.44359014],
       [ 2.32557716,  1.67304854, -0.52303116],
       ...,
       [ 0.32238568, -2.30483766,  0.28197055],
       [-0.14303198, -2.04390382,  0.34908836],
       [-0.28173481,  0.81686481, -0.23220782]])

In [80]:
# Create a DataFrame with the three principal components.
pcs_df = pd.DataFrame(crypto_pca, columns=['PrincipleComponent1', 'PrincipleComponent2', 'PrincipleComponent3'], index=crypto_df.index)
pcs_df

Unnamed: 0_level_0,PrincipleComponent1,PrincipleComponent2,PrincipleComponent3
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
42,-0.334749,1.014494,-0.443132
404,-0.317967,1.014736,-0.443590
1337,2.325577,1.673049,-0.523031
BTC,-0.150845,-1.348159,0.146372
ETH,-0.145714,-2.120538,0.192225
...,...,...,...
ZEPH,2.380381,0.702725,0.200556
GAP,-0.332779,1.014390,-0.443183
BDX,0.322386,-2.304838,0.281971
ZEN,-0.143032,-2.043904,0.349088


### Deliverable 3: Clustering Crytocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [81]:
# Create an elbow curve to find the best value for K.
#create empty list to hold inertia values and a range for k 
inertia = []
k = list(range(1,11))

#loop through k values to find inertia
for i in k:
    #init KMeans with # cluster =i
    km=KMeans(n_clusters=i, random_state=0)
    #fit to the pca dataframe
    km.fit(pcs_df)
    #add that inertia to the list
    inertia.append(km.inertia_)

#plot the elbow graph
#create dictionary with kmeans values
elbow_crypto = {"k":k, "inertia":inertia}
#create a dataframe from the dictionary
elbow_crypto_df = pd.DataFrame(elbow_crypto)

#plot the elbow graph
elbow_crypto_df.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")



  "KMeans is known to have a memory leak on Windows "


Running K-Means with `k=4`

In [82]:
# Initialize the K-Means model.
    #using 4 clusters bc of results of elbow graph
km = KMeans(n_clusters=4, random_state=0)

# Fit the model
model = km.fit(pcs_df)

# Predict clusters
predictions = model.predict(pcs_df)

In [83]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features.
# Concatentate the crypto_df and pcs_df DataFrames on the same columns.
clustered_df = pd.concat([crypto_df, pcs_df], axis=1, join='inner')

#  Add a new column, "CoinName" to the clustered_df DataFrame that holds the names of the cryptocurrencies. 
clustered_df['CoinName'] = crypto_names_df['CoinName']

#  Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.
clustered_df['Class'] = predictions

# Print the shape of the clustered_df
print(clustered_df.shape)
clustered_df.head(10)

(532, 9)


Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PrincipleComponent1,PrincipleComponent2,PrincipleComponent3,CoinName,Class
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
42,Scrypt,PoW/PoS,41.99995,42,-0.334749,1.014494,-0.443132,42 Coin,3
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.317967,1.014736,-0.44359,404Coin,3
1337,X13,PoW/PoS,29279420000.0,314159265359,2.325577,1.673049,-0.523031,EliteCoin,3
BTC,SHA-256,PoW,17927180.0,21000000,-0.150845,-1.348159,0.146372,Bitcoin,0
ETH,Ethash,PoW,107684200.0,0,-0.145714,-2.120538,0.192225,Ethereum,0
LTC,Scrypt,PoW,63039240.0,84000000,-0.164225,-1.142675,-0.033003,Litecoin,0
DASH,X11,PoW/PoS,9031294.0,22000000,-0.395307,1.247046,-0.37053,Dash,3
XMR,CryptoNight-V7,PoW,17201140.0,0,-0.154279,-2.230357,0.241753,Monero,0
ETC,Ethash,PoW,113359700.0,210000000,-0.144144,-2.120629,0.192185,Ethereum Classic,0
ZEC,Equihash,PoW,7383056.0,21000000,-0.143031,-2.043904,0.349088,ZCash,0


### Deliverable 4: Visualizing Cryptocurrencies Results

#### 3D-Scatter with Clusters

In [None]:
# Creating a 3D-Scatter with the PCA data and the clusters
# YOUR CODE HERE


In [None]:
# Create a table with tradable cryptocurrencies.
# YOUR CODE HERE

In [None]:
# Print the total number of tradable cryptocurrencies.
# YOUR CODE HERE

In [None]:
# Scaling data to create the scatter plot with tradable cryptocurrencies.
# YOUR CODE HERE

In [None]:
# Create a new DataFrame that has the scaled data with the clustered_df DataFrame index.
# YOUR CODE HERE

# Add the "CoinName" column from the clustered_df DataFrame to the new DataFrame.
# YOUR CODE HERE

# Add the "Class" column from the clustered_df DataFrame to the new DataFrame. 
# YOUR CODE HERE

plot_df.head(10)

In [None]:
# Create a hvplot.scatter plot using x="TotalCoinsMined" and y="TotalCoinSupply".
# YOUR CODE HERE
