# Clustering Crypto

In [74]:
# Initial imports
import pandas as pd
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


### Deliverable 1: Preprocessing the Data for PCA

In [75]:
# Read the raw cryptocurrency table from the project's resources folder.
file = Path('resources/crypto_data.csv')
crypto_df = pd.read_csv(filepath_or_buffer=file)
# Preview the first ten records.
crypto_df.head(n=10)

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0
5,1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
6,2015,2015 coin,X11,True,PoW/PoS,,0
7,BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
8,ETH,Ethereum,Ethash,True,PoW,107684200.0,0
9,LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


###  Explore Data

In [76]:
# Show the dtype pandas inferred for every column. In the recorded output
# TotalCoinSupply comes back as object (text) rather than a numeric dtype.
crypto_df.dtypes

Unnamed: 0          object
CoinName            object
Algorithm           object
IsTrading             bool
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [77]:
# Check for duplicates
# Counts the rows flagged by duplicated(), i.e. rows that repeat an earlier row.
crypto_df.duplicated().sum()

0

In [78]:
# Give the unnamed first column an empty label and promote it to the index,
# matching the reference screenshots.
crypto_df = (
    crypto_df
    .rename(columns={'Unnamed: 0': ''})
    .set_index('')
)

crypto_df.head(10)

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
,,,,,,
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42.0
365,365Coin,X11,True,PoW/PoS,,2300000000.0
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000.0
611,SixEleven,SHA-256,True,PoW,,611000.0
808,808,SHA-256,True,PoW/PoS,0.0,0.0
1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359.0
2015,2015 coin,X11,True,PoW/PoS,,0.0
BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000.0
ETH,Ethereum,Ethash,True,PoW,107684200.0,0.0


In [79]:
# Keep all the cryptocurrencies that are being traded.
# 'IsTrading' is already a boolean column, so it serves directly as the row
# mask (no comparison with True needed). .copy() yields an independent frame so
# that later cells can modify it without any link back to crypto_df.
trading_crypto_df = crypto_df.loc[crypto_df['IsTrading']].copy()
trading_crypto_df

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
,,,,,,
42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
SERO,Super Zero,Ethash,True,PoW,,1000000000
UOS,UOS,SHA-256,True,DPoI,,1000000000
BDX,Beldex,CryptoNight,True,PoW,9.802226e+08,1400222610


In [80]:
# Remove the "IsTrading" column.
# drop() builds a new frame instead of deleting in place, and errors='ignore'
# keeps the cell re-runnable once the column has already been removed.
trading_crypto_df = trading_crypto_df.drop(columns=['IsTrading'], errors='ignore')
trading_crypto_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
,,,,,
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
365,365Coin,X11,PoW/PoS,,2300000000
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,PoW,,611000
808,808,SHA-256,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...
SERO,Super Zero,Ethash,PoW,,1000000000
UOS,UOS,SHA-256,DPoI,,1000000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610


In [81]:
# Discard every row that has a missing value in any column
# (dropna() defaults to how='any' along the row axis).
trading_crypto_df = trading_crypto_df.dropna()
# Per-column non-null counts show how many rows remain.
trading_crypto_df.count()

CoinName           685
Algorithm          685
ProofType          685
TotalCoinsMined    685
TotalCoinSupply    685
dtype: int64

In [82]:
# Retain only the coins whose mined total is strictly positive.
has_mined_coins = trading_crypto_df['TotalCoinsMined'].gt(0)
trading_crypto_df = trading_crypto_df.loc[has_mined_coins]
trading_crypto_df.count()

CoinName           532
Algorithm          532
ProofType          532
TotalCoinsMined    532
TotalCoinSupply    532
dtype: int64

In [83]:
# Set the coin names aside in their own single-column DataFrame (same index)
# so they can be re-attached after clustering.
crypto_names_df = trading_crypto_df.loc[:, ['CoinName']]
crypto_names_df.head()

Unnamed: 0,CoinName
,
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum


In [84]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm.
# drop() builds a new frame instead of deleting in place, and errors='ignore'
# keeps the cell re-runnable once the column has already been removed.
trading_crypto_df = trading_crypto_df.drop(columns=['CoinName'], errors='ignore')
trading_crypto_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
,,,,
42,Scrypt,PoW/PoS,41.99995,42.0
404,Scrypt,PoW/PoS,1055185000.0,532000000.0
1337,X13,PoW/PoS,29279420000.0,314159265359.0
BTC,SHA-256,PoW,17927180.0,21000000.0
ETH,Ethash,PoW,107684200.0,0.0


In [85]:
# Use get_dummies() to create variables for text features.
# TotalCoinSupply is loaded as text (object dtype), so cast it to float on a
# working copy first; otherwise the feature matrix carries a string column and
# the scaler has to coerce it implicitly. trading_crypto_df itself is left
# unchanged.
numeric_features_df = trading_crypto_df.assign(
    TotalCoinSupply=trading_crypto_df['TotalCoinSupply'].astype(float)
)
X = pd.get_dummies(numeric_features_df, columns=['Algorithm', 'ProofType'])
X

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
,,,,,,,,,,,,,,,,,,,,,
42,4.199995e+01,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1.055185e+09,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,2.927942e+10,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,1.792718e+07,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,1.076842e+08,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZEPH,2.000000e+09,2000000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GAP,1.493105e+07,250000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BDX,9.802226e+08,1400222610,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [86]:
# Fit a StandardScaler on the encoded features and apply it to the same data.
X_scaled = StandardScaler().fit(X).transform(X)
print(X_scaled)

[[-0.11710817 -0.1528703  -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [-0.09396955 -0.145009   -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [ 0.52494561  4.48942416 -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 ...
 [-0.09561336 -0.13217937 -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [-0.11694817 -0.15255998 -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [-0.11710536 -0.15285552 -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]]


### Deliverable 2: Reducing Data Dimensions Using PCA

In [87]:
# Using PCA to reduce dimension to three principal components.
# random_state is pinned because, for inputs with more than 500 rows, the
# default svd_solver='auto' can select the randomized SVD solver, which would
# otherwise let the components vary slightly between runs. The seed matches the
# random_state=0 used for K-Means in this notebook.
pca = PCA(n_components=3, random_state=0)
X_pca = pca.fit_transform(X_scaled)
X_pca

array([[-0.33110883,  0.96663609, -0.57437474],
       [-0.31446701,  0.96695272, -0.574769  ],
       [ 2.30350202,  1.71104903, -0.68962661],
       ...,
       [ 0.32316363, -2.25582724,  0.38646395],
       [-0.16018247, -2.06490677,  0.42717687],
       [-0.3046009 ,  0.72581359, -0.28200247]])

In [88]:
# Wrap the PCA output array in a DataFrame that reuses the coin index, with one
# column per principal component.
pc_labels = ["PC 1", "PC 2", "PC 3"]
reduced_dim_df = pd.DataFrame(X_pca, index=trading_crypto_df.index, columns=pc_labels)

reduced_dim_df.head()

Unnamed: 0,PC 1,PC 2,PC 3
,,,
42,-0.331109,0.966636,-0.574375
404,-0.314467,0.966953,-0.574769
1337,2.303502,1.711049,-0.689627
BTC,-0.139564,-1.313799,0.208151
ETH,-0.143149,-2.009129,0.376493


### Deliverable 3: Clustering Cryptocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [89]:
# Fit K-Means for each k from 1 to 10 and record the inertia of every fit,
# then plot inertia against k to look for the elbow.
k = list(range(1, 11))
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(reduced_dim_df).inertia_
    for n_clusters in k
]

elbow_data = {'k': k, 'inertia': inertia}
elbow_data_df = pd.DataFrame(elbow_data)
elbow_data_df.hvplot.line(x='k', y='inertia', title='Elbow Curve', xticks=k)

#### Running K-Means with `k=4`

In [90]:
# Build a seeded four-cluster K-Means model and fit it on the principal
# components (fit() returns the fitted estimator itself).
model = KMeans(n_clusters=4, random_state=0).fit(reduced_dim_df)

# Assign every coin to its nearest cluster centre.
predictions = model.predict(reduced_dim_df)

predictions

array([0, 0, 0, 3, 3, 3, 0, 3, 3, 3, 0, 3, 0, 0, 3, 0, 3, 3, 0, 0, 3, 3,
       3, 3, 3, 0, 3, 3, 3, 0, 3, 0, 3, 3, 0, 0, 3, 3, 3, 3, 3, 3, 0, 0,
       3, 3, 3, 3, 3, 0, 0, 3, 0, 3, 3, 3, 3, 0, 3, 3, 0, 3, 0, 0, 0, 3,
       3, 3, 0, 0, 0, 0, 0, 3, 3, 3, 0, 0, 3, 0, 3, 0, 0, 3, 3, 3, 3, 0,
       0, 3, 0, 3, 3, 0, 0, 3, 0, 0, 3, 3, 0, 0, 3, 0, 0, 3, 0, 3, 0, 3,
       0, 3, 0, 0, 3, 3, 0, 3, 3, 3, 0, 3, 3, 3, 3, 3, 0, 0, 3, 3, 3, 0,
       3, 0, 3, 3, 0, 3, 0, 3, 0, 0, 3, 3, 0, 3, 3, 0, 0, 3, 0, 3, 0, 0,
       0, 3, 3, 3, 3, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0,
       0, 3, 0, 3, 0, 0, 3, 0, 3, 0, 0, 3, 0, 3, 0, 3, 0, 3, 0, 0, 0, 0,
       3, 0, 0, 0, 0, 0, 3, 3, 0, 0, 3, 3, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0,
       0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 3, 3, 3, 0, 0, 0, 0, 3, 0, 3, 0,
       0, 3, 0, 3, 3, 0, 3, 3, 0, 3, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 0, 0,
       0, 0, 3, 0, 3, 0, 0, 0, 0, 3, 0, 3, 0, 3, 3, 3, 3, 0, 3, 0, 0, 3,
       0, 3, 3, 3, 0, 3, 0, 3, 3, 3, 0, 3, 0, 3, 0,

In [91]:
# Build one DataFrame holding the coin features, the principal components,
# the coin names and the predicted cluster of every coin.
# 1. Place trading_crypto_df and reduced_dim_df side by side (they share the
#    same index).
# 2. Append "CoinName" from crypto_names_df, then "Class" from the K-Means
#    predictions, in that column order.
clustered_df = pd.concat([trading_crypto_df, reduced_dim_df], axis=1).assign(
    CoinName=crypto_names_df['CoinName'],
    Class=predictions,
)

# Report the shape of clustered_df and preview it.
print(clustered_df.shape)
clustered_df.head(10)

(532, 9)


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
,,,,,,,,,
42,Scrypt,PoW/PoS,41.99995,42.0,-0.331109,0.966636,-0.574375,42 Coin,0.0
404,Scrypt,PoW/PoS,1055185000.0,532000000.0,-0.314467,0.966953,-0.574769,404Coin,0.0
1337,X13,PoW/PoS,29279420000.0,314159265359.0,2.303502,1.711049,-0.689627,EliteCoin,0.0
BTC,SHA-256,PoW,17927180.0,21000000.0,-0.139564,-1.313799,0.208151,Bitcoin,3.0
ETH,Ethash,PoW,107684200.0,0.0,-0.143149,-2.009129,0.376493,Ethereum,3.0
LTC,Scrypt,PoW,63039240.0,84000000.0,-0.16602,-1.196263,-0.011892,Litecoin,3.0
DASH,X11,PoW/PoS,9031294.0,22000000.0,-0.391961,1.361125,-0.559093,Dash,0.0
XMR,CryptoNight-V7,PoW,17201140.0,0.0,-0.154399,-2.215772,0.424774,Monero,3.0
ETC,Ethash,PoW,113359700.0,210000000.0,-0.141592,-2.009207,0.376471,Ethereum Classic,3.0


### Deliverable 4: Visualizing Cryptocurrencies Results

#### 3D-Scatter with Clusters

In [92]:
# Plot every coin in principal-component space, coloured and shaped by its
# cluster, with the coin name and algorithm shown on hover.
scatter_options = dict(
    x='PC 1',
    y='PC 2',
    z='PC 3',
    color='Class',
    symbol='Class',
    width=800,
    hover_name='CoinName',
    hover_data=['Algorithm'],
)
fig = px.scatter_3d(clustered_df, **scatter_options)

# Anchor the legend at the top-left corner of the figure.
fig.update_layout(legend={'x': 0, 'y': 1})
fig.show()

In [93]:
# Create a table with tradable cryptocurrencies.
# Renders clustered_df as an hvplot table with sorting and row selection enabled.
clustered_df.hvplot.table(sortable=True, selectable=True)

In [94]:
# Print the total number of tradable cryptocurrencies
# (one row of clustered_df per coin, so the index length is the count).
tradable_count = len(clustered_df.index)
print(f"There are {tradable_count} tradable cryptocurrencies. ")

There are 532 tradable cryptocurrencies. 


In [95]:
# Rescale the supply and mined totals with MinMaxScaler so both axes of the
# scatter plot are comparable.
supply_and_mined = clustered_df[['TotalCoinSupply', 'TotalCoinsMined']]
minmax_scaler = MinMaxScaler()
scaled_clustered = minmax_scaler.fit(supply_and_mined).transform(supply_and_mined)
type(scaled_clustered)


numpy.ndarray

In [96]:
# Wrap the scaled array in a DataFrame that reuses the clustered_df index, then
# carry over the "CoinName" and "Class" columns from clustered_df.
scaled_clustered_df = pd.DataFrame(
    scaled_clustered,
    index=clustered_df.index,
    columns=['TotalCoinSupply', 'TotalCoinsMined'],
).assign(
    CoinName=clustered_df["CoinName"],
    Class=clustered_df["Class"],
)

scaled_clustered_df.head(10)

Unnamed: 0,TotalCoinSupply,TotalCoinsMined,CoinName,Class
,,,,
42,4.2e-11,0.0,42 Coin,0.0
404,0.000532,0.001066,404Coin,0.0
1337,0.3141593,0.029576,EliteCoin,0.0
BTC,2.1e-05,1.8e-05,Bitcoin,3.0
ETH,0.0,0.000109,Ethereum,3.0
LTC,8.4e-05,6.4e-05,Litecoin,3.0
DASH,2.2e-05,9e-06,Dash,0.0
XMR,0.0,1.7e-05,Monero,3.0
ETC,0.00021,0.000115,Ethereum Classic,3.0


In [97]:
# Scatter the scaled mined total (x) against the scaled total supply (y),
# one series per cluster, with the coin name shown on hover.
scaled_clustered_df.hvplot.scatter(
    x="TotalCoinsMined",
    y="TotalCoinSupply",
    by="Class",
    hover_cols=["CoinName"],
)