# Import the Libraries and Load the Data

In [1]:
# Initial Imports
import pandas as pd
from pandas import DataFrame
from pathlib import Path
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# Load the data
file_path = Path('./data/crypto_data.csv')
crypto_df: DataFrame = pd.read_csv(file_path, index_col=0)
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0


# Data Preprocessing

## Quick Look at the DataFrame

In [3]:
# Return a tuple representing the dimensionality of the DataFrame
crypto_df.shape

(1252, 6)

In [4]:
# The column labels of the DataFrame
crypto_df.columns

Index(['CoinName', 'Algorithm', 'IsTrading', 'ProofType', 'TotalCoinsMined',
       'TotalCoinSupply'],
      dtype='object')

In [5]:
# Confirm the datatypes
crypto_df.dtypes

CoinName            object
Algorithm           object
IsTrading             bool
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

## Clean the DataFrame

In [6]:
# Count all IsTrading unique values
crypto_df["IsTrading"].value_counts()

True     1144
False     108
Name: IsTrading, dtype: int64

In [7]:
# Remove all cryptocurrencies that aren’t trading
crypto_df = crypto_df.drop(crypto_df[crypto_df["IsTrading"] == False].index)
    # Alternative code: crypto_df = crypto_df[crypto_df["IsTrading"] == True]

# Display the updated dimensionality of the DataFrame
crypto_df.shape

(1144, 6)

In [8]:
# Count all Algorithm unique values
print("Algorithm - Unique Value Counts:")
print(crypto_df["Algorithm"].value_counts())

# Display all the Algorithm names
print("----------")
print("Algorithm - Names:")
print(crypto_df["Algorithm"].unique())

Algorithm - Unique Value Counts:
Scrypt            394
X11               182
SHA-256           121
X13                54
PoS                42
                 ... 
XG Hash             1
Exosis              1
QuarkTX             1
SHA-256 + Hive      1
QUAIT               1
Name: Algorithm, Length: 89, dtype: int64
----------
Algorithm - Names:
['Scrypt' 'X11' 'SHA-256' 'X13' 'Ethash' 'CryptoNight-V7' 'Equihash'
 'SHA-512' 'Multiple' 'X15' 'NIST5' 'Quark' 'Groestl' 'PoS' 'NeoScrypt'
 'SHA3' 'HybridScryptHash256' 'Scrypt-n' 'PHI1612' 'Lyra2REv2'
 'CryptoNight' 'Shabal256' 'Counterparty' 'Blake' 'Momentum'
 'Stanford Folding' 'QuBit' 'XG Hash' 'M7 POW' 'Curve25519' 'Lyra2RE'
 'QUAIT' 'vDPOS' 'Blake2b' 'BLAKE256' '1GB AES Pattern Search' 'Dagger'
 'CryptoNight-Lite' 'X11GOST' 'SHA-256D' 'POS 3.0' 'Progressive-n' 'DPoS'
 'Lyra2Z' 'X14' 'Time Travel' 'Argon2' 'Keccak' 'Blake2S'
 'Dagger-Hashimoto' '536' 'Argon2d' 'Cloverhash' 'Skein'
 'SkunkHash v2 Raptor' 'VeChainThor Authority' 'Ouroboros

In [9]:
# Remove all cryptocurrencies that don’t have an algorithm defined

# Drop any null values
crypto_df["Algorithm"] = crypto_df["Algorithm"].dropna(axis=0)

# Drop Algorithm defined as "Multiple"
crypto_df = crypto_df.drop(crypto_df[crypto_df["Algorithm"] == "Multiple"].index)

# Display the DataFrame
crypto_df

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
SERO,Super Zero,Ethash,True,PoW,,1000000000
UOS,UOS,SHA-256,True,DPoI,,1000000000
BDX,Beldex,CryptoNight,True,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,True,PoW,7.296538e+06,21000000


In [10]:
# Remove the IsTrading column
crypto_df.drop('IsTrading', axis=1, inplace=True)

# Display the DataFrame
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
365,365Coin,X11,PoW/PoS,,2300000000
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,PoW,,611000
808,808,SHA-256,PoW/PoS,0.0,0


In [11]:
# Find null values
for column in crypto_df.columns:
    print(f"Column {column} has {crypto_df[column].isnull().sum()} null values")

Column CoinName has 0 null values
Column Algorithm has 0 null values
Column ProofType has 0 null values
Column TotalCoinsMined has 452 null values
Column TotalCoinSupply has 0 null values


In [12]:
# Remove all cryptocurrencies with at least one null value
crypto_df = crypto_df.dropna(axis=0)

# Display the DataFrame
crypto_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
808,808,SHA-256,PoW/PoS,0.000000e+00,0
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,3.14159E+11
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
...,...,...,...,...,...
ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [13]:
# Confirm NO null values (Detect the non-missing values)
for column in crypto_df.columns:
    print(f"Column {column} has {crypto_df[column].notnull().sum()} non-missing values")

Column CoinName has 674 non-missing values
Column Algorithm has 674 non-missing values
Column ProofType has 674 non-missing values
Column TotalCoinsMined has 674 non-missing values
Column TotalCoinSupply has 674 non-missing values


In [14]:
# Remove all cryptocurrencies without coins mined
crypto_df = crypto_df.drop(crypto_df[crypto_df["TotalCoinsMined"] <= 0].index)

# Display the DataFrame
crypto_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,3.14159E+11
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethereum,Ethash,PoW,1.076842e+08,0
...,...,...,...,...,...
ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [15]:
# Store the names of all cryptocurrencies on a DataFramed named coins_name
coins_name_list = crypto_df["CoinName"]
coins_name: DataFrame = pd.DataFrame(coins_name_list)

# and use the crypto_df.index as the index for this new DataFrame
coins_name.set_index(crypto_df.index)

# Display the DataFrame
coins_name

Unnamed: 0,CoinName
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum
...,...
ZEPH,ZEPHYR
GAP,Gapcoin
BDX,Beldex
ZEN,Horizen


In [16]:
# Remove the CoinName column
crypto_df.drop('CoinName', axis=1, inplace=True)

# Display the DataFrame
crypto_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,41.99995,42.0
404,Scrypt,PoW/PoS,1055185000.0,532000000.0
1337,X13,PoW/PoS,29279420000.0,314159000000.0
BTC,SHA-256,PoW,17927180.0,21000000.0
ETH,Ethash,PoW,107684200.0,0.0


## Select the Data for the Features

In [17]:
# Create dummies variables for all of the text features
features_df: DataFrame =  crypto_df[['Algorithm', 'ProofType', 'TotalCoinsMined', 'TotalCoinSupply']].copy()

# Display the DataFrame
features_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,41.99995,42.0
404,Scrypt,PoW/PoS,1055185000.0,532000000.0
1337,X13,PoW/PoS,29279420000.0,314159000000.0
BTC,SHA-256,PoW,17927180.0,21000000.0
ETH,Ethash,PoW,107684200.0,0.0


In [18]:
# Display Datatype
features_df["Algorithm"].apply(type)

42      <class 'str'>
404     <class 'str'>
1337    <class 'str'>
BTC     <class 'str'>
ETH     <class 'str'>
            ...      
ZEPH    <class 'str'>
GAP     <class 'str'>
BDX     <class 'str'>
ZEN     <class 'str'>
XBC     <class 'str'>
Name: Algorithm, Length: 523, dtype: object

In [19]:
# Count the unique values
features_df["Algorithm"].value_counts()

Scrypt                 182
X11                     73
SHA-256                 48
CryptoNight             19
X13                     17
                      ... 
Time Travel              1
Counterparty             1
Semux BFT consensus      1
HybridScryptHash256      1
SkunkHash                1
Name: Algorithm, Length: 70, dtype: int64

In [20]:
# Convert the algorithm value counts into a DataFrame in order to look at the distribution of algorithm types
algorithm_df: DataFrame = features_df["Algorithm"].value_counts()

# Display the DataFrame
algorithm_df.head(20)

Scrypt            182
X11                73
SHA-256            48
CryptoNight        19
X13                17
PoS                17
Quark              13
Equihash           12
NIST5              10
NeoScrypt          10
Ethash              9
DPoS                7
XEVAN               6
CryptoNight-V7      5
Lyra2Z              4
X15                 4
Lyra2REv2           4
Groestl             4
X16R                4
SHA-256D            4
Name: Algorithm, dtype: int64

In [21]:
# Bin the Algorithms
def binAlgorithm(algorithmBinName: str) -> str:
    if algorithmBinName in ['Scrypt', 'X11', 'SHA-256']:
        return algorithmBinName
    else:
        return "Other"

In [22]:
# Apply the binAlgorithm function
features_df["Algorithm"] = features_df["Algorithm"].apply(binAlgorithm)

In [23]:
# Display the updated value counts 
features_df["Algorithm"].value_counts()

Other      220
Scrypt     182
X11         73
SHA-256     48
Name: Algorithm, dtype: int64

In [24]:
# Display Datatype
features_df["ProofType"].apply(type)

42      <class 'str'>
404     <class 'str'>
1337    <class 'str'>
BTC     <class 'str'>
ETH     <class 'str'>
            ...      
ZEPH    <class 'str'>
GAP     <class 'str'>
BDX     <class 'str'>
ZEN     <class 'str'>
XBC     <class 'str'>
Name: ProofType, Length: 523, dtype: object

In [25]:
# Count the unique values
features_df["ProofType"].value_counts()

PoW                     230
PoW/PoS                 176
PoS                      84
DPoS                      9
PoC                       3
PoS/PoW                   2
PoW/PoS                   1
PoA                       1
PoST                      1
Pos                       1
Zero-Knowledge Proof      1
PoS/LPoS                  1
PoW and PoS               1
HPoW                      1
POBh                      1
dPoW/PoW                  1
Proof of Trust            1
LPoS                      1
PoS/PoW/PoT               1
PoW + Hive                1
TPoS                      1
PoW/nPoS                  1
PoW/PoW                   1
Proof of Authority        1
DPOS                      1
Name: ProofType, dtype: int64

In [26]:
# Bin the ProofType
def binProofType(proofTypeBinName: str) -> str:
    if proofTypeBinName in ['PoW', 'PoW/PoS', 'PoS']:
        return proofTypeBinName
    else:
        return "Other"

In [27]:
# Apply the binAlgorithm function
features_df["ProofType"] = features_df["ProofType"].apply(binProofType)

In [28]:
# Display the updated value counts 
features_df["ProofType"].value_counts()

PoW        230
PoW/PoS    176
PoS         84
Other       33
Name: ProofType, dtype: int64

In [29]:
# Display Datatype
features_df["TotalCoinsMined"].apply(type)

42      <class 'float'>
404     <class 'float'>
1337    <class 'float'>
BTC     <class 'float'>
ETH     <class 'float'>
             ...       
ZEPH    <class 'float'>
GAP     <class 'float'>
BDX     <class 'float'>
ZEN     <class 'float'>
XBC     <class 'float'>
Name: TotalCoinsMined, Length: 523, dtype: object

In [30]:
# Display Datatype
features_df["TotalCoinSupply"].apply(type)

42      <class 'str'>
404     <class 'str'>
1337    <class 'str'>
BTC     <class 'str'>
ETH     <class 'str'>
            ...      
ZEPH    <class 'str'>
GAP     <class 'str'>
BDX     <class 'str'>
ZEN     <class 'str'>
XBC     <class 'str'>
Name: TotalCoinSupply, Length: 523, dtype: object

In [31]:
# Convert string to numeric
features_df['TotalCoinSupply'] = pd.to_numeric(features_df['TotalCoinSupply'])

In [32]:
# Display Datatype
features_df["TotalCoinSupply"].apply(type)

42      <class 'float'>
404     <class 'float'>
1337    <class 'float'>
BTC     <class 'float'>
ETH     <class 'float'>
             ...       
ZEPH    <class 'float'>
GAP     <class 'float'>
BDX     <class 'float'>
ZEN     <class 'float'>
XBC     <class 'float'>
Name: TotalCoinSupply, Length: 523, dtype: object

In [33]:
# store the resulting data on a DataFrame named X
X = features_df.copy()

# Display the DataFrame
X

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,4.199995e+01,4.200000e+01
404,Scrypt,PoW/PoS,1.055185e+09,5.320000e+08
1337,Other,PoW/PoS,2.927942e+10,3.141590e+11
BTC,SHA-256,PoW,1.792718e+07,2.100000e+07
ETH,Other,PoW,1.076842e+08,0.000000e+00
...,...,...,...,...
ZEPH,SHA-256,Other,2.000000e+09,2.000000e+09
GAP,Scrypt,PoW/PoS,1.493105e+07,2.500000e+08
BDX,Other,PoW,9.802226e+08,1.400223e+09
ZEN,Other,PoW,7.296538e+06,2.100000e+07


## Standardize all of the data from the X DataFrame

In [34]:
# Encode target labels with value between 0 and n_classes-1
X['Algorithm'] = LabelEncoder().fit_transform(X['Algorithm'])

In [35]:
# Encode target labels with value between 0 and n_classes-1
X['ProofType'] = LabelEncoder().fit_transform(X['ProofType'])

In [36]:
# Display X
X

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,2,3,4.199995e+01,4.200000e+01
404,2,3,1.055185e+09,5.320000e+08
1337,0,3,2.927942e+10,3.141590e+11
BTC,1,2,1.792718e+07,2.100000e+07
ETH,0,2,1.076842e+08,0.000000e+00
...,...,...,...,...
ZEPH,1,0,2.000000e+09,2.000000e+09
GAP,2,3,1.493105e+07,2.500000e+08
BDX,0,2,9.802226e+08,1.400223e+09
ZEN,0,2,7.296538e+06,2.100000e+07


In [37]:
# Use the StandardScaler from sklearn to standardize all of the data from the X DataFrame
X_scaled = StandardScaler().fit_transform(X)

# Display the DataFrame
X_scaled

array([[ 0.7001809 ,  1.09946066, -0.11684969, -0.153043  ],
       [ 0.7001809 ,  1.09946066, -0.09390435, -0.14524715],
       [-1.0646124 ,  1.09946066,  0.51984097,  4.45059729],
       ...,
       [-1.0646124 , -0.05751706, -0.09553443, -0.13252434],
       [-1.0646124 , -0.05751706, -0.11669102, -0.15273527],
       [ 0.7001809 , -1.21449477, -0.1168469 , -0.15302835]])