In [1]:
# Import the dependencies

import pandas as pd

In [2]:
# Load the data set

crypto_df = pd.read_csv("Resources/crypto_data.csv")
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [3]:
# Check the dimensions of the dataframe

crypto_df.shape

(1252, 7)

In [4]:
# Check the data types in the dataframe

crypto_df.dtypes

Unnamed: 0          object
CoinName            object
Algorithm           object
IsTrading             bool
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

### Preprocessing

In [5]:
# Rename the Unnamed: 0 columns to CoinCode
columns = ["CoinCode","CoinName","Algorithm","IsTrading","ProofType","TotalCoinsMined","TotalCoinSuppy"]
crypto_df.columns = columns
crypto_df.head()

Unnamed: 0,CoinCode,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSuppy
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


#### Remove all cryptocurrencies that are not on trading

In [6]:
# Retain the rows where IsTrading is True

crypto_df = crypto_df[crypto_df["IsTrading"] == True]

crypto_df["IsTrading"].unique()

array([ True])

In [7]:
# Show the first few rows of the data

crypto_df.head()

Unnamed: 0,CoinCode,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSuppy
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [8]:
# Recheck the dimensions of the dataframe

crypto_df.shape

(1144, 7)

#### Remove all cryptocurrencies that have not an algorithm defined

In [9]:
# Number of cryptocurrencies that have no algorithm defined

crypto_df[crypto_df.isna()].sum()

CoinCode           0.0
CoinName           0.0
Algorithm          0.0
IsTrading          0.0
ProofType          0.0
TotalCoinsMined    0.0
TotalCoinSuppy     0.0
dtype: float64

#### Remove the IsTrading column

In [10]:
# Show the columns before

crypto_df.head()

Unnamed: 0,CoinCode,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSuppy
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [11]:
# Remove the isTrading column

crypto_df.drop(columns=["IsTrading"],inplace=True)

In [12]:
# Show the columns after

crypto_df.head()

Unnamed: 0,CoinCode,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSuppy
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
1,365,365Coin,X11,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,PoW,,611000
4,808,808,SHA-256,PoW/PoS,0.0,0


#### Remove all cryptocurrencies with at least one null value.

In [13]:
# List the cryptocurrencies with at least one null value

crypto_df[crypto_df.isnull().any(axis=1)].shape

(459, 6)

In [14]:
# Drop all the rows with at least one null value

crypto_df.dropna(inplace=True)

crypto_df.shape

(685, 6)

#### Remove all cryptocurrencies without coins mined.

In [15]:
# Drop all rows where TotalCoinsMined is 0

crypto_df = crypto_df[crypto_df["TotalCoinsMined"] != 0]

crypto_df.head()

Unnamed: 0,CoinCode,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSuppy
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
5,1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
8,ETH,Ethereum,Ethash,PoW,107684200.0,0


In [16]:
# Recheck the dimensions of the dataframe

crypto_df.shape

(533, 6)

#### Store the names of all cryptocurrencies in a DataFrame named coins_name, using the crypto_df.index as the index

In [17]:
coins_name = pd.DataFrame(crypto_df["CoinName"],index=crypto_df.index)

#### Remove the CoinName column.

In [18]:
crypto_df.drop(columns=["CoinName"],inplace=True)
crypto_df.head()

Unnamed: 0,CoinCode,Algorithm,ProofType,TotalCoinsMined,TotalCoinSuppy
0,42,Scrypt,PoW/PoS,41.99995,42
2,404,Scrypt,PoW/PoS,1055185000.0,532000000
5,1337,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,SHA-256,PoW,17927180.0,21000000
8,ETH,Ethash,PoW,107684200.0,0


#### Create dummies variables for all the text features, store the resulting data on a DataFrame named X

In [19]:
X = pd.get_dummies(crypto_df,columns=["CoinCode","Algorithm","ProofType"])
X.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSuppy,CoinCode_1337,CoinCode_1CR,CoinCode_404,CoinCode_42,CoinCode_8BIT,CoinCode_AAC,CoinCode_ABJ,CoinCode_ABS,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
0,41.99995,42,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1055185000.0,532000000,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,29279420000.0,314159265359,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,17927180.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,107684200.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Use the StandardScaler from sklearn to standardize all the data of the X DataFrame

In [20]:
from sklearn.preprocessing import StandardScaler

X = StandardScaler().fit_transform(X)