# Clustering Crypto

In [None]:
# Initial imports
import pandas as pd
import hvplot.pandas
# from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

### Deliverable 1: Preprocessing the Data for PCA

In [None]:
# Load the crypto_data.csv dataset.
def create_dataframe(path_name, file_name):
    """Read a CSV file into a DataFrame, using its first column as the index.

    Args:
        path_name: Directory that contains the file.
        file_name: Name of the CSV file inside ``path_name``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with ``index_col=0``.
    """
    # Imported locally so the helper does not rely on a notebook-level import.
    from pathlib import Path

    csv_path = Path(path_name) / file_name
    return pd.read_csv(csv_path, index_col=0)

def drop_column(df, col_name):
    """Return a new DataFrame with the column ``col_name`` removed.

    The input DataFrame is left untouched; ``DataFrame.drop`` raises
    ``KeyError`` when the column does not exist.
    """
    return df.drop(columns=[col_name])

def data_needs_cleaning(df):
    """Report whether ``df`` contains at least one missing value.

    Args:
        df: The DataFrame to inspect.

    Returns:
        ``True`` if any cell of ``df`` is null/NaN, ``False`` otherwise
        (including for an empty DataFrame).
    """
    # Inspect the DataFrame that was passed in, not a notebook-level global,
    # and always hand back a real bool rather than an implicit None.
    return bool(df.isna().values.any())

def drop_null_entries(df):
    """Return a copy of ``df`` without the rows that hold any missing value."""
    return df.dropna(axis=0, how="any")

def drop_crypto_not_mined(df, col_name):
    """Return the rows of ``df`` whose ``col_name`` value is not zero.

    Args:
        df: DataFrame to filter.
        col_name: Name of the column holding the mined-coin totals.

    Returns:
        A new DataFrame without the rows where ``df[col_name] == 0``.
        The DataFrame passed in is not modified.
    """
    # Build the mask from the argument itself (not a notebook-level global)
    # and return a copy instead of dropping rows in place on the caller's data.
    return df.loc[df[col_name] != 0].copy()

def show_dataframe_shape(df):
    """Print the number of rows and columns of ``df``.

    Two lines are written to standard output, followed by a blank line.
    """
    dims = df.shape
    print(f"rows in dataset: {dims[0]}")
    print(f"columns in dataset: {dims[1]}\n")

In [None]:
# Read the raw cryptocurrency dataset; its first column becomes the index.
crypto_df = create_dataframe('../resources/', 'crypto_data.csv')

# Keep only the coins flagged as trading, then discard the flag column.
crypto_df = crypto_df.loc[crypto_df['IsTrading'] == True]
crypto_df = drop_column(crypto_df, 'IsTrading')

# Remove rows with missing values, but only when there are any.
if data_needs_cleaning(crypto_df):
    crypto_df = drop_null_entries(crypto_df)

# Discard the coins whose mined total is zero.
crypto_df = drop_crypto_not_mined(crypto_df, 'TotalCoinsMined')

# Discard the coins that have no algorithm recorded.
crypto_df = crypto_df.dropna(subset=['Algorithm'], axis=0)

# Keep the coin names in their own single-column DataFrame for later joins.
coins_df = crypto_df['CoinName'].to_frame()

# The names are labels, not features, so they are left out of the model input.
n_crypto_df = drop_column(crypto_df, 'CoinName')

# One-hot encode the two text features, Algorithm and ProofType, into X.
X = pd.get_dummies(data=n_crypto_df, columns=["Algorithm", "ProofType"])
display(X)

# Standardize every feature with StandardScaler().
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
display(X_scaled)


### Deliverable 2: Reducing Data Dimensions Using PCA

In [None]:
# Using PCA to reduce dimension to three principal components.
# PCA is variance-based, so it is fitted on the standardized matrix X_scaled
# rather than the raw X; otherwise the large-magnitude columns would dominate
# the components and the scaling step above would have no effect.
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_scaled)
X_pca

In [None]:
# Wrap the three principal components in a DataFrame that shares crypto_df's index.
pc_columns = ['PC 1', 'PC 2', 'PC 3']
pcs_df = pd.DataFrame(X_pca, index=crypto_df.index, columns=pc_columns)
display(pcs_df.shape)
display(pcs_df)

### Deliverable 3: Clustering Cryptocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [None]:
# Fit K-Means for k = 1..10 and record the inertia of each fitted model.
k = [*range(1, 11)]
inertia = []
for i in k:
    k_means = KMeans(random_state=0, n_clusters=i).fit(pcs_df)
    inertia.append(k_means.inertia_)

# Plot inertia against k to build the elbow curve.
e_curv_data = dict(k=k, inertia=inertia)
e_curv_df = pd.DataFrame(e_curv_data)
e_curv_df.hvplot.line(title='Delivery 2 - Elbow Curve', x='k', y='inertia', xticks=k)

Running K-Means with `k=4`

In [None]:
# Build a four-cluster K-Means model and train it on the principal components.
model = KMeans(random_state=4, n_clusters=4).fit(pcs_df)

# Assign every row of pcs_df to a cluster.
predictions = model.predict(pcs_df)
predictions

In [None]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features.
# Start from an explicit copy: pd.DataFrame(crypto_df, ...) does not copy
# DataFrame input by default, so the column assignments below could otherwise
# affect crypto_df as well.
clustered_df = crypto_df.copy()

# Concatenate the crypto_df and pcs_df DataFrames on the same index.
clustered_df['PC 1'] = pcs_df['PC 1']
clustered_df['PC 2'] = pcs_df['PC 2']
clustered_df['PC 3'] = pcs_df['PC 3']
display(clustered_df)

# Add a new column, "CoinName", to the clustered_df DataFrame that holds the
# names of the cryptocurrencies.
clustered_df['CoinName'] = coins_df['CoinName']

# Add a new column, "Class", to the clustered_df DataFrame that holds the
# cluster label the fitted model assigned to each row.
clustered_df['Class'] = model.labels_

# Print the shape of the clustered_df
print(clustered_df.shape)
clustered_df.head(10)

### Deliverable 4: Visualizing Cryptocurrencies Results

#### 3D-Scatter with Clusters

In [None]:
# Plot the three principal components in 3D, one colour and marker per cluster.
fig1 = px.scatter_3d(
    data_frame=clustered_df,
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="Class",
    symbol="Class",
    hover_name="CoinName",
    hover_data=["Algorithm"],
)
# Position the legend at the top-left corner of the figure.
fig1.update_layout(legend={"x": 0, "y": 1})
fig1.show()

In [None]:
# Show the tradable cryptocurrencies as a table, limited to the columns of interest.
table_columns = [
    'CoinName',
    'Algorithm',
    'ProofType',
    'TotalCoinSupply',
    'TotalCoinsMined',
    'Class',
]
clustered_df.hvplot.table(columns=table_columns)

In [None]:
# Report how many tradable cryptocurrencies remain (non-null CoinName entries).
tradable_total = clustered_df['CoinName'].count()
print(f"There are {tradable_total} tradable cryptocurrencies.")

In [None]:
# Take the two supply columns and rescale them with MinMaxScaler for the scatter plot.
tradable_currency_df = clustered_df[['TotalCoinSupply', 'TotalCoinsMined']].copy()
display(tradable_currency_df)
mm_scaler = MinMaxScaler()
X_tc_scaled = mm_scaler.fit_transform(tradable_currency_df)
X_tc_scaled

In [None]:
# Put the scaled values back into a DataFrame indexed like tradable_currency_df.
n_scaled_df = pd.DataFrame(
    X_tc_scaled,
    index=tradable_currency_df.index.tolist(),
    columns=['TotalCoinSupply', 'TotalCoinsMined'],
)

# Attach the coin names and the cluster labels, matching rows on the index.
n_scaled_df = (
    n_scaled_df
    .join(coins_df, how='inner')
    .join(clustered_df['Class'], how='inner')
)
display(n_scaled_df)

In [None]:
# Scatter the scaled mined totals against the scaled supply, one series per cluster.
axis_labels = {
    "xlabel": "Total Cryptocurrency Coins Mined",
    "ylabel": "Total Cryptocurrency Coin Supply",
}
n_scaled_df.hvplot.scatter(
    x="TotalCoinsMined",
    y="TotalCoinSupply",
    by="Class",
    **axis_labels,
)