In [3]:
# Initial imports
import pandas as pd
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# Read crypto_data.csv into a DataFrame, using its first column as the row index
crypto_csv = Path("crypto_data.csv")
crypto_data = pd.read_csv(crypto_csv, index_col=0)
crypto_data.head()

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts)
crypto_data.info()

In [None]:
# Make 'TotalCoinSupply' numeric.
# errors='coerce' replaces any value that cannot be parsed with NaN instead of raising.
supply_as_number = pd.to_numeric(crypto_data['TotalCoinSupply'], errors='coerce')
crypto_data['TotalCoinSupply'] = supply_as_number
crypto_data.info()

In [None]:
# Keep only the cryptocurrencies whose 'IsTrading' flag is True
is_trading_mask = crypto_data['IsTrading'] == True
crypto_data = crypto_data.loc[is_trading_mask]
crypto_data.info()

In [None]:
# Drop the 'IsTrading' column from the frame
crypto_data = crypto_data.drop('IsTrading', axis='columns')
crypto_data.head()

In [None]:
# Discard every row that contains at least one missing value
crypto_data = crypto_data.dropna(axis='index', how='any')
crypto_data.head()

In [None]:
# Keep only the rows whose 'TotalCoinsMined' is strictly positive
has_mined_coins = crypto_data['TotalCoinsMined'].gt(0)
crypto_data = crypto_data.loc[has_mined_coins]
crypto_data.info()

In [None]:
# Set the coin names aside in `coin_names`, then drop the 'CoinName' column
# from the frame.
coin_names = crypto_data['CoinName']
crypto_data = crypto_data.drop('CoinName', axis='columns')
crypto_data.info()

In [None]:
# One-hot encode the text columns with dummy variables
text_columns = ['Algorithm', 'ProofType']
X = pd.get_dummies(crypto_data, columns=text_columns)
print(X.shape)
X.head()

In [None]:
# Standardise the features: fit the scaler on X, then transform the same data
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
print(X_scaled.shape)

In [None]:
# Reduce dimensionality with PCA.
# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share of the variance.
variance_to_keep = 0.90
pca = PCA(n_components=variance_to_keep)
crypto_pca = pca.fit_transform(X_scaled)
print(crypto_pca.shape)

In [None]:
# Project the PCA output down to two dimensions with t-SNE.
# random_state is fixed so the run is seeded.
tsne_settings = dict(n_components=2, random_state=42)
tsne = TSNE(**tsne_settings)
crypto_tsne = tsne.fit_transform(crypto_pca)
print(crypto_tsne.shape)

In [None]:
# Scatter-plot the two t-SNE dimensions against each other
fig, ax = plt.subplots(figsize=(10, 10), facecolor='w', edgecolor='k')
ax.scatter(crypto_tsne[:, 0], crypto_tsne[:, 1])
ax.set_title('t-SNE visualization of cryptocurrencies', fontdict={'fontsize': 20})
ax.set_xlabel('t-SNE 1')
ax.set_ylabel('t-SNE 2')
plt.show()

In [None]:
# Re-draw the t-SNE scatter and overlay four large translucent markers at
# hard-coded positions (presumably the visually apparent clusters — confirm
# against the rendered plot).
fig, ax = plt.subplots(figsize=(10, 10), facecolor='w', edgecolor='k')
ax.scatter(crypto_tsne[:, 0], crypto_tsne[:, 1])
ax.set_title('t-SNE visualization of cryptocurrencies', fontdict={'fontsize': 20})
ax.set_xlabel('t-SNE 1')
ax.set_ylabel('t-SNE 2')

# One entry per highlight: (x, y, colour, marker area)
highlights = [
    (5, 12, 'red', 20000),
    (-18, -5, 'blue', 18000),
    (4, -8, 'yellow', 10000),
    (28, -18, 'green', 5000),
]
for center_x, center_y, colour, area in highlights:
    ax.scatter(center_x, center_y, color=colour, marker='o', s=area, alpha=0.2)
plt.show()

In [None]:
# Find the best value for k using the elbow curve:
# fit k-means for k = 1..10 on the t-SNE output and record each model's inertia.

inertia = []
k = list(range(1, 11))
for i in k:
    # n_init is set explicitly because scikit-learn's default changed between
    # releases (10 -> 'auto'); pinning it keeps the curve the same across
    # versions and avoids the FutureWarning raised by 1.2/1.3.
    km = KMeans(n_clusters=i, n_init=10, random_state=0)
    km.fit(crypto_tsne)
    inertia.append(km.inertia_)

# Plot inertia against k with one tick per candidate k so the elbow is easy to read
elbow_df = pd.DataFrame({'k': k, 'inertia': inertia})
ax = elbow_df.plot(x='k', y='inertia', kind='line', xticks=k, title='Elbow curve')
ax.set_xlabel('k')
ax.set_ylabel('inertia')
plt.show()

In [None]:
# Perform clustering with k=4 on the t-SNE output.
# n_init is set explicitly because scikit-learn's default changed between
# releases (10 -> 'auto'); pinning it keeps the labels the same across versions.
km = KMeans(n_clusters=4, n_init=10, random_state=0)
# fit_predict fits the model and returns the cluster label of every sample in one call
predictions = km.fit_predict(crypto_tsne)

In [None]:
# Visualize the results: the t-SNE scatter coloured by k-means cluster label
fig, ax = plt.subplots(figsize=(10, 10), facecolor='w', edgecolor='k')
ax.scatter(crypto_tsne[:, 0], crypto_tsne[:, 1], c=predictions)
ax.set_title('Cryptocurrency clustering by k-means', fontdict={'fontsize': 20})
ax.set_xlabel('t-SNE 1')
ax.set_ylabel('t-SNE 2')
# Render the figure explicitly, like the other plotting cells, so the cell
# does not echo the repr of the last call as its output.
plt.show()