In [None]:
%pip install pandas numpy matplotlib seaborn scikit-learn


In [None]:
import os
import glob
import json
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# ─── suppress warnings ─────────────────────────────────────────────────────────
# NOTE(review): this silences every warning category for the whole process,
# including anything the numeric/plotting libraries below would report.
warnings.filterwarnings("ignore")

# ─── directories ───────────────────────────────────────────────────────────────
# Every path hangs off PROJECT_DIR, resolved under the current user's home.
# PREPROC_DIR holds the input CSVs and also receives the cluster CSV outputs;
# VIS_DIR receives every PNG; CLUSTER_RESULTS_PATH is the JSON run summary.
PROJECT_DIR          = os.path.expanduser("~/User/crypto proj")
DATA_DIR             = os.path.join(PROJECT_DIR, "data")
PREPROC_DIR          = os.path.join(DATA_DIR, "preprocessed")
VIS_DIR              = os.path.join(PROJECT_DIR, "visualizations", "clustering")
CLUSTER_RESULTS_PATH = os.path.join(PROJECT_DIR, "models", "clustering_results.json")

# Output folders are created at import time. PREPROC_DIR is not created here,
# so it must already exist (it is where the input files are read from).
os.makedirs(VIS_DIR, exist_ok=True)
os.makedirs(os.path.dirname(CLUSTER_RESULTS_PATH), exist_ok=True)

# ─── constants ────────────────────────────────────────────────────────────────
# Ticker symbol (prefix of the preprocessed file names) -> display name used in
# printed output, output file names and result keys.
COINS   = {'BTC': 'Bitcoin', 'ETH': 'Ethereum', 'DOGE': 'Dogecoin'}
# Per-coin DBSCAN `eps` value, keyed by display name.
EPS_MAP = {'Bitcoin': 1.5, 'Ethereum': 1.2, 'Dogecoin': 0.5}

# ─── helpers ──────────────────────────────────────────────────────────────────
def load_preprocessed(symbol: str) -> pd.DataFrame:
    """Read the newest preprocessed CSV for ``symbol`` into a DataFrame.

    Candidate files are those in ``PREPROC_DIR`` matching
    ``<symbol>_preprocessed_*.csv``. "Newest" is decided by filesystem
    modification time, not by whatever date appears in the file name.

    Args:
        symbol: Ticker prefix used in the file name (e.g. ``"BTC"``).

    Returns:
        The CSV contents, indexed by its parsed ``Date`` column.

    Raises:
        FileNotFoundError: If no file matches the pattern.
    """
    candidates = glob.glob(
        os.path.join(PREPROC_DIR, f"{symbol}_preprocessed_*.csv")
    )
    if len(candidates) == 0:
        raise FileNotFoundError(f"No preprocessed files for {symbol}")

    newest_path = max(candidates, key=os.path.getmtime)
    file_name = os.path.basename(newest_path)
    print(f"Loaded {file_name}")

    frame = pd.read_csv(newest_path, parse_dates=["Date"], index_col="Date")
    return frame

def prepare_clustering_data(df: pd.DataFrame):
    """Build the scaled feature matrix used for clustering.

    Picks a fixed list of feature columns from ``df``, keeps only the numeric
    ones, removes every row containing a NaN, and standardises the remainder
    with a freshly fitted ``StandardScaler``.

    Args:
        df: Frame containing all of the feature columns listed below.

    Returns:
        A ``(data_scaled, feats)`` pair: the scaled array and the unscaled
        feature frame it was computed from (same rows, same order).
    """
    feature_names = [
        'Close',
        'Volume',
        'Daily_Return',
        'Volatility_30',
        'RSI',
        'MACD_Hist',
        'Weekly_Return',
        'MA30',
    ]
    selected = df[feature_names]
    numeric_only = selected.select_dtypes(include=[np.number])
    feats = numeric_only.dropna()

    data_scaled = StandardScaler().fit_transform(feats)
    return data_scaled, feats

def determine_optimal_clusters(data: np.ndarray, max_k: int = 15, coin_name=None) -> int:
    """Pick the KMeans cluster count with the highest silhouette score.

    Fits KMeans for every k from 2 up to ``max_k`` (capped at
    ``n_samples - 1``, the largest k for which a silhouette score is
    defined), records inertia and silhouette for each k, and saves a
    side-by-side elbow / silhouette figure into ``VIS_DIR``.

    Args:
        data: Feature matrix with one row per sample.
        max_k: Largest cluster count to try.
        coin_name: Optional label prefixed to the figure file name. When
            omitted the figure goes to ``elbow_silhouette.png``, so successive
            calls overwrite one another's plot.

    Returns:
        The k with the highest silhouette score (the smallest such k on ties).

    Raises:
        ValueError: If there is no valid k to try, i.e. ``max_k < 2`` or
            ``data`` has fewer than three rows.
    """
    n_samples = len(data)
    # silhouette_score is only defined for 2 <= k <= n_samples - 1, so a small
    # dataset must shrink the search range instead of failing mid-loop.
    largest_k = min(max_k, n_samples - 1)
    if largest_k < 2:
        raise ValueError(
            f"Cannot search for k: need max_k >= 2 and at least 3 samples "
            f"(got max_k={max_k}, n_samples={n_samples})"
        )
    ks = range(2, largest_k + 1)

    inertias, silhouettes = [], []
    for k in ks:
        km = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = km.fit_predict(data)
        inertias.append(km.inertia_)
        silhouettes.append(silhouette_score(data, labels))

    # plot elbow & silhouette
    fig, (ax_elbow, ax_sil) = plt.subplots(1, 2, figsize=(12, 5))
    ax_elbow.plot(ks, inertias, 'o-')
    ax_elbow.set(title="Elbow", xlabel="k", ylabel="Inertia")
    ax_sil.plot(ks, silhouettes, 'o-')
    ax_sil.set(title="Silhouette", xlabel="k", ylabel="Score")
    fig.tight_layout()
    if coin_name is None:
        file_name = "elbow_silhouette.png"
    else:
        file_name = f"{coin_name}_elbow_silhouette.png"
    fig.savefig(os.path.join(VIS_DIR, file_name))
    plt.close(fig)

    # np.argmax returns the first maximum, so ties resolve to the smaller k.
    best_k = ks[int(np.argmax(silhouettes))]
    print(f" → Optimal k by silhouette: {best_k}")
    return best_k

def perform_clustering(data, k, feats, coin_name, eps):
    """Cluster ``data`` four ways and report the best by silhouette score.

    KMeans, DBSCAN, agglomerative clustering and a Gaussian mixture are all
    fitted on ``data``. DBSCAN labels are stored but never scored, so the
    "best" algorithm is always one of the other three.

    Args:
        data: Scaled feature matrix, row-aligned with ``feats``.
        k: Cluster / component count for KMeans, agglomerative and GMM.
        feats: Unscaled feature frame; a copy of it receives the label columns.
        coin_name: Prefix of the assignments CSV written to ``PREPROC_DIR``.
        eps: Neighbourhood radius passed to DBSCAN.

    Returns:
        ``(dfc, best_alg, scores)``: the labelled copy of ``feats`` (one column
        per algorithm plus ``Best_Cluster``), the name of the top-scoring
        algorithm, and the silhouette score of each scored algorithm.
    """
    kmeans_model = KMeans(n_clusters=k, random_state=42, n_init=10)
    dbscan_model = DBSCAN(eps=eps, min_samples=7)
    agglo_model = AgglomerativeClustering(n_clusters=k)
    gmm_model = GaussianMixture(n_components=k, random_state=42)

    # Dict literals evaluate in order, so the models are fitted in this order.
    labels = {
        'KMeans': kmeans_model.fit_predict(data),
        'DBSCAN': dbscan_model.fit_predict(data),
        'Agglomerative': agglo_model.fit_predict(data),
    }
    gmm_model.fit(data)
    labels['GMM'] = gmm_model.predict(data)

    # DBSCAN is skipped unconditionally when scoring.
    scores = {}
    for alg_name, assigned in labels.items():
        if alg_name == 'DBSCAN':
            continue
        scores[alg_name] = silhouette_score(data, assigned)
    best_alg = max(scores, key=scores.get)
    print(f" → Silhouette scores: {scores}")
    print(f" → Best algorithm: {best_alg}")

    dfc = feats.copy()
    for alg_name, assigned in labels.items():
        dfc[alg_name] = assigned
    dfc['Best_Cluster'] = labels[best_alg]

    # Persist the per-row assignments next to the preprocessed inputs.
    out_assign = os.path.join(PREPROC_DIR, f"{coin_name}_clustering.csv")
    dfc.to_csv(out_assign)
    print(f"Saved assignments to {out_assign}")

    return dfc, best_alg, scores

def analyze_clusters(dfc, best_alg, coin_name):
    """Summarise each cluster and give it a market-regime name.

    Groups ``dfc`` by the ``best_alg`` label column, computes per-cluster
    summary statistics plus each cluster's share of rows, writes the table to
    ``PREPROC_DIR`` and a heatmap of selected columns to ``VIS_DIR``.

    Args:
        dfc: Labelled feature frame containing the ``best_alg`` column.
        best_alg: Name of the label column to group by.
        coin_name: Prefix for the output file names and the heatmap title.

    Returns:
        Dict mapping each cluster label to a regime string: "Bull", "Bear" or
        "Sideways" by mean daily return, optionally suffixed "-OB" / "-OS" by
        mean RSI.
    """
    aggregations = {
        'Close': ['mean', 'std', 'min', 'max'],
        'Volume': ['mean', 'std'],
        'Daily_Return': ['mean', 'std'],
        'Volatility_30': ['mean', 'std'],
        'RSI': ['mean', 'std'],
    }
    by_cluster = dfc.groupby(best_alg)
    stats = by_cluster.agg(aggregations)
    # Flatten the (feature, statistic) column pairs into "feature_statistic".
    stats.columns = [f"{feature}_{statistic}" for feature, statistic in stats.columns]
    stats['Percentage'] = by_cluster.size() / len(dfc) * 100

    out_stats = os.path.join(PREPROC_DIR, f"{coin_name}_cluster_stats.csv")
    stats.to_csv(out_stats)
    print(f"Saved cluster stats to {out_stats}")

    # heatmap of the headline statistics
    heat_columns = [
        'Close_mean',
        'Daily_Return_mean',
        'Volatility_30_mean',
        'RSI_mean',
        'Volume_mean',
        'Percentage',
    ]
    plt.figure(figsize=(10, 6))
    sns.heatmap(stats[heat_columns], annot=True, fmt=".2f", cmap='coolwarm')
    plt.title(f"{coin_name} Cluster Characteristics")
    plt.tight_layout()
    heatmap_path = os.path.join(VIS_DIR, f"{coin_name}_cluster_stats.png")
    plt.savefig(heatmap_path)
    plt.close()

    # Name each cluster: trend from mean daily return, suffix from mean RSI.
    regimes = {}
    per_cluster = zip(stats.index, stats['Daily_Return_mean'], stats['RSI_mean'])
    for cluster_id, mean_return, mean_rsi in per_cluster:
        if mean_return > 0.01:
            trend = "Bull"
        elif mean_return < -0.01:
            trend = "Bear"
        else:
            trend = "Sideways"

        if mean_rsi > 70:
            suffix = "-OB"
        elif mean_rsi < 30:
            suffix = "-OS"
        else:
            suffix = ""

        regimes[cluster_id] = trend + suffix

    print(f" → Regimes mapping: {regimes}")
    return regimes

def visualize(dfc, data, best_alg, coin_name, regimes=None):
    """Save the PCA scatter, price-over-time scatter and market-regime plots.

    All three figures are written as PNG files into ``VIS_DIR``.

    Args:
        dfc: Labelled frame; must contain a ``Close`` column and a column
            named ``best_alg`` holding cluster labels. A ``Regime`` column is
            added to it in place.
        data: Matrix the clustering was run on, row-aligned with ``dfc``; it is
            projected onto two principal components for the scatter plot.
        best_alg: Name of the label column used to colour the points.
        coin_name: Used in plot titles and output file names.
        regimes: Mapping of cluster label -> regime name. If omitted, a
            module-level variable named ``regimes`` is used when one exists;
            this keeps older four-argument calls working.

    Raises:
        ValueError: If no regime mapping is passed and none exists at module
            level.
    """
    # Previously `regimes` was read implicitly from module scope, which raised
    # a bare NameError whenever the function was used outside the main loop.
    if regimes is None:
        regimes = globals().get('regimes')
    if regimes is None:
        raise ValueError(
            "visualize() needs a cluster -> regime mapping: pass `regimes=` "
            "(e.g. the return value of analyze_clusters)"
        )

    # PCA
    pca = PCA(n_components=2)
    pcs = pca.fit_transform(data)
    dfp = pd.DataFrame(pcs, columns=['PC1', 'PC2'], index=dfc.index)
    dfp['Cluster'] = dfc[best_alg]
    plt.figure(figsize=(8, 6))
    plt.scatter(dfp['PC1'], dfp['PC2'], c=dfp['Cluster'], cmap='tab10', alpha=0.6)
    plt.title(f"{coin_name} PCA clusters ({best_alg})")
    plt.tight_layout()
    plt.savefig(os.path.join(VIS_DIR, f"{coin_name}_{best_alg}_pca.png"))
    plt.close()

    # time series colored by cluster label
    plt.figure(figsize=(10, 4))
    plt.scatter(dfc.index, dfc['Close'], c=dfc[best_alg], cmap='tab10', s=10)
    plt.title(f"{coin_name} Price over time ({best_alg})")
    plt.tight_layout()
    plt.savefig(os.path.join(VIS_DIR, f"{coin_name}_{best_alg}_time.png"))
    plt.close()

    # market regimes: price line with one coloured scatter series per regime
    dfc['Regime'] = dfc[best_alg].map(regimes)
    uniq = dfc['Regime'].unique()
    colors = plt.cm.tab20(np.linspace(0, 1, len(uniq)))
    cmap = dict(zip(uniq, colors))
    plt.figure(figsize=(12, 5))
    plt.plot(dfc.index, dfc['Close'], color='k', lw=1)
    for reg in uniq:
        sub = dfc[dfc['Regime'] == reg]
        plt.scatter(sub.index, sub['Close'], color=cmap[reg], label=reg, s=8)
    plt.title(f"{coin_name} Market Regimes")
    plt.legend(loc='upper left', ncol=2)
    plt.tight_layout()
    plt.savefig(os.path.join(VIS_DIR, f"{coin_name}_market_regimes.png"))
    plt.close()

# ─── main ─────────────────────────────────────────────────────────────────────
# Script entry point: run the full pipeline (load -> scale -> choose k ->
# cluster -> analyse -> plot) once per coin in COINS, then write a JSON summary
# and a bar chart comparing the best silhouette score of each coin.
if __name__ == "__main__":
    # display name -> {'Optimal_Clusters', 'Best_Algorithm', 'Silhouette'}
    clustering_results = {}

    print("Starting clustering analysis...\n")
    for sym, nice in COINS.items():
        print(f"=== {nice} ===")
        df = load_preprocessed(sym)

        data, feats = prepare_clustering_data(df)
        # NOTE(review): called without a coin label, so the elbow/silhouette
        # figure goes to one fixed file name and each coin overwrites the last.
        k = determine_optimal_clusters(data)
        dfc, best_alg, scores = perform_clustering(data, k, feats, nice, EPS_MAP[nice])
        # `regimes` is not passed to visualize(); that function picks up this
        # module-level name, so do not rename or relocate this assignment.
        regimes = analyze_clusters(dfc, best_alg, nice)
        visualize(dfc, data, best_alg, nice)

        clustering_results[nice] = {
            'Optimal_Clusters': k,
            'Best_Algorithm': best_alg,
            # cast to a built-in float before handing the value to json.dump
            'Silhouette': float(scores[best_alg])
        }
        print("\n")

    # save summary JSON
    with open(CLUSTER_RESULTS_PATH, 'w') as f:
        json.dump(clustering_results, f, indent=2)
    print(f"Saved clustering summary to {CLUSTER_RESULTS_PATH}")

    # summary bar chart: one bar per coin, in COINS order
    names = list(clustering_results.keys())
    sils  = [clustering_results[n]['Silhouette'] for n in names]
    plt.figure(figsize=(6,4))
    # three colours listed, matching the three entries currently in COINS
    bars = plt.bar(names, sils, color=['C0','C1','C2'])
    plt.title("Best clustering silhouette by coin")
    plt.ylabel("Silhouette score")
    plt.bar_label(bars, fmt="%.2f")
    plt.tight_layout()
    plt.savefig(os.path.join(VIS_DIR, "best_silhouette_comparison.png"))
    plt.close()

    print("\nClustering analysis complete. Visuals in", VIS_DIR)