In [None]:
%pip install numpy pandas matplotlib seaborn scikit-learn statsmodels joblib
%pip install datetime

In [None]:
import glob
import json
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# ─── suppress warnings ─────────────────────────────────────────────────────────
# Silence noisy-but-expected warnings from pandas/sklearn during grid search.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# ─── directories ───────────────────────────────────────────────────────────────
PROJECT_DIR    = os.path.expanduser("~/User/crypto proj")
PREPROCESS_DIR = os.path.join(PROJECT_DIR, "data", "preprocessed")
INFO_PATH      = os.path.join(PREPROCESS_DIR, "preprocessing_info.json")
RESULTS_DIR    = os.path.join(PROJECT_DIR, "regression_results")
MODEL_DIR      = os.path.join(PROJECT_DIR, "models")
PLOTS_DIR      = os.path.join(PROJECT_DIR, "visualizations", "regression")

# Make sure every directory the pipeline reads from or writes to exists.
for d in (PREPROCESS_DIR, RESULTS_DIR, MODEL_DIR, PLOTS_DIR):
    os.makedirs(d, exist_ok=True)

# ─── constants ────────────────────────────────────────────────────────────────
TARGET       = "Target_Next_Day"   # column the models are trained to predict
TEST_SIZE    = 0.2                 # fraction of rows held out for testing
RANDOM_STATE = 42                  # seed for the tree-based estimators

# ─── load symbol list ───────────────────────────────────────────────────────────
# JSON is UTF-8 by specification; be explicit so the read does not depend on
# the platform's locale encoding.
with open(INFO_PATH, "r", encoding="utf-8") as f:
    info = json.load(f)
# The keys of the 'Total Records' mapping are the symbols to process.
SYMBOLS = list(info["Total Records"])  # e.g. ['BTC','ETH','DOGE']

# ─── helper functions ──────────────────────────────────────────────────────────
def load_latest(symbol: str) -> pd.DataFrame:
    """Read the newest preprocessed CSV for `symbol`, indexed by its Date column.

    "Newest" means the file with the greatest modification time among those
    matching ``<symbol>_preprocessed_*.csv`` in PREPROCESS_DIR.

    Raises:
        FileNotFoundError: if no matching file exists.
    """
    candidates = glob.glob(
        os.path.join(PREPROCESS_DIR, f"{symbol}_preprocessed_*.csv")
    )
    if len(candidates) == 0:
        raise FileNotFoundError(f"No preprocessed files found for {symbol}")

    newest = max(candidates, key=lambda path: os.path.getmtime(path))
    print(f"Loaded {os.path.basename(newest)} for {symbol}")
    return pd.read_csv(newest, parse_dates=["Date"], index_col="Date")

def prepare_data(df: pd.DataFrame):
    """Return a chronological train/test split of features and target.

    Features are all numeric columns except the target columns. A row is kept
    only when every feature AND the target are non-missing, so the estimators
    never see NaN in either X or y.

    Args:
        df: preprocessed frame containing the TARGET column, rows in
            chronological order.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as produced by
        ``train_test_split`` with ``shuffle=False`` (the last TEST_SIZE
        fraction of rows forms the test set).

    Raises:
        KeyError: if TARGET is not a column of `df`.
    """
    # Always exclude the active target as well as the other known horizons so
    # no future-looking column leaks into the features.
    excluded = sorted({TARGET, "Target_Next_Day", "Target_Next_Week", "Target_Next_Month"})
    features = df.select_dtypes(include=[np.number]).drop(columns=excluded, errors="ignore")
    target = df[TARGET]

    # Positional boolean mask: works even if the index has duplicate labels,
    # where label-based `.loc[X.index]` would duplicate rows.
    keep = (features.notna().all(axis=1) & target.notna()).to_numpy()
    X = features.loc[keep]
    y = target.loc[keep]
    return train_test_split(X, y, test_size=TEST_SIZE, shuffle=False)

def evaluate_metrics(y_true, y_pred):
    """Compute RMSE, MAE and R2 for a single-output regression.

    Args:
        y_true: ground-truth target values.
        y_pred: predicted target values, aligned with `y_true`.

    Returns:
        dict with keys ``"RMSE"``, ``"MAE"`` and ``"R2"``.
    """
    # The `squared=False` argument of mean_squared_error was deprecated in
    # scikit-learn 1.4 and removed in 1.6; taking the square root ourselves
    # works on every version.
    mse = mean_squared_error(y_true, y_pred)
    return {
        "RMSE": float(np.sqrt(mse)),
        "MAE":  mean_absolute_error(y_true, y_pred),
        "R2":   r2_score(y_true, y_pred),
    }

def plot_bar(series, title, out_path):
    """Save a bar chart of `series` (sorted ascending) with a value label on each bar."""
    ordered = series.sort_values()
    axes = ordered.plot.bar(title=title, figsize=(8, 4))
    first_bars = axes.containers[0]
    axes.bar_label(first_bars, fmt="%.2f")

    # pandas created a new figure for this plot; finish and release it.
    figure = axes.get_figure()
    figure.tight_layout()
    figure.savefig(out_path)
    plt.close(figure)

def plot_actual_vs_pred(y_true, y_pred, symbol, model_name):
    """Save a line plot of actual and predicted values over the index of `y_true`.

    The image is written to PLOTS_DIR as ``<symbol>_<model_name>_pred.png``.
    """
    figure, axes = plt.subplots(figsize=(10, 4))
    x_values = y_true.index
    axes.plot(x_values, y_true, label="Actual")
    axes.plot(x_values, y_pred, label="Predicted", alpha=0.7)
    axes.set_title(f"{symbol} — {model_name}")
    axes.legend()
    figure.tight_layout()

    target_file = os.path.join(PLOTS_DIR, f"{symbol}_{model_name}_pred.png")
    figure.savefig(target_file)
    plt.close(figure)

def save_feature_importance(model, features, symbol, model_name):
    """Write the model's feature importances (descending) to a CSV in RESULTS_DIR.

    Does nothing when `model` exposes no ``feature_importances_`` attribute.
    """
    if not hasattr(model, "feature_importances_"):
        return

    ranked = pd.Series(model.feature_importances_, index=features)
    ranked = ranked.sort_values(ascending=False)
    csv_path = os.path.join(RESULTS_DIR, f"{symbol}_{model_name}_feat_imp.csv")
    ranked.to_csv(csv_path)

def fit_and_evaluate(name, estimator, param_grid, X_tr, y_tr, X_te, y_te):
    """Fit an estimator (with optional grid search) and score it on the test set.

    Args:
        name: label used in progress output.
        estimator: unfitted scikit-learn estimator or Pipeline; it is never
            fitted in place, so the caller's object can be reused.
        param_grid: parameter grid for GridSearchCV; an empty/falsy value
            skips the search and fits the estimator as configured.
        X_tr, y_tr: training features and target, in chronological order.
        X_te, y_te: test features and target.

    Returns:
        ``(model, metrics, y_pred)`` — the fitted model, the dict returned by
        ``evaluate_metrics`` and the predictions for `X_te`.
    """
    if param_grid:
        # The rows are time-ordered, so each validation fold must come after
        # its training fold; plain K-fold (cv=5) would train on future rows
        # and validate on past ones.
        search = GridSearchCV(
            estimator,
            param_grid,
            cv=TimeSeriesSplit(n_splits=5),
            n_jobs=-1,
            scoring="r2",
        )
        search.fit(X_tr, y_tr)
        model = search.best_estimator_
        print(f"  {name} best params: {search.best_params_}")
    else:
        # Fit a clone so the shared estimator held by the caller (and reused
        # for every symbol) is not mutated.
        model = clone(estimator).fit(X_tr, y_tr)

    y_pred  = model.predict(X_te)
    metrics = evaluate_metrics(y_te, y_pred)
    return model, metrics, y_pred

# ─── model configurations ──────────────────────────────────────────────────────
# Kept as a module-level name for any code that still refers to it; the
# pipelines below no longer share this instance.
scaler_step = ("scaler", StandardScaler())


def _scaled(model):
    """Return a Pipeline that standardises features with its own scaler, then applies `model`."""
    # A Pipeline fits its steps in place (no cloning unless caching is on), so
    # every pipeline needs a separate StandardScaler instance.
    return Pipeline([("scaler", StandardScaler()), ("model", model)])


# name -> (estimator, param_grid); an empty grid means "fit as configured".
MODELS = {
    "Linear":     (_scaled(LinearRegression()), {}),
    "Ridge":      (_scaled(Ridge(max_iter=10000)),
                   {"model__alpha":[0.001,0.01,0.1,1,10]}),
    "Lasso":      (_scaled(Lasso(max_iter=50000, tol=1e-3)),
                   {"model__alpha":[0.001,0.01,0.1,1]}),
    "ElasticNet": (_scaled(ElasticNet(max_iter=50000, tol=1e-3)),
                   {"model__alpha":[0.001,0.01,0.1], "model__l1_ratio":[0.2,0.5,0.8]}),
    "SVR":        (_scaled(SVR()),
                   {"model__C":[0.1,1,10], "model__epsilon":[0.01,0.1,1]}),
    # Tree ensembles do not need feature scaling.
    "GBR":        (GradientBoostingRegressor(random_state=RANDOM_STATE), {}),
    "RF":         (RandomForestRegressor(n_estimators=200, random_state=RANDOM_STATE), {})
}

# ─── main routine ──────────────────────────────────────────────────────────────
def _process_symbol(sym):
    """Train, persist and compare every configured model for one symbol.

    Writes the fitted models, per-model plots, feature importances, the
    per-symbol metrics CSV and the R²/RMSE bar charts, then returns the
    summary row (dict) for the model with the highest test R2.
    """
    print(f"=== Processing {sym} ===")
    df = load_latest(sym)
    X_tr, X_te, y_tr, y_te = prepare_data(df)
    print(f"Data split: X_train={X_tr.shape}, X_test={X_te.shape}\n")

    metrics_dict = {}
    for name, (est, params) in MODELS.items():
        print(f"-->> Training {name}")
        model, mets, y_pred = fit_and_evaluate(name, est, params, X_tr, y_tr, X_te, y_te)
        print(f"   {name} metrics: R2={mets['R2']:.4f}, RMSE={mets['RMSE']:.2f}, MAE={mets['MAE']:.2f}\n")
        metrics_dict[name] = mets

        # persist model and outputs
        joblib.dump(model, os.path.join(MODEL_DIR, f"{sym}_{name}.pkl"))
        plot_actual_vs_pred(y_te, y_pred, sym, name)
        save_feature_importance(model, X_tr.columns, sym, name)

    # metrics DataFrame: one row per model, fixed column order
    dfm = pd.DataFrame(metrics_dict).T[["RMSE","MAE","R2"]]
    dfm.to_csv(os.path.join(RESULTS_DIR, f"{sym}_metrics.csv"))

    # plot metrics
    plot_bar(dfm["R2"],  f"{sym} test R²",  os.path.join(PLOTS_DIR, f"{sym}_r2.png"))
    plot_bar(dfm["RMSE"], f"{sym} test RMSE", os.path.join(PLOTS_DIR, f"{sym}_rmse.png"))

    # print comparison & best model (highest test R2)
    print("Model comparison:")
    for m, row in dfm.iterrows():
        print(f"  {m:10s} -> R2: {row['R2']:.4f}, RMSE: {row['RMSE']:.2f}, MAE: {row['MAE']:.2f}")
    best = dfm["R2"].idxmax()
    best_stats = dfm.loc[best]
    print(f"\nBest model for {sym}: {best} (R2={best_stats['R2']:.4f}, RMSE={best_stats['RMSE']:.2f}, MAE={best_stats['MAE']:.2f})\n")

    return {
        "Crypto": sym,
        "Model": best,
        **best_stats.to_dict()
    }


def main():
    """Run the regression pipeline for every symbol and write a consolidated summary.

    For each entry in SYMBOLS the per-symbol work is delegated to
    ``_process_symbol``; afterwards the best model per symbol is written to
    ``regression_summary.csv`` and plotted as ``best_model_comparison.png``.
    """
    print("Starting regression pipeline...\n")
    summary = [_process_symbol(sym) for sym in SYMBOLS]

    # With no symbols the summary frame has no columns, and the column lookups
    # below would raise KeyError — stop early with a clear message instead.
    if not summary:
        print("No symbols to process; nothing to summarise.")
        return

    # consolidated summary CSV
    summary_df = pd.DataFrame(summary)
    summary_path = os.path.join(RESULTS_DIR, "regression_summary.csv")
    summary_df.to_csv(summary_path, index=False)

    # summary of best models
    print("Summary of best models:")
    for _, row in summary_df.iterrows():
        print(f"{row['Crypto']} -> {row['Model']} (R2={row['R2']:.4f}, RMSE={row['RMSE']:.2f}, MAE={row['MAE']:.2f})")

    # best-model comparison plot
    fig, ax = plt.subplots(figsize=(6,4))
    bars = ax.bar(summary_df["Crypto"], summary_df["R2"])
    ax.set_title("Best R² by Cryptocurrency")
    ax.bar_label(bars, fmt="%.2f")
    fig.tight_layout()
    fig.savefig(os.path.join(PLOTS_DIR, "best_model_comparison.png"))
    plt.close(fig)

    print(f"\ndone!\nModels saved in:  {MODEL_DIR}\nMetrics saved in: {RESULTS_DIR}\nPlots saved in:   {PLOTS_DIR}")

# Run the pipeline only when this file is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()
