In [None]:
%pip install pandas numpy matplotlib seaborn scikit-learn statsmodels
%pip install datetime

In [None]:
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.seasonal import seasonal_decompose
from datetime import datetime


# ─── Directories ─────────────────────────────────────────────────────────────

# Root of the project; raw CSVs live under data/, outputs under
# data/preprocessed/ and visualizations/.
PROJECT_DIR = os.path.expanduser("~/User/crypto proj")
DATA_DIR = os.path.join(PROJECT_DIR, "data")
PREPROCESS_DIR = os.path.join(DATA_DIR, "preprocessed")
VIS_DIR = os.path.join(PROJECT_DIR, "visualizations")

# Create the output folders up front (parents included); no-op if present.
for _out_dir in (PREPROCESS_DIR, VIS_DIR):
    os.makedirs(_out_dir, exist_ok=True)

# ───Which coins ─────────────────────────────────────────────────────────────

# Ticker symbol -> display name. The symbol doubles as the CSV file stem.
COINS = dict(
    BTC='Bitcoin',
    ETH='Ethereum',
    DOGE='Dogecoin',
)

# ───Preprocessing ───────────────────────────────────────────────────────────

def preprocess(symbol: str, nice_name: str) -> pd.DataFrame:
    """Build the feature table for one coin and save it as a dated CSV.

    Reads ``DATA_DIR/<symbol>.csv`` (which must provide ``Date``, ``Open``,
    ``High``, ``Low``, ``Close`` and ``Volume`` columns), derives moving
    averages, momentum, volatility, RSI, MACD, forward-looking targets,
    calendar fields and returns from ``Close``, drops every row that still
    contains a NaN, appends standardised copies of ``Close``, ``MA7`` and
    ``MACD``, and writes the result to
    ``PREPROCESS_DIR/<symbol>_preprocessed_<YYYY-MM-DD>.csv``.

    Args:
        symbol: Ticker used to locate the input file and to name the output.
        nice_name: Human-readable coin name; only used in the progress message.

    Returns:
        The processed DataFrame, indexed by ``Date`` in ascending order.

    Raises:
        ValueError: If no complete row survives feature engineering. The
            90-row moving average and the 30-row-ahead target together need
            at least 120 usable rows.
    """
    print(f"Preprocessing {nice_name} ({symbol})…")
    df = pd.read_csv(os.path.join(DATA_DIR, f"{symbol}.csv"))

    # Header names may carry stray whitespace; normalise before any lookup.
    df.columns = df.columns.str.strip()
    # Optional bookkeeping columns; drop them when present, ignore otherwise.
    df.drop(columns=['conversionType', 'conversionSymbol'],
            errors='ignore', inplace=True)

    # Parse dates and index by them.
    df['Date'] = pd.to_datetime(df['Date'])
    df.set_index('Date', inplace=True)
    # Every rolling/shift/pct_change below is positional, so the rows must be
    # oldest-first. A stable sort keeps the file order of duplicate dates and
    # leaves an already-sorted file unchanged.
    df.sort_index(kind='mergesort', inplace=True)

    # Force price/volume columns to numeric; unparseable cells become NaN.
    for col in ['Open', 'High', 'Low', 'Close', 'Volume']:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Rows without a Close are useless for every feature below.
    df.dropna(subset=['Close'], inplace=True)

    close = df['Close']
    daily_ret = close.pct_change()  # reused for volatility and Daily_Return

    # Rolling means of Close.
    for window in (7, 30, 90):
        df[f'MA{window}'] = close.rolling(window).mean()

    # Momentum: rate of change over N rows.
    for lag in (5, 10, 30):
        df[f'ROC_{lag}'] = close.pct_change(lag)

    # Volatility: rolling std of the one-row return.
    for window in (7, 30):
        df[f'Volatility_{window}'] = daily_ret.rolling(window).std()

    # RSI from simple 14-row rolling means of gains and losses.
    # A zero average loss gives rs = inf and therefore RSI = 100; a window
    # with neither gains nor losses gives NaN and is removed by dropna below.
    delta = close.diff()
    avg_gain = delta.clip(lower=0).rolling(14).mean()
    avg_loss = (-delta.clip(upper=0)).rolling(14).mean()
    rs = avg_gain / avg_loss
    df['RSI'] = 100 - (100 / (1 + rs))

    # MACD (12/26 EMA difference) with a 9-span signal line.
    ema_fast = close.ewm(span=12, adjust=False).mean()
    ema_slow = close.ewm(span=26, adjust=False).mean()
    macd = ema_fast - ema_slow
    signal = macd.ewm(span=9, adjust=False).mean()
    df['MACD'] = macd
    df['MACD_Signal'] = signal
    df['MACD_Hist'] = macd - signal

    # Targets: Close 1 / 7 / 30 rows ahead. These are row offsets, so they
    # equal calendar days only when the series has no missing dates.
    df['Target_Next_Day'] = close.shift(-1)
    df['Target_Next_Week'] = close.shift(-7)
    df['Target_Next_Month'] = close.shift(-30)

    # Calendar features taken from the datetime index.
    df['Day_of_Week'] = df.index.dayofweek
    df['Month'] = df.index.month
    df['Year'] = df.index.year

    # Simple returns over 1 / 7 / 30 rows.
    df['Daily_Return'] = daily_ret
    df['Weekly_Return'] = close.pct_change(7)
    df['Monthly_Return'] = close.pct_change(30)

    # Keep only fully populated rows: this trims the warm-up period of the
    # long windows at the start and the rows without a future target at the end.
    df.dropna(inplace=True)
    if df.empty:
        raise ValueError(
            f"No complete rows left for {symbol} after feature engineering; "
            "at least 120 rows with a valid Close are required."
        )

    # Standardised copies of selected columns.
    # NOTE: the scaler is fitted on the whole history, so the *_scaled columns
    # carry information from later rows; refit on the training split before
    # using them in a model evaluation.
    scaled_cols = ['Close', 'MA7', 'MACD']
    scaled_values = StandardScaler().fit_transform(df[scaled_cols])
    df[[f"{col}_scaled" for col in scaled_cols]] = scaled_values

    # Save under a date-stamped name so successive runs do not overwrite
    # each other across days.
    today = datetime.now().strftime('%Y-%m-%d')
    out_csv = os.path.join(PREPROCESS_DIR, f"{symbol}_preprocessed_{today}.csv")
    df.to_csv(out_csv)

    return df

# run for all coins
# Processed frame per ticker, in COINS order.
prepped = {
    symbol: preprocess(symbol, nice_name)
    for symbol, nice_name in COINS.items()
}

# ─── Metadata JSON ───────────────────────────────────────────────────────────

# Run summary written next to the preprocessed CSVs: row count per coin and
# the groups of columns that preprocess() adds.
info = {
    'Total Records':     {symbol: len(frame) for symbol, frame in prepped.items()},
    'Features Created':  [
        'MA7, MA30, MA90',
        'ROC_5, ROC_10, ROC_30',
        'Volatility_7, Volatility_30',
        'RSI',
        'MACD, MACD_Signal, MACD_Hist',
        'Target_Next_Day, Target_Next_Week, Target_Next_Month',
        'Day_of_Week, Month, Year',
        'Daily_Return, Weekly_Return, Monthly_Return',
        # Added by the StandardScaler step in preprocess(); previously
        # missing from this list although the columns are written to disk.
        'Close_scaled, MA7_scaled, MACD_scaled',
    ]
}

# Explicit encoding so the file does not depend on the platform default.
with open(os.path.join(PREPROCESS_DIR, "preprocessing_info.json"), 'w',
          encoding='utf-8') as f:
    json.dump(info, f, indent=2)
print("Wrote preprocessing_info.json")

# ───Visualizations ──────────────────────────────────────────────────────────

# For every coin, write three PNGs into VIS_DIR: a feature-correlation
# heatmap, a bar chart of the features most correlated with the next-day
# target, and (best-effort) a seasonal decomposition of Close.
for symbol, nice_name in COINS.items():
    df = prepped[symbol]
    nums = df.select_dtypes(include=[np.number])
    # Pairwise correlations are needed by both (a) and (b); compute them once.
    corr = nums.corr()

    # a) Correlation heatmap
    plt.figure(figsize=(12, 10))
    sns.heatmap(corr, cmap='coolwarm', linewidths=0.4)
    plt.title(f"{nice_name} — Feature Correlation")
    plt.tight_layout()
    plt.savefig(os.path.join(VIS_DIR, f"{symbol}_correlation.png"))
    plt.close()

    # b) Top-15 with Next-Day target
    # The target columns themselves are excluded so the ranking only shows
    # candidate input features.
    fc = corr['Target_Next_Day'].drop(
        ['Target_Next_Day', 'Target_Next_Week', 'Target_Next_Month'], errors='ignore'
    )
    plt.figure(figsize=(10, 6))
    fc.abs().sort_values(ascending=False).head(15).plot(kind='bar')
    plt.ylabel("Abs Correlation")
    plt.title(f"{nice_name} — Top 15 Features vs Next-Day Close")
    plt.tight_layout()
    plt.savefig(os.path.join(VIS_DIR, f"{symbol}_feature_importance.png"))
    plt.close()

    # c) Seasonal decomposition
    # Best-effort: any failure (decomposition itself, plotting or saving) is
    # reported and the loop continues with the next coin.
    fig = None
    try:
        dec = seasonal_decompose(df['Close'], model='multiplicative', period=365)
        fig, axes = plt.subplots(4, 1, figsize=(10, 12))
        panels = (
            (dec.observed, "Observed"),
            (dec.trend, "Trend"),
            (dec.seasonal, "Seasonal"),
            (dec.resid, "Residual"),
        )
        for ax, (series, title) in zip(axes, panels):
            series.plot(ax=ax, legend=False)
            ax.set_title(title)
        plt.tight_layout()
        plt.savefig(os.path.join(VIS_DIR, f"{symbol}_decomposition.png"))
    except Exception as e:
        print(f"Skipping decomposition for {symbol}: {e}")
    finally:
        # Close the figure even when plotting/saving failed, so a skipped
        # coin does not leave an open figure behind.
        if fig is not None:
            plt.close(fig)

print("Data preprocessing & visualizations complete!")
