In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
"""
train_lstm_crypto_colab.py

Trains per-coin LSTM models using CSVs stored in Google Drive.

Folder structure in Drive:
MyDrive/infosys/
    daily/   -> daily CSVs (per-coin)
    hourly/  -> hourly CSVs (per-coin)
    outputs/ -> results saved here
"""

import argparse
import os
import sys
import glob
import re
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from joblib import dump

import tensorflow as tf
print("Num GPUs Available:", len(tf.config.list_physical_devices('GPU')))
print("GPU Details:", tf.config.list_physical_devices('GPU'))
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

# -----------------------------
# Helpers
# -----------------------------

TIMESTAMP_CANDIDATES = ["timestamp","date","datetime","time"]
TARGET_CANDIDATES    = ["close","adj close","price","close_price","closing price","close*"]

def find_first_column(df: pd.DataFrame, candidates):
    cols_lower = {c.lower(): c for c in df.columns}
    for name in candidates:
        if name in cols_lower:
            return cols_lower[name]
    for name in candidates:
        base = name.replace("*","").strip()
        for c in df.columns:
            if c.lower().startswith(base):
                return c
    return None

def coin_from_filename(path: str) -> str:
    name = os.path.splitext(os.path.basename(path))[0]
    name = re.sub(r'(_daily|_hourly)$', "", name, flags=re.IGNORECASE)
    return name.upper()

def read_series(path: str, timestamp_col=None, target_col=None):
    df = pd.read_csv(path)
    if timestamp_col is None:
        timestamp_col = find_first_column(df, TIMESTAMP_CANDIDATES)
    if target_col is None:
        target_col = find_first_column(df, TARGET_CANDIDATES)
    if timestamp_col is None or target_col is None:
        raise ValueError(f"Could not auto-detect timestamp/target in {path}. Columns={list(df.columns)}")
    df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce', infer_datetime_format=True)
    df = df.dropna(subset=[timestamp_col])
    df = df.sort_values(timestamp_col).reset_index(drop=True)
    s = df[target_col].astype(float).values.reshape(-1,1)
    ts = df[timestamp_col].values
    return ts, s, timestamp_col, target_col

def make_windows(series: np.ndarray, window: int, horizon: int):
    X, y = [], []
    for i in range(len(series) - window - horizon + 1):
        X.append(series[i:i+window])
        y.append(series[i+window:i+window+horizon].ravel())
    return np.array(X), np.array(y)

def train_val_test_split(X, y, val_size=0.15, test_size=0.15):
    n = len(X)
    n_test = int(n * test_size)
    n_val  = int((n - n_test) * val_size)
    train_end = n - n_test - n_val
    val_end   = n - n_test
    return X[:train_end], y[:train_end], X[train_end:val_end], y[train_end:val_end], X[val_end:], y[val_end:]

def build_model(window: int, features: int, horizon: int, lr: float = 1e-3):
    model = Sequential([
        Input(shape=(window, features)),
        LSTM(64, return_sequences=True),
        Dropout(0.2),
        LSTM(32),
        Dense(horizon)
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr), loss="mse")
    return model

def metrics_dict(y_true, y_pred, prefix=""):
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.size == 0:
        return {f"{prefix}RMSE": float("nan"),
                f"{prefix}MAE": float("nan"),
                f"{prefix}MAPE": float("nan")}
    mse = np.mean((y_true - y_pred) ** 2)
    rmse = float(np.sqrt(mse))
    mae = float(np.mean(np.abs(y_true - y_pred)))
    with np.errstate(divide="ignore", invalid="ignore"):
        denom = np.clip(np.abs(y_true), 1e-8, None)
        mape = float(np.mean(np.abs((y_true - y_pred) / denom)) * 100.0)
    return {f"{prefix}RMSE": rmse,
            f"{prefix}MAE": mae,
            f"{prefix}MAPE": mape}

def ensure_dir(path):
    os.makedirs(path, exist_ok=True)

# -----------------------------
# Training routine
# -----------------------------

def train_for_folder(data_dir: str, freq: str, args):
    if not os.path.isdir(data_dir):
        print(f"[{freq}] Directory not found: {data_dir}")
        return []

    files = sorted(glob.glob(os.path.join(data_dir, args.pattern)))
    if not files:
        print(f"[{freq}] No CSV files found in {data_dir}")
        return []

    out_model_dir = os.path.join(args.output_dir, "models", freq)
    out_scaler_dir = os.path.join(args.output_dir, "scalers", freq)
    out_pred_dir = os.path.join(args.output_dir, "predictions", freq)
    ensure_dir(out_model_dir)
    ensure_dir(out_scaler_dir)
    ensure_dir(out_pred_dir)

    rows = []
    for f in files:
        coin = coin_from_filename(f)
        print(f"[{freq}] Training {coin}...")

        try:
            ts, s, tcol, ycol = read_series(f, args.timestamp_col, args.target_col)
        except Exception as e:
            print(f"[{freq}] {coin}: ERROR reading file -> {e}")
            continue

        scaler = MinMaxScaler()
        s_scaled = scaler.fit_transform(s)

        X, y = make_windows(s_scaled, args.window, args.horizon)
        if len(X) < 10:
            print(f"[{freq}] {coin}: not enough data. Skipping.")
            continue

        X_train, y_train, X_val, y_val, X_test, y_test = train_val_test_split(X, y, args.val_size, args.test_size)

        tf.keras.backend.clear_session()
        model = build_model(args.window, X.shape[-1], args.horizon, lr=args.learning_rate)

        ckpt_path = os.path.join(out_model_dir, f"{coin}.keras")

        # --- NEW: check if a saved model already exists ---
        if os.path.exists(ckpt_path):
            print(f"[{freq}] {coin}: Resuming training from saved checkpoint...")
            model = tf.keras.models.load_model(ckpt_path)
        else:
            print(f"[{freq}] {coin}: Starting new training...")
            model = build_model(args.window, X.shape[-1], args.horizon, lr=args.learning_rate)

        callbacks = [
            EarlyStopping(monitor="val_loss", patience=args.patience, restore_best_weights=True),
            ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=max(2, args.patience//2), verbose=1),
            ModelCheckpoint(ckpt_path, monitor="val_loss", save_best_only=True, save_freq="epoch")
        ]

        history = model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=args.epochs,
            batch_size=args.batch_size,
            verbose=1,
            callbacks=callbacks
        )

        def inv(pred):
            return np.array([scaler.inverse_transform(row.reshape(-1,1)).ravel() for row in pred])

        y_pred_val = inv(model.predict(X_val, verbose=0))
        y_pred_test = inv(model.predict(X_test, verbose=0))
        y_val_inv, y_test_inv = inv(y_val), inv(y_test)

        m_val  = metrics_dict(y_val_inv[:,0], y_pred_val[:,0], prefix="val_")
        m_test = metrics_dict(y_test_inv[:,0], y_pred_test[:,0], prefix="test_")

        row = {"freq": freq, "coin": coin, "window": args.window, "horizon": args.horizon}
        row.update(m_val)
        row.update(m_test)
        rows.append(row)

        dump(scaler, os.path.join(out_scaler_dir, f"{coin}.joblib"))
        pred_df = pd.DataFrame({
            "set": ["val"]*len(y_val_inv) + ["test"]*len(y_test_inv),
            "y_true": np.concatenate([y_val_inv[:,0], y_test_inv[:,0]]),
            "y_pred": np.concatenate([y_pred_val[:,0], y_pred_test[:,0]])
        })
        pred_df.to_csv(os.path.join(out_pred_dir, f"{coin}.csv"), index=False)

        print(f"[{freq}] {coin}: done. val_RMSE={m_val['val_RMSE']:.4f} test_RMSE={m_test['test_RMSE']:.4f}")

    if rows:
        metrics_dir = os.path.join(args.output_dir, "metrics", freq)
        ensure_dir(metrics_dir)
        pd.DataFrame(rows).to_csv(os.path.join(metrics_dir, "metrics.csv"), index=False)
        print(f"[{freq}] Metrics saved -> {metrics_dir}/metrics.csv")

# -----------------------------
# Main
# -----------------------------

def parse_args():
    p = argparse.ArgumentParser()
    # In Colab: default base = /content/drive/MyDrive/infosys
    base = "/content/drive/MyDrive/infosys"
    p.add_argument("--daily_dir", type=str, default=os.path.join(base, "daily"))
    p.add_argument("--hourly_dir", type=str, default=os.path.join(base, "hourly"))
    p.add_argument("--output_dir", type=str, default=os.path.join(base, "outputs"))
    p.add_argument("--pattern", type=str, default="*.csv")
    p.add_argument("--timestamp_col", type=str, default=None)
    p.add_argument("--target_col", type=str, default=None)
    p.add_argument("--window", type=int, default=60)
    p.add_argument("--horizon", type=int, default=1)
    p.add_argument("--val_size", type=float, default=0.15)
    p.add_argument("--test_size", type=float, default=0.15)
    p.add_argument("--epochs", type=int, default=50)
    p.add_argument("--batch_size", type=int, default=64)
    p.add_argument("--learning_rate", type=float, default=1e-3)
    p.add_argument("--patience", type=int, default=5)
    return p.parse_args(args=[])

def main():
    args = parse_args()
    ensure_dir(args.output_dir)

    print("=== LSTM Crypto Training (Colab) ===")
    print(f"daily_dir  : {args.daily_dir}")
    print(f"hourly_dir : {args.hourly_dir}")
    print(f"output_dir : {args.output_dir}")

    train_for_folder(args.daily_dir, "daily", args)
    train_for_folder(args.hourly_dir, "hourly", args)

    print("Done.")

if __name__ == "__main__":
    main()


Num GPUs Available: 1
GPU Details: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
=== LSTM Crypto Training (Colab) ===
daily_dir  : /content/drive/MyDrive/infosys/daily
hourly_dir : /content/drive/MyDrive/infosys/hourly
output_dir : /content/drive/MyDrive/infosys/outputs
[daily] Training ALL_CRYPTO_INR_HOURLY_MERGED...
[daily] ALL_CRYPTO_INR_HOURLY_MERGED: Resuming training from saved checkpoint...
Epoch 1/50
[1m357/357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 15ms/step - loss: 0.0017 - val_loss: 0.0056 - learning_rate: 3.1250e-05
Epoch 2/50
[1m357/357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - loss: 0.0017 - val_loss: 0.0056 - learning_rate: 3.1250e-05
Epoch 3/50
[1m354/357[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 8ms/step - loss: 0.0017
Epoch 3: ReduceLROnPlateau reducing learning rate to 1.5625000742147677e-05.
[1m357/357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - loss: 0.0017 - val_loss: 0