In [None]:
# Cell 1: Install all dependencies
# Installs the third-party packages that the import cell below relies on.
# NOTE(review): no versions are pinned, so the resolved releases (and therefore
# the available APIs) depend on when this cell is run.
!pip install -q yfinance statsmodels xgboost optuna nltk seaborn requests autogluon.tabular
# Download NLTK resources; the VADER lexicon backs SentimentIntensityAnalyzer.
# NOTE(review): punkt / stopwords are not referenced by the cells visible here.
!python -m nltk.downloader -q vader_lexicon punkt stopwords


In [None]:
# Cell 2: Imports & Utility Functions
import os, re, requests
from datetime import datetime
import numpy as np, pandas as pd, yfinance as yf
import matplotlib.pyplot as plt, seaborn as sns
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error,
    accuracy_score, f1_score, roc_auc_score
)
from statsmodels.tsa.arima.model import ARIMA
from xgboost import XGBRegressor
from autogluon.tabular import TabularPredictor
import optuna
import torch, torch.nn as nn
from torch.cuda.amp import autocast, GradScaler
from torch.utils.data import DataLoader, TensorDataset

# Shared plot theme for every figure drawn by the later cells.
sns.set(style="whitegrid")

# Run the torch model on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print("Using device:", DEVICE)


In [None]:
# Cell 3: Data‐Collection Functions & Caching
def load_or_fetch_csv(fname, fetch_func, parse_dates=None, force_fetch=False, **kw):
    """Return a DataFrame from a CSV cache, fetching and caching it when needed.

    Parameters
    ----------
    fname : str
        Path of the CSV cache file.
    fetch_func : callable
        Called as ``fetch_func(**kw)`` when the cache is missing or
        ``force_fetch`` is true; must return a DataFrame.
    parse_dates : list of str, optional
        Columns to parse as datetimes. Forwarded to ``pd.read_csv`` on a cache
        hit and applied to freshly fetched data, so both paths return the same
        dtypes.
    force_fetch : bool
        Ignore an existing cache file and call ``fetch_func`` again.
    **kw
        Keyword arguments forwarded to ``fetch_func``.

    Returns
    -------
    pandas.DataFrame
        Cached or freshly fetched data. An empty fetch result is returned
        as-is and is not written to disk.
    """
    if os.path.exists(fname) and not force_fetch:
        return pd.read_csv(fname, parse_dates=parse_dates)

    df = fetch_func(**kw)
    if df.empty:
        # Never cache an empty result: the next run should retry the fetch.
        return df

    # Mirror what read_csv(parse_dates=...) does on the cached path; otherwise
    # the first run and every later run hand different dtypes to the callers.
    if isinstance(parse_dates, (list, tuple)):
        df = df.copy()
        for col in parse_dates:
            if col in df.columns:
                df[col] = pd.to_datetime(df[col])

    df.to_csv(fname, index=False)
    return df

def get_cc_data(fsym="BTC", tsym="USD", limit=2000, api_key=None):
    """Fetch daily price/volume history from the CryptoCompare ``histoday`` endpoint.

    Parameters
    ----------
    fsym, tsym : str
        Sent as the ``fsym`` / ``tsym`` query parameters.
    limit : int
        Sent as the ``limit`` query parameter.
    api_key : str, optional
        When given, sent in the ``authorization`` header as ``Apikey <key>``.

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` (from the ``time`` epoch seconds), ``price`` (the
        ``close`` field) and ``volume`` (the ``volumeto`` field). Empty, with
        those columns, when the payload contains no rows.
    """
    columns = ["date", "price", "volume"]
    url = "https://min-api.cryptocompare.com/data/v2/histoday"
    params = {"fsym": fsym, "tsym": tsym, "limit": limit}
    headers = {"authorization": f"Apikey {api_key}"} if api_key else {}

    # A timeout keeps a stalled connection from hanging the notebook forever.
    resp = requests.get(url, params=params, headers=headers, timeout=30)
    rows = resp.json().get("Data", {}).get("Data", [])
    if not rows:
        # Without this guard an empty payload crashes on the missing `time`
        # column; an empty frame lets load_or_fetch_csv skip caching instead.
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(rows)
    df["date"] = pd.to_datetime(df["time"], unit="s")
    df = df.rename(columns={"close": "price", "volumeto": "volume"})
    return df[columns]

def get_cp_news(api_key, currency="BTC"):
    """Fetch public CryptoPanic posts for one currency.

    Parameters
    ----------
    api_key : str
        Sent as the ``auth_token`` query parameter.
    currency : str
        Sent as the ``currencies`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per post with ``date`` (a ``datetime.date`` taken from
        ``published_at``) and ``title``. The two columns are present even when
        no posts are returned, so downstream ``df['title']`` lookups do not fail.
    """
    url = "https://cryptopanic.com/api/v1/posts/"
    params = {"auth_token": api_key, "public": "true", "currencies": currency}

    # A timeout keeps a stalled connection from hanging the notebook forever.
    resp = requests.get(url, params=params, timeout=30)
    posts = resp.json().get("results", [])

    records = [
        {"date": pd.to_datetime(post["published_at"]).date(), "title": post["title"]}
        for post in posts
    ]
    # Passing `columns` keeps the schema stable for an empty result.
    return pd.DataFrame(records, columns=["date", "title"])

def get_guardian(api_key, max_pages=5):
    """Collect Guardian ``us-news`` headlines, page by page.

    Requests pages 1..``max_pages`` (50 items each) from the content search
    endpoint and stops early at the first page that yields no results. HTTP
    status is intentionally not checked so that a failing page simply ends the
    collection and the pages gathered so far are still returned.

    Parameters
    ----------
    api_key : str
        Sent as the ``api-key`` query parameter.
    max_pages : int
        Upper bound on the number of pages requested.

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` (datetime, from the first 10 characters of
        ``webPublicationDate``) and ``title``. Both columns exist even when
        nothing was collected.
    """
    records = []
    for page in range(1, max_pages + 1):
        params = {"api-key": api_key, "section": "us-news", "page": page,
                  "page-size": 50, "show-fields": "headline"}
        # A timeout keeps a stalled connection from hanging the notebook forever.
        resp = requests.get("https://content.guardianapis.com/search",
                            params=params, timeout=30)
        results = resp.json().get("response", {}).get("results", [])
        if not results:
            break
        for item in results:
            # Fall back to `webTitle` when an item carries no `fields.headline`
            # instead of aborting the whole fetch with a KeyError.
            headline = (item.get("fields") or {}).get("headline") or item.get("webTitle", "")
            records.append({
                "date": item.get("webPublicationDate", "")[:10],
                "title": headline,
            })

    out = pd.DataFrame(records, columns=["date", "title"])
    # Convert once for the whole column; empty strings become NaT.
    out["date"] = pd.to_datetime(out["date"])
    return out

def get_world_bank(country="USA", indicator="FP.CPI.TOTL.ZG", date_range="2010:2024"):
    """Fetch a yearly World Bank indicator series.

    Parameters
    ----------
    country : str
        Country code placed in the URL path.
    indicator : str
        Indicator code placed in the URL path.
    date_range : str
        Sent as the ``date`` query parameter (``"start:end"`` years).

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` (datetime parsed from the year string) and ``value``.
        Empty, with those columns, when the response has no data section.
    """
    columns = ["date", "value"]
    # HTTPS instead of plain HTTP so the request is not sent in the clear.
    url = f"https://api.worldbank.org/v2/country/{country}/indicator/{indicator}"
    params = {"format": "json", "date": date_range, "per_page": 200}

    # A timeout keeps a stalled connection from hanging the notebook forever.
    resp = requests.get(url, params=params, timeout=30)
    payload = resp.json()

    # The observations are read from the second element of the response list.
    # Guard against it being absent, null or empty; otherwise the `date`
    # lookup below raises a KeyError.
    if isinstance(payload, list) and len(payload) > 1 and payload[1]:
        df = pd.DataFrame(payload[1])
        df["date"] = pd.to_datetime(df["date"], format="%Y")
        return df[columns]
    return pd.DataFrame(columns=columns)

# API keys from Colab userdata (the notebook's secret store), so no key is
# hard-coded in the notebook itself.
from google.colab import userdata
CC_KEY = userdata.get('CC_KEY')
CP_KEY = userdata.get('CP_KEY')
GN_KEY = userdata.get('GN_KEY')

# Fetch & cache
# Every cached source that has a `date` column is read back with
# parse_dates=["date"]. Without it the cached inflation frame comes back with
# string dates and preprocess_macro's asfreq("YS") fails on the second run.
market_df    = load_or_fetch_csv("market.csv",      get_cc_data,    parse_dates=["date"], api_key=CC_KEY)
# CryptoPanic is always re-fetched (force_fetch=True), so the cache is never read.
cp_df        = load_or_fetch_csv("cryptopanic.csv", get_cp_news,    api_key=CP_KEY, force_fetch=True)
guardian_df  = load_or_fetch_csv("guardian.csv",    get_guardian,   parse_dates=["date"], api_key=GN_KEY)
inflation_df = load_or_fetch_csv("inflation.csv",   get_world_bank, parse_dates=["date"])

print("Market:", market_df.shape, "CryptoPanic:", cp_df.shape,
      "Guardian:", guardian_df.shape, "Inflation:", inflation_df.shape)


In [None]:
# Cell 4: Preprocessing & Sentiment
def preprocess_market(df):
    """Put the market frame on a gap-free daily calendar.

    Rows are ordered by ``date``, re-indexed to daily frequency, and any day
    introduced by the re-indexing takes the previous day's values. ``date`` is
    returned as an ordinary column again.
    """
    daily = df.sort_values("date").set_index("date").asfreq("D")
    filled = daily.ffill()
    return filled.reset_index()

def preprocess_macro(df):
    """Put a yearly macro series on a gap-free year-start calendar.

    Rows containing missing values are discarded first; the remainder is
    ordered by ``date``, re-indexed to year-start frequency, and years created
    by the re-indexing take the previous year's values. ``date`` is returned
    as an ordinary column again.
    """
    complete = df.dropna()
    yearly = complete.sort_values("date").set_index("date").asfreq("YS")
    return yearly.ffill().reset_index()

# Regularised copies of the market and inflation frames.
market_clean = preprocess_market(market_df)
inflation_clean = preprocess_macro(inflation_df)

# Score every headline with VADER and keep only the `compound` score.
sia = SentimentIntensityAnalyzer()
cp_df['sentiment'] = cp_df['title'].map(
    lambda headline: sia.polarity_scores(headline)['compound']
)
guardian_df['sentiment'] = guardian_df['title'].map(
    lambda headline: sia.polarity_scores(headline)['compound']
)

print("Cleaned & Sentimentized")


In [None]:
# Cell 5: Exploratory Analysis
# 5.1 EMA & Seasonality
market_clean['EMA90'] = market_clean['price'].ewm(span=90).mean()

# One panel only: a 1x2 grid would leave the second axes empty, because the
# decomposition below draws into a figure of its own.
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(market_clean.date, market_clean.price, label='Price')
ax.plot(market_clean.date, market_clean.EMA90, label='EMA90')
ax.legend()
ax.set_title("Price vs EMA")

res = seasonal_decompose(market_clean.set_index("date")['price'], model='additive', period=365)
# Title the decomposition figure explicitly rather than relying on whichever
# figure happens to be "current".
season_fig = res.plot()
season_fig.suptitle("Seasonality")
season_fig.tight_layout()

# 5.2 RSI & Volatility
# RSI from 14-day simple moving averages of gains and losses.
delta = market_clean.price.diff()
gain = delta.clip(lower=0)
loss = -delta.clip(upper=0)
avg_gain = gain.rolling(14).mean()
avg_loss = loss.rolling(14).mean()
rs = avg_gain / avg_loss
market_clean['RSI'] = 100 - (100 / (1 + rs))
# 7-day rolling standard deviation of the daily price change.
market_clean['vol7'] = delta.rolling(7).std()
plt.figure(figsize=(6, 3))
plt.plot(market_clean.date, market_clean.RSI)
plt.title("RSI")

# 5.3 News vs Price
daily_guardian = guardian_df.groupby('date')['sentiment'].mean().reset_index()
mrg = pd.merge(market_clean, daily_guardian, on='date', how='inner')

# Sentiment gets its own y-axis: VADER compound scores are bounded to a
# narrow range, so on the price axis they would collapse into a flat line.
news_fig, price_ax = plt.subplots(figsize=(6, 3))
price_line, = price_ax.plot(mrg.date, mrg.price, label='Price')
sent_ax = price_ax.twinx()
sent_line, = sent_ax.plot(mrg.date, mrg.sentiment, color='tab:orange', label='Sentiment')
price_ax.legend(handles=[price_line, sent_line])
price_ax.set_title("Price vs Guardian Sentiment")


In [None]:
# Cell 6: Feature Engineering for Modeling
# Model inputs, in the column order used by every downstream model.
features = ['Close','volume','EMA90','RSI','vol7','Sentiment','TxCount','ActiveAddresses','Lag1','Lag7','Volatility7']

# Mean Guardian sentiment per day, looked up by market date (0 where no news).
sent = guardian_df.groupby('date')['sentiment'].mean()

# Start from the market signals, with price renamed to Close.
df = (
    market_clean[['date','price','volume','EMA90','RSI','vol7']]
    .copy()
    .rename(columns={'price':'Close'})
    .assign(Sentiment=lambda frame: frame['date'].map(sent).fillna(0))
)

# Stub blockchain metrics: seeded random integers standing in for real
# on-chain data (draw order matters for reproducibility).
np.random.seed(0)
df['TxCount'] = np.random.randint(10000,30000,len(df))
df['ActiveAddresses'] = np.random.randint(5000,20000,len(df))

# Targets & lags. Each lambda sees the columns created before it.
df = df.assign(
    Return=lambda frame: frame['Close'].pct_change(),
    Lag1=lambda frame: frame['Return'].shift(1),
    Lag7=lambda frame: frame['Return'].shift(7),
    Volatility7=lambda frame: frame['Return'].rolling(7).std(),
    Direction=lambda frame: (frame['Return']>0).astype(int),
    VolNext=lambda frame: frame['Volatility7'].shift(-1),
)

# Drop rows made incomplete by the rolling windows / shifts, then index by date.
df = df.dropna().set_index('date')


In [None]:
# Cell 7: Auto‑ARIMA via Optuna
# Number of trailing days held out for evaluating the forecast.
ARIMA_HOLDOUT = 30

ts = df['Close'].asfreq('D')
# Hold the last ARIMA_HOLDOUT days out of fitting. Fitting on the full series
# and then forecasting would produce predictions for days *after* the data
# ends, which cannot be scored against the last 30 observed days.
ts_train = ts.iloc[:-ARIMA_HOLDOUT]
ts_test = ts.iloc[-ARIMA_HOLDOUT:]

def arima_obj(trial):
    """Optuna objective: AIC of an ARIMA(p, d, q) with linear trend on the training span.

    Orders that fail to fit return a large penalty (1e10) so the study skips
    them instead of aborting.
    """
    p = trial.suggest_int("p", 0, 5)
    d = trial.suggest_int("d", 0, 2)
    q = trial.suggest_int("q", 0, 5)
    try:
        return ARIMA(ts_train, order=(p, d, q), trend='t').fit().aic
    except Exception:
        # `except Exception` (not a bare except) so Ctrl-C still interrupts.
        return 1e10

study = optuna.create_study(direction='minimize')
study.optimize(arima_obj, n_trials=25)
best = study.best_params
order = (best['p'], best['d'], best['q'])
print("Best order:", order)

model_arima = ARIMA(ts_train, order=order, trend='t').fit()
fc = model_arima.get_forecast(ARIMA_HOLDOUT)
pred_arima, ci = fc.predicted_mean, fc.conf_int()

# Forecast dates now coincide with the held-out dates.
arima_mae = mean_absolute_error(ts_test, pred_arima)
# np.sqrt(MSE) instead of `squared=False`, which newer scikit-learn releases
# no longer accept.
arima_rmse = np.sqrt(mean_squared_error(ts_test, pred_arima))


In [None]:
# Cell 8: XGBoost w/ Rolling CV + Optuna HPO
# prepare X,y: features of day t predict Close of day t+1.
X = df[features].values
y = df['Close'].shift(-1).dropna().values
X = X[:-1]  # align: the last row has no next-day target

# Chronological 80/20 split. Tuning only ever sees the first 80 %; running
# the CV over all rows would let the held-out test period pick the
# hyperparameters.
split = int(0.8 * len(X))
X_tune, y_tune = X[:split], y[:split]
tscv = TimeSeriesSplit(n_splits=4)

# `gpu_hist` is deprecated; the histogram method plus a `device` selects GPU/CPU.
xgb_device_params = {
    'tree_method': 'hist',
    'device': 'cuda' if DEVICE.type == 'cuda' else 'cpu',
}

# Optuna tuning
def xgb_obj(tr):
    """Optuna objective: mean RMSE over expanding-window folds of the training span."""
    params = {
        'n_estimators': tr.suggest_int('n_estimators', 50, 300),
        'max_depth': tr.suggest_int('max_depth', 3, 12),
        # suggest_float replaces the deprecated suggest_loguniform / suggest_uniform.
        'learning_rate': tr.suggest_float('learning_rate', 1e-3, 1e-1, log=True),
        'subsample': tr.suggest_float('subsample', 0.6, 1),
        'colsample_bytree': tr.suggest_float('colsample_bytree', 0.6, 1),
        **xgb_device_params,
    }
    rmses = []
    for tr_idx, val_idx in tscv.split(X_tune):
        m = XGBRegressor(**params).fit(X_tune[tr_idx], y_tune[tr_idx])
        p = m.predict(X_tune[val_idx])
        # np.sqrt(MSE) instead of `squared=False`, which newer scikit-learn
        # releases no longer accept.
        rmses.append(np.sqrt(mean_squared_error(y_tune[val_idx], p)))
    return float(np.mean(rmses))

st2 = optuna.create_study(direction='minimize')
st2.optimize(xgb_obj, n_trials=20)
xgb_params = {**st2.best_params, **xgb_device_params}

# final eval on the untouched last 20 %
xgb_final = XGBRegressor(**xgb_params).fit(X[:split], y[:split])
pred_xgb = xgb_final.predict(X[split:])
xgb_opt_mae = mean_absolute_error(y[split:], pred_xgb)
xgb_opt_rmse = np.sqrt(mean_squared_error(y[split:], pred_xgb))


In [None]:
# Cell 9 (improved): LSTM with target‑scaling
import numpy as np
import torch, torch.nn as nn
from torch.cuda.amp        import autocast, GradScaler
from torch.utils.data      import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics       import mean_absolute_error, mean_squared_error, accuracy_score, f1_score, roc_auc_score

# —— 1) Scale features + target ——
SEQ_LEN = 20       # days of history per input window
TRAIN_FRAC = 0.8   # chronological share of windows used for training
torch.manual_seed(0)  # reproducible weight initialisation

raw_X = df[features].values
# Unshifted Close: mk_seq already pairs a window with the row that follows
# it, so pre-shifting the target as well would predict two days ahead.
y_all = df["Close"].values.reshape(-1, 1)

# Decide the split first so the scalers can be fitted on training rows only.
n = len(raw_X) - SEQ_LEN      # number of windows mk_seq will produce
s = int(TRAIN_FRAC * n)       # first test window
fit_rows = s + SEQ_LEN        # rows touched by training windows and their targets

feat_scaler = StandardScaler().fit(raw_X[:fit_rows])
tgt_scaler = StandardScaler().fit(y_all[:fit_rows])
X_all = feat_scaler.transform(raw_X)
y_scaled = tgt_scaler.transform(y_all).flatten()

# —— 2) Create sequences ——
def mk_seq(arr: np.ndarray, tgt: np.ndarray, L: int = 20):
    """Build sliding windows: rows ``i .. i+L-1`` of ``arr`` paired with ``tgt[i+L]``.

    Returns ``(windows, targets)`` as arrays with ``len(arr) - L`` entries.
    """
    Xs, ys = [], []
    for i in range(len(arr) - L):
        Xs.append(arr[i : i + L])
        ys.append(tgt[i + L])
    return np.array(Xs), np.array(ys)

Xseq, Yseq = mk_seq(X_all, y_scaled, L=SEQ_LEN)
assert len(Xseq) == n, "window count must match the split computed above"

# —— 3) Train/test split ——
Xtr, Xte = Xseq[:s], Xseq[s:]
ytr, yte = Yseq[:s], Yseq[s:]

# to tensors
Xt = torch.tensor(Xtr, dtype=torch.float32).to(DEVICE)
yt = torch.tensor(ytr, dtype=torch.float32).view(-1,1).to(DEVICE)
Xv = torch.tensor(Xte, dtype=torch.float32).to(DEVICE)
yv = torch.tensor(yte, dtype=torch.float32).view(-1,1).to(DEVICE)

# —— 4) Define model ——
class LSTMReg(nn.Module):
    """Single-layer LSTM whose final hidden state feeds a linear output unit."""

    def __init__(self, n_feats: int, hidden: int = 64):
        super().__init__()
        self.lstm = nn.LSTM(n_feats, hidden, batch_first=True)
        self.fc   = nn.Linear(hidden, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        _, (h, _) = self.lstm(x)
        return self.fc(h[-1])

lstm = LSTMReg(Xtr.shape[2]).to(DEVICE)
opt   = torch.optim.Adam(lstm.parameters(), lr=1e-3)
# Mixed precision only makes sense on CUDA; disable it cleanly on CPU.
use_amp = DEVICE.type == "cuda"
scaler_amp = GradScaler(enabled=use_amp)
criterion = nn.MSELoss()

# —— 5) Train ——
loader = DataLoader(TensorDataset(Xt, yt), batch_size=64, shuffle=False)
for ep in range(15):
    lstm.train()
    total_loss = 0.0
    for xb, yb in loader:
        opt.zero_grad()
        with autocast(enabled=use_amp):
            out   = lstm(xb)
            loss  = criterion(out, yb)
        scaler_amp.scale(loss).backward()
        scaler_amp.step(opt)
        scaler_amp.update()
        total_loss += loss.item() * xb.size(0)
    print(f"Epoch {ep+1:02d}, train MSE loss = {total_loss/len(loader.dataset):.4f}")

# —— 6) Predict & inverse‑scale ——
lstm.eval()
with torch.no_grad():
    pred_scaled = lstm(Xv).cpu().numpy().flatten()

# back to USD
pred_lstm = tgt_scaler.inverse_transform(pred_scaled.reshape(-1,1)).flatten()
actual_lstm = tgt_scaler.inverse_transform(yte.reshape(-1,1)).flatten()
# Name used by the plotting cell for the LSTM ground truth.
yte_arr = actual_lstm

# —— 7) Metrics ——
lstm_mae  = mean_absolute_error(actual_lstm, pred_lstm)
# np.sqrt(MSE) instead of `squared=False`, which newer scikit-learn releases
# no longer accept.
lstm_rmse = np.sqrt(mean_squared_error(actual_lstm, pred_lstm))
print(f"LSTM Price → MAE: {lstm_mae:.2f}, RMSE: {lstm_rmse:.2f}")

# direction metrics (aligned): sign of the day-over-day change in each series
n2 = min(len(actual_lstm), len(pred_lstm))
dp = (pred_lstm[:n2][1:] > pred_lstm[:n2][:-1]).astype(int)
da = (actual_lstm[:n2][1:] > actual_lstm[:n2][:-1]).astype(int)
# Stored (not just printed) because the comparison table reads these names.
lstm_dir_acc = accuracy_score(da, dp)
lstm_dir_f1  = f1_score(da, dp)
print("LSTM Direction Acc:", lstm_dir_acc)
print("LSTM Direction F1: ",    lstm_dir_f1)
print("LSTM Direction ROC:",    roc_auc_score(da, dp))


In [None]:
# Cell 10 (fast‐train + GPU): AutoGluon Baseline
from autogluon.tabular import TabularPredictor
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score, f1_score

# avoid duplicating "Close" in the column selection below
# NOTE(review): because the models are trained on `ag_feats`, they never see
# the current Close, unlike the XGBoost/LSTM inputs. Confirm this is intended.
ag_feats = [f for f in features if f != "Close"]

# assemble DataFrame: next-day price, next-day direction, next-day volatility
ag_df = df[ag_feats + ["Close", "Direction", "VolNext"]].copy()
ag_df["PriceT"] = ag_df["Close"].shift(-1)
ag_df["DirT"]   = ag_df["Direction"].shift(-1)
ag_df["VolT"]   = ag_df["VolNext"]
ag_df.dropna(inplace=True)
ag_df.reset_index(drop=True, inplace=True)

# train/test split (chronological 80/20)
cut = int(0.8 * len(ag_df))
train_ag, test_ag = ag_df.iloc[:cut], ag_df.iloc[cut:]

# common .fit kwargs for speed; request a GPU only when one is present, so
# the cell also runs on CPU-only runtimes.
fit_kwargs = dict(
    time_limit=120,                         # max seconds
    presets='medium_quality_faster_train',  # faster preset
    ag_args_fit={'num_gpus': 1 if DEVICE.type == 'cuda' else 0},
)

# 1) Price regression
predictor_price = TabularPredictor(
    label="PriceT", problem_type="regression",
    eval_metric="mean_absolute_error"
).fit(
    train_data=train_ag[ag_feats + ["PriceT"]],
    **fit_kwargs
)
pred_price_ag = predictor_price.predict(test_ag[ag_feats])
ag_price_mae  = mean_absolute_error(test_ag["PriceT"], pred_price_ag)
# np.sqrt(MSE) instead of `squared=False`, which newer scikit-learn releases
# no longer accept.
ag_price_rmse = np.sqrt(mean_squared_error(test_ag["PriceT"], pred_price_ag))
print(f"AutoGluon Price → MAE: {ag_price_mae:.2f}, RMSE: {ag_price_rmse:.2f}")

# 2) Direction classification
predictor_dir = TabularPredictor(
    label="DirT", problem_type="binary",
    eval_metric="accuracy"
).fit(
    train_data=train_ag[ag_feats + ["DirT"]],
    **fit_kwargs
)
pred_dir_ag = predictor_dir.predict(test_ag[ag_feats])
ag_dir_acc  = accuracy_score(test_ag["DirT"], pred_dir_ag)
ag_dir_f1   = f1_score(test_ag["DirT"], pred_dir_ag)
print(f"AutoGluon Direction → Acc: {ag_dir_acc:.2f}, F1: {ag_dir_f1:.2f}")

# 3) Volatility regression
predictor_vol = TabularPredictor(
    label="VolT", problem_type="regression",
    eval_metric="mean_absolute_error"
).fit(
    train_data=train_ag[ag_feats + ["VolT"]],
    **fit_kwargs
)
pred_vol_ag = predictor_vol.predict(test_ag[ag_feats])
ag_vol_mae  = mean_absolute_error(test_ag["VolT"], pred_vol_ag)
ag_vol_rmse = np.sqrt(mean_squared_error(test_ag["VolT"], pred_vol_ag))
print(f"AutoGluon Volatility → MAE: {ag_vol_mae:.4f}, RMSE: {ag_vol_rmse:.4f}")


In [None]:
# Cell 11 (fixed): Weighted Ensemble + AutoGluon Stacking (with proper alignment)
from autogluon.tabular import TabularPredictor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# — 1) Weighted average of XGB + LSTM —

# weights inversely proportional to RMSE
# NOTE(review): both RMSEs are measured on the test period, so the weights
# are chosen with knowledge of the data the ensemble is then scored on.
w1, w2 = 1 / xgb_opt_rmse, 1 / lstm_rmse

# true values for XGB come from y[split:], preds from pred_xgb
y_true_xgb = y[split:]

# find minimum length across the three series
n_ens = min(len(y_true_xgb), len(pred_xgb), len(pred_lstm))

# Align on the END of the series. Both test sets finish on the final target
# day but start on different days (the LSTM loses rows to its input
# windows), so cropping from the front would average predictions made for
# different dates.
y_true_crop  = y_true_xgb[-n_ens:]
px           = pred_xgb[-n_ens:]
pl           = pred_lstm[-n_ens:]

# compute weighted ensemble
pred_ens     = (w1 * px + w2 * pl) / (w1 + w2)
ens_mae      = mean_absolute_error(y_true_crop, pred_ens)
# np.sqrt(MSE) instead of `squared=False`, which newer scikit-learn releases
# no longer accept.
ens_rmse     = np.sqrt(mean_squared_error(y_true_crop, pred_ens))

print(f"Weighted Ensemble → MAE: {ens_mae:.2f}, RMSE: {ens_rmse:.2f}")


In [None]:
# — 2) AutoGluon 1‑level stacking —

stack_predictor = TabularPredictor(
    label="PriceT", problem_type="regression"
).fit(
    train_data=train_ag[ag_feats + ["PriceT"]],
    presets="best_quality",
    time_limit=120,
    # The stacking depth is a top-level fit() argument; placed inside
    # `ag_args_fit` under another name it does not configure stacking.
    num_stack_levels=1,
    # Request a GPU only when one is present so CPU-only runtimes still work.
    ag_args_fit={"num_gpus": 1 if DEVICE.type == "cuda" else 0},
)

stack_pred = stack_predictor.predict(test_ag[ag_feats])
stack_mae  = mean_absolute_error(test_ag["PriceT"], stack_pred)
# np.sqrt(MSE) instead of `squared=False`, which newer scikit-learn releases
# no longer accept.
stack_rmse = np.sqrt(mean_squared_error(test_ag["PriceT"], stack_pred))

print(f"AutoGluon Stacked → MAE: {stack_mae:.2f}, RMSE: {stack_rmse:.2f}")


In [None]:
# Cell 12: Compare All Models
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# The LSTM cell only prints its direction metrics, so recompute them here
# from its direction vectors (`da` = actual, `dp` = predicted) rather than
# depending on names that may not exist.
lstm_dir_acc = accuracy_score(da, dp)
lstm_dir_f1  = f1_score(da, dp)

# One row per model: [name, MAE, RMSE, direction accuracy, direction F1];
# NaN marks a metric that does not apply to that model.
rows = [
  ["ARIMA",          arima_mae,  arima_rmse,   np.nan,       np.nan],
  ["XGBoost(opt)",   xgb_opt_mae,xgb_opt_rmse, np.nan,       np.nan],
  ["LSTM",           lstm_mae,   lstm_rmse,    lstm_dir_acc, lstm_dir_f1],
  ["Ensemble(X+L)",  ens_mae,    ens_rmse,     np.nan,       np.nan],
  ["AutoGluonPrice", ag_price_mae,ag_price_rmse,np.nan,      np.nan],
  ["AutoGluonDir",   np.nan,     np.nan,       ag_dir_acc,   ag_dir_f1],
  ["AutoGluonVol",   ag_vol_mae, ag_vol_rmse,  np.nan,       np.nan],
  ["AGStack",        stack_mae,  stack_rmse,   np.nan,       np.nan]
]
df_res = pd.DataFrame(rows, columns=["Model","MAE","RMSE","DirAcc","DirF1"])
print(df_res.to_markdown(index=False))

# NOTE(review): AutoGluonVol's MAE is on the volatility target while the
# other rows are on price, so its bar is not comparable with the rest.
plt.figure(figsize=(8,4))
plt.barh(df_res.Model, df_res.MAE)
plt.xlabel("MAE"); plt.title("Model MAE Comparison"); plt.show()


In [None]:
# Cell 13: Visualize Predictions vs Ground Truth for Each Model
import matplotlib.pyplot as plt

def _plot_pred_vs_actual(actual, pred, pred_label, title, x_actual=None, x_pred=None):
    """Draw one 'prediction vs actual' figure.

    Both series are converted to plain arrays first, so a pandas Series is
    never plotted against its own index. When ``x_actual`` / ``x_pred`` are
    omitted, each series is drawn against positions 0..len-1, which puts the
    two lines on the same x-range.
    """
    actual = np.asarray(actual)
    pred = np.asarray(pred)
    fig, ax = plt.subplots(figsize=(8, 3))
    ax.plot(np.arange(len(actual)) if x_actual is None else x_actual,
            actual, label="Actual")
    ax.plot(np.arange(len(pred)) if x_pred is None else x_pred,
            pred, linestyle="--", label=pred_label)
    ax.set_title(title)
    ax.legend()
    fig.tight_layout()
    plt.show()

# 1) ARIMA (last 30 days), drawn against calendar dates
_plot_pred_vs_actual(
    df["Close"].iloc[-30:], pred_arima, "ARIMA",
    "ARIMA Forecast vs Actual (Last 30 Days)",
    x_actual=df.index[-30:], x_pred=pred_arima.index,
)

# 2) XGBoost (test split)
_plot_pred_vs_actual(y[split:], pred_xgb, "XGBoost", "XGBoost Predictions vs Actual")

# 3) LSTM (test set): `actual_lstm` is the inverse-scaled ground truth
#    produced by the LSTM cell.
_plot_pred_vs_actual(actual_lstm, pred_lstm, "LSTM", "LSTM Predictions vs Actual")

# 4) AutoGluon Price (test_ag): predictions come back as a Series indexed
#    like test_ag, so they must be positional to overlap the actuals.
_plot_pred_vs_actual(test_ag["PriceT"].values, pred_price_ag, "AutoGluon Price",
                     "AutoGluon Price Predictions vs Actual")

# 5) Weighted Ensemble (XGB + LSTM): compare against the same cropped
#    ground truth the ensemble was scored on.
_plot_pred_vs_actual(y_true_crop, pred_ens, "Weighted Ensemble",
                     "Weighted Ensemble Predictions vs Actual")

# 6) AutoGluon Stacked (Price)
_plot_pred_vs_actual(test_ag["PriceT"].values, stack_pred, "AutoGluon Stacked",
                     "AutoGluon Stacked Predictions vs Actual")
