# Cryptocurrency Volatility Prediction

**Notebook**: reproducible pipeline for data, features, EDA, modeling, and deployment snippets.

**Author**: Sanskriti Jaiswal

---

In [None]:
# Basic imports and configuration
import warnings
# NOTE(review): this silences every warning category for the whole session,
# including deprecation warnings from pandas / scikit-learn — consider
# narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

# Display options: let pandas show up to 200 columns before truncating output
pd.set_option('display.max_columns', 200)
# Location of the input CSV read by the "Load data" cell below
DATA_PATH = Path('data/crypto_daily.csv')  # <-- put your CSV here


## 1) Load data
Load dataset (daily OHLCV + market cap).

In [None]:
# Read the CSV at DATA_PATH into `df`; if the file is missing, report it and
# leave `df` undefined so later cells can detect that nothing was loaded.
if not DATA_PATH.exists():
    print(f'File not found: {DATA_PATH}. Please place the CSV at this path and re-run.')
else:
    df = pd.read_csv(DATA_PATH, parse_dates=['date'])
    print('Loaded:', df.shape)
    display(df.head())

## 2) Preprocessing helper functions
- Handle missing values
- Type casting
- Basic cleaning

In [None]:
def preprocess_basic(df):
    """Clean a long-format OHLCV frame and add per-symbol simple returns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``date``, ``symbol``, ``open``, ``high``,
        ``low``, ``close``, ``volume`` and ``market_cap``.

    Returns
    -------
    pandas.DataFrame
        A copy sorted by ``symbol`` then ``date`` with a fresh 0..n-1 index.
        Gaps in the price/volume/market-cap columns are forward-filled and
        then backward-filled within each symbol, and a ``return`` column
        holds the per-symbol percentage change of ``close``. The input frame
        is not modified.
    """
    value_cols = ['open', 'high', 'low', 'close', 'volume', 'market_cap']
    df = df.copy()
    # ensure datetime and sort so fills and returns follow time order per symbol
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values(['symbol', 'date']).reset_index(drop=True)
    # Forward-fill, then backward-fill leading gaps, strictly within each
    # symbol. GroupBy.ffill/bfill return frames aligned to df's own index,
    # whereas the index of a groupby.apply result depends on group_keys, and
    # fillna(method=...) is a deprecated spelling.
    df[value_cols] = df.groupby('symbol')[value_cols].ffill()
    df[value_cols] = df.groupby('symbol')[value_cols].bfill()
    # compute simple returns (first row of every symbol is NaN)
    df['return'] = df.groupby('symbol')['close'].pct_change()
    return df

# Example usage:
# df = preprocess_basic(df)


## 3) Feature engineering
Create volatility- and liquidity-related features (rolling std, ATR, Bollinger Bands, liquidity ratio).

In [None]:
def add_features(df, win=14, lags=(1, 2, 3, 7, 14)):
    """Add volatility, range, band, liquidity, lag and target columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``symbol``, ``return``, ``high``, ``low``, ``close``,
        ``volume`` and ``market_cap``. Rows are assumed to already be in time
        order within each symbol (shifts and rolling windows follow row order).
    win : int, default 14
        Window length, in rows, for every rolling statistic.
    lags : iterable of int, default (1, 2, 3, 7, 14)
        Row offsets for the ``return_lag_<k>`` columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with these columns added: ``vol_rolling_std``,
        ``true_range``, ``atr``, ``ma``, ``bb_std``, ``bb_upper``,
        ``bb_lower``, ``liquidity``, one ``return_lag_<k>`` per lag, and
        ``target_vol_7``. The target is the next row's ``vol_rolling_std``
        within the same symbol; its window is ``win`` rows, not 7, despite
        the column name.
    """
    df = df.copy()

    def _rolling(col, stat):
        # Per-symbol rolling statistic; dropping the symbol level puts the
        # result back on df's row index so it can be assigned as a column.
        windows = df.groupby('symbol')[col].rolling(window=win, min_periods=1)
        return getattr(windows, stat)().reset_index(level=0, drop=True)

    # Rolling std of returns as a volatility proxy
    df['vol_rolling_std'] = _rolling('return', 'std')

    # ATR (Average True Range): true range is the largest of high-low and the
    # distances of high/low from the previous close of the same symbol.
    prev_close = df.groupby('symbol')['close'].shift(1)
    range_candidates = pd.concat(
        [
            df['high'] - df['low'],
            (df['high'] - prev_close).abs(),
            (df['low'] - prev_close).abs(),
        ],
        axis=1,
    )
    df['true_range'] = range_candidates.max(axis=1)
    df['atr'] = _rolling('true_range', 'mean')

    # Bollinger Bands (on close): moving average +/- 2 rolling std
    df['ma'] = _rolling('close', 'mean')
    df['bb_std'] = _rolling('close', 'std')
    df['bb_upper'] = df['ma'] + 2 * df['bb_std']
    df['bb_lower'] = df['ma'] - 2 * df['bb_std']

    # Liquidity ratio; the small epsilon avoids division by a zero market cap
    df['liquidity'] = df['volume'] / (df['market_cap'] + 1e-9)

    # lag features
    for lag in lags:
        df[f'return_lag_{lag}'] = df.groupby('symbol')['return'].shift(lag)

    # target: next row's rolling volatility (for supervised learning)
    df['target_vol_7'] = df.groupby('symbol')['vol_rolling_std'].shift(-1)
    return df

# Example:
# df = add_features(df, win=14)


## 4) Exploratory Data Analysis (EDA)
Plot example symbol, distributions, correlations.

In [None]:
def plot_example_symbol(df, symbol='BTC'):
    """Plot close price, rolling volatility and ATR for one symbol.

    The rows of ``df`` whose ``symbol`` column equals ``symbol`` are indexed
    by ``date`` and drawn on three stacked axes that share the x-axis.
    Requires the ``close``, ``vol_rolling_std`` and ``atr`` columns.
    """
    series = df[df['symbol'] == symbol].copy().set_index('date')
    # (column, title) for each panel, top to bottom
    panels = (
        ('close', f'{symbol} Close Price'),
        ('vol_rolling_std', f'{symbol} Rolling Volatility (std)'),
        ('atr', f'{symbol} ATR'),
    )
    _, axes = plt.subplots(3, 1, figsize=(12, 10), sharex=True)
    for axis, (column, title) in zip(axes, panels):
        series[column].plot(ax=axis, title=title)
    plt.tight_layout()
    plt.show()

# Example usage:
# plot_example_symbol(df, 'BTC')


## 5) Modeling
Time-aware split -> RandomForest baseline -> metrics (RMSE, MAE, R2).

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

def prepare_dataset(df, symbol='BTC', feature_cols=None, target_col='target_vol_7'):
    """Build a chronological 80/20 train/test split for one symbol.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame containing a ``symbol`` column, the feature columns and
        ``target_col``.
    symbol : str, default 'BTC'
        Only rows whose ``symbol`` equals this value are used.
    feature_cols : list of str, optional
        Model inputs. Defaults to ``vol_rolling_std``, ``atr``, ``liquidity``,
        ``ma`` plus every column whose name contains ``return_lag``.
    target_col : str, default 'target_vol_7'
        Column to predict.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; the first 80% of the usable
        rows form the training set and the remainder the test set.
    """
    d = df[df['symbol'] == symbol].copy()
    if feature_cols is None:
        feature_cols = ['vol_rolling_std', 'atr', 'liquidity', 'ma'] + [c for c in d.columns if 'return_lag' in c]
    else:
        feature_cols = list(feature_cols)
    # A time-based split is only meaningful on chronologically ordered rows,
    # so order by date when the column is available instead of trusting the
    # caller's row order.
    if 'date' in d.columns:
        d = d.sort_values('date', kind='stable')
    # Drop rows with a missing target AND rows with missing features (the
    # warm-up rows of rolling/lag columns); estimators without NaN support
    # would otherwise fail at fit time.
    d = d.dropna(subset=feature_cols + [target_col])
    X = d[feature_cols]
    y = d[target_col]
    # time-based split: earliest 80% for training, latest 20% for testing
    split = int(0.8 * len(d))
    X_train, X_test = X.iloc[:split], X.iloc[split:]
    y_train, y_test = y.iloc[:split], y.iloc[split:]
    return X_train, X_test, y_train, y_test

# Example quick baseline training function
def train_rf(X_train, y_train):
    """Fit and return the baseline random-forest regressor.

    Uses 100 trees, a fixed seed (42) and ``n_jobs=-1``.
    """
    forest = RandomForestRegressor(
        n_estimators=100,
        random_state=42,
        n_jobs=-1,
    )
    # fit() returns the fitted estimator itself
    return forest.fit(X_train, y_train)

# Example evaluation
def evaluate_model(model, X_test, y_test):
    """Score a fitted model on held-out data.

    Parameters
    ----------
    model : object
        Any fitted estimator exposing ``predict(X)``.
    X_test, y_test
        Held-out features and the matching true target values.

    Returns
    -------
    dict
        ``{'rmse': ..., 'mae': ..., 'r2': ...}``.
    """
    preds = model.predict(X_test)
    # Take the square root explicitly: the `squared=False` keyword of
    # mean_squared_error is deprecated and absent from recent scikit-learn
    # releases, while this form works on every version.
    rmse = float(np.sqrt(mean_squared_error(y_test, preds)))
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    return {'rmse': rmse, 'mae': mae, 'r2': r2}

# Save model
def save_model(model, path='models/rf_vol_model.pkl'):
    """Serialize ``model`` to ``path`` with joblib, creating parent folders as needed."""
    target_dir = Path(path).parent
    target_dir.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, path)
    print('Saved model to', path)


In [None]:
# Example end-to-end small-run (only runs when data exists)
# Guard first: without a loaded, non-empty `df` there is nothing to train on.
if 'df' not in globals() or df.empty:
    print('Dataset not loaded; skip model run.')
else:
    # clean -> featurize
    df = preprocess_basic(df)
    df = add_features(df, win=14)
    # use the first symbol present in the data as the demo asset
    symbol = df['symbol'].unique()[0]
    X_train, X_test, y_train, y_test = prepare_dataset(df, symbol=symbol)
    print('Train/Test sizes:', X_train.shape, X_test.shape)
    # fit -> score -> persist
    model = train_rf(X_train, y_train)
    metrics = evaluate_model(model, X_test, y_test)
    print('Metrics:', metrics)
    save_model(model, path='models/rf_vol_model.pkl')


## 6) Deployment (Streamlit demo)
A simple Streamlit app snippet that loads the saved model and predicts from user inputs.

In [None]:
# Save this as src/app.py for a simple Streamlit interface
# Source of the Streamlit demo app, kept as a string so a later cell can write
# it to src/app.py. The app asks for one value per model feature: the baseline
# model is trained on the four base features PLUS the return_lag_* columns, so
# sending only the four base features would fail at predict time.
streamlit_app_code = '''import streamlit as st
import pandas as pd
import joblib
from pathlib import Path

# Fallback feature order, used only when the model does not record the column
# names it was fitted with; mirrors the notebook's default training features.
DEFAULT_FEATURES = [
    'vol_rolling_std', 'atr', 'liquidity', 'ma',
    'return_lag_1', 'return_lag_2', 'return_lag_3', 'return_lag_7', 'return_lag_14',
]
# Starting values shown in the sidebar; features not listed here start at 0.0
DEFAULT_VALUES = {'vol_rolling_std': 0.02, 'atr': 10.0, 'liquidity': 1e-6, 'ma': 30000.0}


def display_format(name):
    # Small-magnitude inputs need more decimals than the widget default
    if name == 'liquidity':
        return '%.8f'
    if name.startswith('return_lag_'):
        return '%.4f'
    return None


st.title('Crypto Volatility Predictor — Demo')
model_path = Path('models/rf_vol_model.pkl')
if not model_path.exists():
    st.warning('Model artifact not found. Train and save a model first.')
else:
    model = joblib.load(model_path)
    # Prefer the column names stored on the fitted model so the inputs always
    # match what the model was trained on
    feature_names = [str(c) for c in getattr(model, 'feature_names_in_', DEFAULT_FEATURES)]
    st.sidebar.header('Input features')
    values = {}
    for name in feature_names:
        values[name] = st.sidebar.number_input(
            name,
            value=float(DEFAULT_VALUES.get(name, 0.0)),
            format=display_format(name),
        )
    # create df with columns in training order
    X = pd.DataFrame([values], columns=feature_names)
    pred = model.predict(X)[0]
    st.metric('Predicted next-day volatility (proxy)', f'{pred:.6f}')
'''
# Nothing is written to disk here; the following cell performs the write.
print('Streamlit app snippet prepared; run the next cell to write it to src/app.py')

In [None]:
# Persist the Streamlit snippet as src/app.py (UTF-8), creating src/ if needed
app_path = Path('src') / 'app.py'
app_path.parent.mkdir(parents=True, exist_ok=True)
app_path.write_text(streamlit_app_code, encoding='utf-8')
print('Wrote src/app.py')

## 7) Conclusion & Next Steps
- This notebook provides a reproducible baseline pipeline. 
- Next: advanced time-series CV, XGBoost tuning, LSTM/Transformer experiments, probabilistic forecasts, and production deployment.

---

**Good luck — customize the notebook with your dataset and tune models for improved performance!**