# Libraries

In [None]:
import glob

import numpy as np
import pandas as pd

# Preprocessing

In [None]:
# Read every "*_eod.parquet" file under ../data/raw and stack the frames
# into a single table with a fresh 0..n-1 index.
dfs = []
for eod_path in glob.glob("../data/raw/*_eod.parquet"):
    dfs.append(pd.read_parquet(eod_path))
df = pd.concat(dfs, ignore_index=True)

# Keep only the columns used below, in a fixed order.
df = df.loc[:, ['date', 'symbol', 'open', 'high', 'low', 'close', 'volume', 'adj_close']]
df.head()

In [None]:
# (rows, columns) of the combined frame after column selection.
df.shape

In [None]:
# Parse `date` as timezone-aware (UTC) datetimes, then order the rows by
# symbol first and date second, renumbering the index from 0.
df["date"] = pd.to_datetime(df["date"], utc=True)
df = (
    df
    .sort_values(by=["symbol", "date"])
    .reset_index(drop=True)
)
df.head()

In [None]:
# Binary target: 1 if the next close (within the same symbol) is above the
# current close, else 0.
next_close = df.groupby("symbol")["close"].shift(-1)
df["target"] = (next_close > df["close"]).astype(int)

# A comparison against NaN evaluates to False, so the last row of every symbol
# (which has no next close) would otherwise be labelled 0 and survive dropna().
# Filter on the shifted series itself to remove rows without future data, then
# drop rows with missing values in any remaining column.
# NOTE: this cell rebinds `df`; re-running it trims one more row per symbol,
# so re-run from the loading cell instead of re-executing this cell alone.
df = df[next_close.notna()].dropna()

# Check class balance (share of each label).
df["target"].value_counts(normalize=True)

# Feature Engineering

In [None]:
# Isolate the AAPL rows and work on an independent copy so that the feature
# columns added below never touch `df`.
is_aapl = df["symbol"].eq("AAPL")
aapl = df.loc[is_aapl]
aapl_df = aapl.copy()
aapl_df.head()

## Returns and ranges

In [None]:
# Intraday range (high minus low) and open-to-close move, in price units.
aapl_df['high_low'] = aapl_df['high'].sub(aapl_df['low'])
aapl_df['close_open'] = aapl_df['close'].sub(aapl_df['open'])

# Row-over-row fractional change of the close; the first row is NaN.
close_prices = aapl_df['close']
aapl_df['return'] = close_prices.pct_change()
aapl_df.head()

## Lagged features

In [None]:
# Horizons (in rows) at which past returns are exposed as features.
lag_days = [1, 2, 5, 10]

# return_lag_<n> holds the return observed n rows earlier; the leading rows
# of each lagged column are NaN.
for n_back in lag_days:
    lagged_return = aapl_df['return'].shift(periods=n_back)
    aapl_df[f'return_lag_{n_back}'] = lagged_return

aapl_df.head()

## Rolling features

In [None]:
# Window lengths (in rows) for the rolling return statistics.
rolling_windows = [5, 10]

# For each window length, add the rolling mean and rolling standard deviation
# of `return`; rows without a full window are NaN.
for span in rolling_windows:
    windowed_return = aapl_df['return'].rolling(span)
    aapl_df[f'return_roll_mean_{span}'] = windowed_return.mean()
    aapl_df[f'return_roll_std_{span}'] = windowed_return.std()

aapl_df.head()

## Time features

In [None]:
# Calendar features derived from the `date` column via the datetime accessor.
date_parts = aapl_df['date'].dt

aapl_df['day_of_week'] = date_parts.weekday
aapl_df['month'] = date_parts.month
aapl_df['day_of_month'] = date_parts.day
aapl_df['quarter'] = date_parts.quarter
# Boolean flag stored as 0/1.
aapl_df['is_quarter_end'] = date_parts.is_quarter_end.astype(int)
aapl_df.head()

## Technical features

In [None]:
# Simple moving average (SMA) of the close. min_periods equals the window, so
# rows without a full window stay NaN rather than averaging a partial window.
aapl_df['sma_10'] = aapl_df['close'].rolling(window=10, min_periods=10).mean()
aapl_df['sma_20'] = aapl_df['close'].rolling(window=20, min_periods=20).mean()

# Relative Strength Index (RSI) from simple rolling means of gains and losses.
window = 14
delta = aapl_df['close'].diff()

# np.where returns plain arrays with no index. The first row's NaN delta fails
# both comparisons and therefore counts as 0 gain and 0 loss.
gain = np.where(delta > 0, delta, 0)
loss = np.where(delta < 0, -delta, 0)

# Re-attach aapl_df's own index before rolling. A bare pd.Series(gain) would
# get a fresh 0..n-1 index, and assigning the result back to aapl_df aligns on
# index labels, which would put RSI values on the wrong rows (or NaN) whenever
# aapl_df's index is not exactly 0..n-1.
avg_gain = pd.Series(gain, index=aapl_df.index).rolling(window=window, min_periods=window).mean()
avg_loss = pd.Series(loss, index=aapl_df.index).rolling(window=window, min_periods=window).mean()

# A zero average loss is replaced by NaN, so rsi_14 is NaN for windows with no
# down moves.
# NOTE(review): the usual RSI convention reports 100 in that case - confirm
# whether NaN (and the later dropna) is intended.
rs = avg_gain / (avg_loss.replace(0, np.nan))
aapl_df['rsi_14'] = 100 - (100 / (1 + rs))

# --- EMA ---
# Recursive exponential moving averages (adjust=False) of the close.
aapl_df['ema_12'] = aapl_df['close'].ewm(span=12, adjust=False).mean()
aapl_df['ema_26'] = aapl_df['close'].ewm(span=26, adjust=False).mean()

# --- MACD and Signal Line ---
# MACD is the fast EMA minus the slow EMA; the signal line is a 9-span EMA of MACD.
aapl_df['macd'] = aapl_df['ema_12'] - aapl_df['ema_26']
aapl_df['macd_signal'] = aapl_df['macd'].ewm(span=9, adjust=False).mean()
aapl_df.head()

In [None]:
# Step 1: discard the first W_DROP rows, where rolling/lag features are still
# warming up (presumably sized to the longest look-back, the 20-row SMA).
W_DROP = 20
after_warmup = aapl_df.iloc[W_DROP:].copy()
print(aapl_df.shape)
print(after_warmup.shape)

# Step 2: remove any rows that still contain a NaN and renumber from 0, then
# report the resulting shape and the per-column NaN counts.
aapl_df_handled_nan = after_warmup.dropna().reset_index(drop=True)
print(aapl_df_handled_nan.shape)
remaining_nans = aapl_df_handled_nan.isna().sum()
print(remaining_nans)

# Split dataset

In [None]:
# Time-based split: the 0.9 quantile of `date` is the boundary, and rows on or
# after it are held out as the test set for final model evaluation.
cutoff_date = df["date"].quantile(0.9)
cutoff_label = cutoff_date.strftime('%Y-%m-%d')
print("Train/Test cutoff date:", cutoff_label)

before_cutoff = df["date"].lt(cutoff_date)
from_cutoff = df["date"].ge(cutoff_date)
train_df = df.loc[before_cutoff].copy()
test_df = df.loc[from_cutoff].copy()

In [None]:
# Sanity check on the splits: per-symbol date boundaries, printed in the order
# train min, train max, test max, test min.
boundary_checks = (
    (train_df, "min"),
    (train_df, "max"),
    (test_df, "max"),
    (test_df, "min"),
)
for split_frame, stat_name in boundary_checks:
    print(split_frame.groupby("symbol")["date"].agg(stat_name))