### Feature Engineering for ARIMA & Gradient Boosting
Since ARIMA and Gradient Boosting (e.g., XGBoost) require different types of features, I'll create two separate feature engineering pipelines:

1. For ARIMA (Time-Series Specific Features)

2. For Gradient Boosting (Lagged Features, Rolling Statistics, and Volatility Metrics)

ARIMA Requires:

Stationary Data (ARIMA assumes the series is stationary)

Lagged Values (Previous stock prices)

Differencing (To remove trends)

Seasonal Features (e.g., month, day of week)




In [2]:
import pandas as pd
import numpy as np

# **Feature Engineering for ARIMA**
def prepare_arima_features(df: pd.DataFrame, stock: str) -> pd.DataFrame:
    """
    Feature engineering for ARIMA models.

    Builds a date-indexed frame of differenced closing prices plus calendar
    features for a single stock. The caller's ``df`` is not modified.

    Args:
        df: Frame with a ``Date`` column and a ``{stock}_Close`` column.
        stock: Ticker prefix used to locate the close column (e.g. ``"AAPL"``).

    Returns:
        DataFrame indexed by ``Date`` (ascending) with the columns
        ``{stock}_Close_Diff_1``, ``{stock}_Close_Diff_2``, ``Month`` and
        ``Day_of_Week``. Rows where either difference is undefined are dropped.

    Raises:
        ValueError: If ``{stock}_Close`` is not a column of ``df``.
        KeyError: If ``df`` has no ``Date`` column.
    """
    col_name = f"{stock}_Close"

    if col_name not in df.columns:
        raise ValueError(f"Column {col_name} not found in the dataset!")

    diff_1 = f"{stock}_Close_Diff_1"
    diff_2 = f"{stock}_Close_Diff_2"

    # Copy only the two columns that are needed so the caller's frame keeps its
    # "Date" column and original index (the function can then be called again).
    work = df[["Date", col_name]].copy()
    work["Date"] = pd.to_datetime(work["Date"])
    # Differencing is order-dependent, so put the rows in chronological order.
    work = work.set_index("Date").sort_index(kind="mergesort")

    # **Apply differencing to remove trends**
    work[diff_1] = work[col_name].diff()  # First-order differencing
    # Second-order differencing is the difference of the first difference;
    # .diff(2) would instead give the lag-2 change x[t] - x[t-2].
    work[diff_2] = work[diff_1].diff()

    # **Create Seasonal Features**
    work["Month"] = work.index.month
    work["Day_of_Week"] = work.index.dayofweek

    # **Drop NA values (from differencing)** -- restricted to the returned
    # columns so gaps in unrelated columns of ``df`` cannot remove rows.
    return work[[diff_1, diff_2, "Month", "Day_of_Week"]].dropna()

if __name__ == "__main__":
    import os  # local import: os is not among this cell's imports

    df = pd.read_csv("../data/processed/final_merged_stock_data.csv")
    selected_stock = "AAPL"  # Modify this to select a stock
    df_arima = prepare_arima_features(df, selected_stock)

    # Create the output folder on first run so to_csv cannot fail on a
    # missing directory.
    output_dir = "../data/feature_engineering data"
    os.makedirs(output_dir, exist_ok=True)
    output_path = f"{output_dir}/{selected_stock}_arima_features.csv"

    df_arima.to_csv(output_path)
    # Report the path that was actually written (the message previously named
    # '../data/processed/', which is not where the file is saved).
    print(f"✅ ARIMA Features saved: '{output_path}'")


✅ ARIMA Features saved: '../data/processed/AAPL_arima_features.csv'


### Gradient Boosting Requires:

Lagged Features (e.g., past 1, 5, 10-day prices)

Rolling Statistics (e.g., rolling mean, rolling std)

Volatility Metrics (e.g., ATR, Percentage Change)

Market Sentiment Features (if available)

In [5]:
import pandas as pd
import os

# Load merged stock dataset
data_path = "../data/processed/final_merged_stock_data.csv"  # Ensure this file exists!
df = pd.read_csv(data_path)

# Extract all stock symbols dynamically.
# A ticker is taken from the text before the first "_" of every column that
# contains "_Close". dict.fromkeys removes repeats while keeping first-seen
# order, so a ticker with several matching columns is processed only once.
stock_symbols = list(
    dict.fromkeys(col.split("_")[0] for col in df.columns if "_Close" in col)
)
print(f"✅ Detected stock symbols: {stock_symbols}")

# Ensure the output directory exists
feature_dir = "../data/feature_engineering data"
os.makedirs(feature_dir, exist_ok=True)

# 🛠️ Feature Engineering Functions

# Lagged Features
def add_lag_features(df, stock, lags=(1, 3, 5, 10, 20)):
    """Add lagged copies of the ``{stock}_Close`` column to ``df``.

    Args:
        df: Frame containing a ``{stock}_Close`` column. It is modified in
            place (one new column per lag) and also returned.
        stock: Ticker prefix of the close column.
        lags: Iterable of row offsets; each produces a column named
            ``{stock}_Close_Lag_{lag}`` holding the close shifted by that
            many rows.

    Returns:
        The same ``df`` object with the lag columns added.
    """
    # The default is an immutable tuple so it cannot be altered between calls.
    close = df[f"{stock}_Close"]
    for lag in lags:
        df[f"{stock}_Close_Lag_{lag}"] = close.shift(lag)
    return df

# Rolling Statistics
def add_rolling_stats(df, stock, windows=(7, 14, 30)):
    """Add rolling mean, rolling std and EWMA columns for ``{stock}_Close``.

    Args:
        df: Frame containing a ``{stock}_Close`` column. It is modified in
            place and also returned.
        stock: Ticker prefix of the close column.
        windows: Iterable of window sizes. Each size ``w`` adds
            ``{stock}_Close_RollMean_{w}``, ``{stock}_Close_RollStd_{w}`` and
            ``{stock}_Close_EWMA_{w}`` (exponentially weighted mean with
            ``span=w``, ``adjust=False``).

    Returns:
        The same ``df`` object with the new columns added.
    """
    # The default is an immutable tuple so it cannot be altered between calls.
    close = df[f"{stock}_Close"]
    for window in windows:
        # Build the rolling window once and reuse it for both statistics.
        rolling = close.rolling(window)
        df[f"{stock}_Close_RollMean_{window}"] = rolling.mean()
        df[f"{stock}_Close_RollStd_{window}"] = rolling.std()
        df[f"{stock}_Close_EWMA_{window}"] = close.ewm(span=window, adjust=False).mean()
    return df

# Volatility & Momentum Features
def add_volatility_features(df, stock):
    """Add percentage change, range, ATR and Bollinger band columns.

    Reads ``{stock}_Close``, ``{stock}_High`` and ``{stock}_Low`` and adds:

    - ``{stock}_Close_PctChange``: row-over-row fractional change of the close.
    - ``{stock}_High_Low``: high minus low.
    - ``{stock}_ATR_14``: 14-row simple rolling mean of the true range, where
      the true range is the largest of high - low, |high - previous close| and
      |low - previous close| (simple mean, not Wilder's smoothing).
    - ``{stock}_Bollinger_High`` / ``{stock}_Bollinger_Low``: 20-row rolling
      mean of the close plus / minus two rolling standard deviations.

    Args:
        df: Frame holding the three price columns. It is modified in place
            and also returned.
        stock: Ticker prefix of the price columns.

    Returns:
        The same ``df`` object with the new columns added.
    """
    close = df[f"{stock}_Close"]
    high = df[f"{stock}_High"]
    low = df[f"{stock}_Low"]

    df[f"{stock}_Close_PctChange"] = close.pct_change()

    high_low = high - low
    df[f"{stock}_High_Low"] = high_low

    # Averaging only high - low ignores gaps between sessions, so it understates
    # volatility; the true range also measures the move from the previous close.
    prev_close = close.shift(1)
    true_range = pd.concat(
        [high_low, (high - prev_close).abs(), (low - prev_close).abs()],
        axis=1,
    ).max(axis=1, skipna=False)
    # Where there is no previous close (e.g. the first row) fall back to the
    # plain high - low range; rows with a missing high or low stay NaN.
    true_range = true_range.fillna(high_low)
    df[f"{stock}_ATR_14"] = true_range.rolling(14).mean()

    # Compute the 20-row statistics once and derive both bands from them.
    band_window = close.rolling(20)
    band_mid = band_window.mean()
    band_width = band_window.std() * 2
    df[f"{stock}_Bollinger_High"] = band_mid + band_width
    df[f"{stock}_Bollinger_Low"] = band_mid - band_width
    return df

# Momentum Features
def add_momentum_features(df, stock):
    """Add RSI(14), MACD and MACD signal columns derived from ``{stock}_Close``.

    Args:
        df: Frame containing a ``{stock}_Close`` column. It is modified in
            place and also returned.
        stock: Ticker prefix of the close column.

    Returns:
        The same ``df`` object with ``{stock}_RSI_14``, ``{stock}_MACD`` and
        ``{stock}_MACD_Signal`` added.
    """
    prices = df[f"{stock}_Close"]

    # RSI: ratio of the 14-row simple rolling means of upward and downward
    # moves. Entries that are not a gain (including the undefined first
    # change) count as 0 in the gain series, and likewise for losses.
    change = prices.diff()
    avg_gain = change.where(change > 0, 0).rolling(14).mean()
    avg_loss = (-change.where(change < 0, 0)).rolling(14).mean()
    relative_strength = avg_gain / avg_loss
    df[f"{stock}_RSI_14"] = 100 - (100 / (1 + relative_strength))

    # MACD: 12-span EMA minus 26-span EMA; the signal line is a 9-span EMA of
    # the MACD column itself.
    macd_key = f"{stock}_MACD"
    fast_ema = prices.ewm(span=12, adjust=False).mean()
    slow_ema = prices.ewm(span=26, adjust=False).mean()
    df[macd_key] = fast_ema - slow_ema
    df[f"{stock}_MACD_Signal"] = df[macd_key].ewm(span=9, adjust=False).mean()
    return df

# Volume-Based Features
def add_volume_features(df, stock):
    """Add cumulative VWAP and 10-row volume rate-of-change columns.

    Args:
        df: Frame containing ``{stock}_Close`` and ``{stock}_Volume``. It is
            modified in place and also returned.
        stock: Ticker prefix of the price and volume columns.

    Returns:
        The same ``df`` object with ``{stock}_VWAP`` and ``{stock}_VROC_10``
        added.
    """
    close = df[f"{stock}_Close"]
    volume = df[f"{stock}_Volume"]

    # VWAP here is cumulative from the first row: running sum of
    # close * volume divided by the running sum of volume.
    traded_value = close * volume
    df[f"{stock}_VWAP"] = traded_value.cumsum() / volume.cumsum()

    # Percentage change of volume relative to the value 10 rows earlier.
    earlier_volume = volume.shift(10)
    df[f"{stock}_VROC_10"] = ((volume - earlier_volume) / earlier_volume) * 100
    return df

# **Final Feature Engineering Pipeline**
def prepare_boosting_features(df, stock):
    """Build the gradient-boosting feature table for one stock.

    Copies the ``Date`` column and the columns whose names start with
    ``{stock}_``, sorts by date, applies add_lag_features, add_rolling_stats,
    add_volatility_features, add_momentum_features and add_volume_features,
    and drops rows that contain a missing value. The caller's ``df`` is not
    modified.

    Args:
        df: Merged frame with a ``Date`` column and ``{stock}_...`` columns
            (the helpers read Close, High, Low and Volume).
        stock: Ticker prefix selecting this stock's columns.

    Returns:
        DataFrame with ``Date`` first, followed by this stock's original and
        engineered columns.
    """
    # Select by exact "<stock>_" prefix. A substring test would let ticker "A"
    # pick up "AAPL_..." columns and ticker "D" match "Date" itself.
    prefix = f"{stock}_"
    own_cols = [col for col in df.columns if col != "Date" and col.startswith(prefix)]

    # Restricting the working copy to this stock keeps dropna() below from
    # discarding rows because of gaps in other stocks' columns, and avoids
    # copying the whole merged frame for every ticker.
    df = df[["Date"] + own_cols].copy()
    df["Date"] = pd.to_datetime(df["Date"])
    df.sort_values(by="Date", inplace=True)

    df = add_lag_features(df, stock)
    df = add_rolling_stats(df, stock)
    df = add_volatility_features(df, stock)
    df = add_momentum_features(df, stock)
    df = add_volume_features(df, stock)

    df.dropna(inplace=True)  # Drop warm-up rows and rows with missing inputs

    feature_cols = [col for col in df.columns if col != "Date"]
    return df[["Date"] + feature_cols]

# **Generate Features for Each Stock**
for stock in stock_symbols:
    # One CSV per ticker inside the feature directory.
    feature_file = os.path.join(feature_dir, f"{stock}_boosting_features.csv")
    df_features = prepare_boosting_features(df, stock)
    # Date is a regular column of the result, so the row index is not written.
    df_features.to_csv(feature_file, index=False)
    print(f"✅ {stock}_boosting_features.csv created!")

# Printed once, after every ticker has been written.
print("\n🚀 All feature-engineered datasets have been successfully generated!")


✅ Boosting Features saved: '../data/feature_engineering data/AAPL_boosting_features.csv'
