In [1]:
# notebooks/03_data_merging.ipynb

import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import MinMaxScaler
import sys
import os

# Add src to path
sys.path.append(os.path.abspath('..'))

# --- FIX: IMPORT THE CORRECT FUNCTION NAME ---
from src.utils import engineer_features 
# ---------------------------------------------

# 1. SETUP & LOAD
print("Loading data...")
df_prices = pd.read_csv('../data/raw/hsi_price_history.csv')
df_sentiment = pd.read_csv('../data/processed/daily_sentiment.csv')

# ... (rest of your cleanup code for Ticker/MultiIndex remains the same) ...

# Convert dates
df_prices['Date'] = pd.to_datetime(df_prices['Date']).dt.tz_localize(None)
df_sentiment['Date'] = pd.to_datetime(df_sentiment['Date'])

# 2. MERGE
df_merged = pd.merge(df_prices, df_sentiment, on='Date', how='left')
df_merged['sentiment_score'] = df_merged['sentiment_score'].fillna(0)

# 3. ADD INDICATORS
print("Adding technical indicators...")
numeric_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
for col in numeric_cols:
    if col in df_merged.columns:
        df_merged[col] = pd.to_numeric(df_merged[col], errors='coerce')

# --- FIX: CALL THE CORRECT FUNCTION ---
df_processed = engineer_features(df_merged)
# --------------------------------------

# Drop NaNs
df_processed.dropna(inplace=True)

# 4. SCALING
# Calculate the percentage change (Return)
df_processed['Return'] = df_processed['Close'].pct_change()

# The Target is NEXT day's return
df_processed['Target'] = (df_processed['Return'].shift(-1) > 0).astype(int)

# Drop NaNs created by shift/pct_change
df_processed.dropna(inplace=True)

# Save RAW data
df_processed.to_csv('../data/processed/training_data.csv', index=False)
print(f"Success! Saved {len(df_processed)} rows with Returns target.")


Using Apple Silicon MPS
Loading data...
Adding technical indicators...
Success! Saved 2657 rows with Returns target.
