In [None]:
# notebooks/03_data_merging.ipynb

import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import MinMaxScaler
import sys
import os

# Add src to path
sys.path.append(os.path.abspath('..'))
from src.utils import add_technical_indicators

# Build the training table: join price history with daily sentiment, add
# technical indicators, min-max scale the numeric features, then persist both
# the fitted scaler and the scaled data.

# Load data
df_prices = pd.read_csv('../data/raw/hsi_price_history.csv')
df_sentiment = pd.read_csv('../data/processed/daily_sentiment.csv')

# Convert dates. Price dates are made timezone-naive (a no-op when they
# already are) so they can be compared with the sentiment dates in the merge.
# NOTE(review): assumes the sentiment dates are timezone-naive — confirm.
df_prices['Date'] = pd.to_datetime(df_prices['Date']).dt.tz_localize(None)
df_sentiment['Date'] = pd.to_datetime(df_sentiment['Date'])

# Merge (left join keeps every price row; rows without a matching sentiment
# row get a sentiment_score of 0)
df_merged = pd.merge(df_prices, df_sentiment, on='Date', how='left')
df_merged['sentiment_score'] = df_merged['sentiment_score'].fillna(0)

# Add Technical Indicators
df_processed = add_technical_indicators(df_merged)

# Feature columns: numeric columns only, excluding the identifier/text columns.
# MinMaxScaler cannot handle non-numeric data, so selecting by dtype keeps any
# stray string column from crashing the scaling step. select_dtypes preserves
# column order, so the scaler's feature order follows the frame's.
non_feature_cols = ['Date', 'Headline']
feature_cols = [
    col
    for col in df_processed.select_dtypes(include='number').columns
    if col not in non_feature_cols
]
if not feature_cols:
    raise ValueError("No numeric feature columns found to scale")

# Drop NaNs created by rolling windows (e.g., MA_50 introduces 49 NaNs).
# The check is restricted to the feature columns: after the left join,
# non-feature columns coming from the sentiment file (e.g. a 'Headline' text
# column, if present) are NaN on days with no sentiment row, and a blanket
# dropna() would throw away those trading days.
df_processed.dropna(subset=feature_cols, inplace=True)
if df_processed.empty:
    raise ValueError("No rows left after dropping NaNs; cannot fit the scaler")

# Scaling
# We scale all numerical features to 0-1
scaler = MinMaxScaler()
df_processed[feature_cols] = scaler.fit_transform(df_processed[feature_cols])

# Save Scaler for Inference later
os.makedirs('../data/processed/scalers', exist_ok=True)
joblib.dump(scaler, '../data/processed/scalers/price_scaler.pkl')

# Save Final Training Data
df_processed.to_csv('../data/processed/training_data.csv', index=False)
print(f"Success! Saved {len(df_processed)} rows to data/processed/training_data.csv")
print("You can now run 'python train.py'")
