In [14]:
# Install the notebook's dependencies into the *kernel's* environment.
# The explicit %pip magic does not rely on IPython automagic being enabled;
# joblib is listed because the next cell imports it directly.
%pip install requests beautifulsoup4 pandas scikit-learn numpy joblib

Note: you may need to restart the kernel to use updated packages.


In [15]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import os

# =========================
# CONFIG
# =========================
# Raw input file; the path is relative to the notebook's working directory.
INPUT_PATH = "../../data/Market_Trend_External.csv"

# CSV artifacts written by the later steps of this cell.
OUTPUT_JOINT      = "../../data/joint_data_collection.csv"
OUTPUT_TRAIN      = "../../data/training_data.csv"
OUTPUT_TEST       = "../../data/test_data.csv"
OUTPUT_ACTIVATION = "../../data/activation_data.csv"

# =========================
# STEP 1 — Load Raw Dataset
# =========================
df = pd.read_csv(INPUT_PATH)
print("Raw dataset shape:", df.shape)

# =========================
# STEP 2 — Data Cleaning
# =========================
# Columns that must hold numeric values.
numeric_cols = [
    "Open_Price", "High_Price", "Low_Price", "Close_Price",
    "Volume", "Daily_Return_Pct", "Volatility_Range",
    "VIX_Close", "Sentiment_Score",
    "GeoPolitical_Risk_Score", "Currency_Index"
]

# Indicator columns that are stored as integers after cleaning.
flag_cols = ["Economic_News_Flag", "Federal_Rate_Change_Flag"]

# Values that cannot be parsed as numbers become NaN, so the dropna() below
# removes the rows that contain them.
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Drop incomplete rows BEFORE the integer cast: astype(int) raises on NaN, so
# casting first would crash on a missing flag instead of discarding that row.
df = df.dropna().astype({col: int for col in flag_cols})
print("After cleaning:", df.shape)

# Fail here with a clear message rather than deeper inside the later steps,
# which cannot work on an empty frame.
if df.empty:
    raise ValueError(f"No rows left after cleaning {INPUT_PATH}")

# =========================
# STEP 3 — Algorithmic Outlier Removal (IQR)
# =========================
# Keep only rows whose Close_Price lies within 1.5 * IQR below the first
# quartile and 1.5 * IQR above the third quartile (both bounds inclusive).
Q1, Q3 = df["Close_Price"].quantile([0.25, 0.75])
IQR = Q3 - Q1

lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

# .copy() makes the filtered result an independent frame, so the column added
# to `df` in STEP 4 does not trigger pandas' SettingWithCopyWarning when rows
# were actually removed here.
df = df[df["Close_Price"].between(lower, upper)].copy()
print("After outlier removal:", df.shape)

# =========================
# STEP 4 — Algorithmic Normalization
# =========================
# Min-max scale Close_Price into a new column, leaving the raw price intact.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# in STEP 6, so test rows influence the fitted min/max — confirm this is
# acceptable for the downstream model.
close_price_frame = df[["Close_Price"]]
price_scaler = MinMaxScaler()
price_scaler.fit(close_price_frame)
df["Close_Price_Normalized"] = price_scaler.transform(close_price_frame)

# Persist the fitted scaler next to the data files so the same scaling can be
# re-applied (or inverted) outside this notebook.
joblib.dump(price_scaler, "../../data/close_price_scaler.pkl")

# =========================
# STEP 5 — Save Joint Dataset
# =========================
# The full cleaned frame, including the normalized price column.
df.to_csv(OUTPUT_JOINT, index=False)
print("Saved:", OUTPUT_JOINT)

# =========================
# STEP 6 — Train/Test Split
# =========================
# Random 80/20 row split; the fixed seed keeps it reproducible across runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

split_outputs = ((train_df, OUTPUT_TRAIN), (test_df, OUTPUT_TEST))

# Write both files first, then report them, so nothing is announced as saved
# unless both writes succeeded.
for split_frame, split_path in split_outputs:
    split_frame.to_csv(split_path, index=False)

for _, split_path in split_outputs:
    print("Saved:", split_path)

# =========================
# STEP 7 — Activation Dataset (1 row from test)
# =========================
# One seeded, randomly chosen row of the test split.
activation_df = test_df.sample(n=1, random_state=42)
activation_df.to_csv(OUTPUT_ACTIVATION, index=False)

print("Saved:", OUTPUT_ACTIVATION)

print("\n✅ Subgoal 2 Completed Successfully")


Raw dataset shape: (30000, 14)
After cleaning: (30000, 14)
After outlier removal: (30000, 14)
Saved: ../../data/joint_data_collection.csv
Saved: ../../data/training_data.csv
Saved: ../../data/test_data.csv
Saved: ../../data/activation_data.csv

✅ Subgoal 2 Completed Successfully


In [16]:
import pandas as pd

# Reload the raw CSV and print its column names.
# NOTE(review): this rebinds `df`, replacing the cleaned frame built by the
# previous cell with the raw data — confirm that later cells expect this.
df = pd.read_csv("../../data/Market_Trend_External.csv")
column_names = list(df.columns)
print(column_names)


['Date', 'Open_Price', 'Close_Price', 'High_Price', 'Low_Price', 'Volume', 'Daily_Return_Pct', 'Volatility_Range', 'VIX_Close', 'Economic_News_Flag', 'Sentiment_Score', 'Federal_Rate_Change_Flag', 'GeoPolitical_Risk_Score', 'Currency_Index']
