In [31]:
import os
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

# =============================
# 1️⃣ Load & preprocess dataset
# =============================
folder_path = "processed_datasets"  # CSV folder
SEQ_LEN = 8  # number of past days per input
TARGET_COL = "Close"
FEATURE_COLS = ["Open", "High", "Low", "Close", "Volume", "EMA_7", "EMA_21", "sentiment_score"]

dfs = []
for file in os.listdir(folder_path):
    if file.endswith(".csv"):
        df = pd.read_csv(os.path.join(folder_path, file))
        df["Company"] = file.split(".")[0]
        dfs.append(df)

data = pd.concat(dfs, ignore_index=True)
data = data.sort_values(["Company", "Date"]).reset_index(drop=True)
data = data.dropna(subset=FEATURE_COLS + [TARGET_COL])

# =============================
# 2️⃣ Create sequences per company
# =============================
X_train_list, y_train_list = [], []
X_test_list, y_test_list, companies_test_list = [], [], []

for company in data["Company"].unique():
    df_c = data[data["Company"] == company]
    vals = df_c[FEATURE_COLS].values
    target = df_c[TARGET_COL].values

    X_c, y_c = [], []
    for i in range(len(vals) - SEQ_LEN):
        X_c.append(vals[i:i+SEQ_LEN])
        y_c.append(target[i+SEQ_LEN])
    
    X_c = np.array(X_c, dtype=np.float32)
    y_c = np.array(y_c, dtype=np.float32).reshape(-1,1)
    
    # Split per company
    split_idx = int(0.8 * len(X_c))
    X_train_list.append(X_c[:split_idx])
    y_train_list.append(y_c[:split_idx])
    
    X_test_list.append(X_c[split_idx:])
    y_test_list.append(y_c[split_idx:])
    companies_test_list.extend([company] * (len(X_c) - split_idx))

# Combine all companies
X_train = np.concatenate(X_train_list, axis=0)
y_train = np.concatenate(y_train_list, axis=0)
X_test = np.concatenate(X_test_list, axis=0)
y_test = np.concatenate(y_test_list, axis=0)
companies_test = np.array(companies_test_list)

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

# =============================
# 3️⃣ Normalize features
# =============================
scaler_X = StandardScaler()
scaler_y = StandardScaler()

# Flatten for scaler
X_flat = X_train.reshape(-1, X_train.shape[2])
X_train_scaled = scaler_X.fit_transform(X_flat).reshape(X_train.shape)
X_test_scaled = scaler_X.transform(X_test.reshape(-1, X_test.shape[2])).reshape(X_test.shape)
y_train_scaled = scaler_y.fit_transform(y_train)
y_test_scaled = scaler_y.transform(y_test)

# =============================
# 4️⃣ Create DataLoaders
# =============================
train_ds = TensorDataset(torch.tensor(X_train_scaled), torch.tensor(y_train_scaled))
test_ds = TensorDataset(torch.tensor(X_test_scaled), torch.tensor(y_test_scaled))

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=32, shuffle=False)

# =============================
# 5️⃣ Transformer model
# =============================
class StockTransformer(nn.Module):
    def __init__(self, feature_dim, seq_len, d_model=64, nhead=4, num_layers=2, dim_feedforward=128):
        super().__init__()
        self.input_proj = nn.Linear(feature_dim, d_model)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.regressor = nn.Linear(d_model, 1)

    def forward(self, x):
        x = self.input_proj(x)
        x = self.transformer(x)
        x = x[:, -1, :]  # Take last time step
        return self.regressor(x)

input_dim = X_train.shape[2]
model = StockTransformer(feature_dim=input_dim, seq_len=SEQ_LEN)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# =============================
# 6️⃣ Training setup
# =============================
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
EPOCHS = 20

# =============================
# 7️⃣ Training loop
# =============================
for epoch in range(EPOCHS):
    model.train()
    epoch_loss = 0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        out = model(xb)
        loss = criterion(out, yb)
        if torch.isnan(loss):
            raise ValueError("NaN loss detected! Check your data.")
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item() * xb.size(0)
    epoch_loss /= len(train_loader.dataset)
    print(f"Epoch {epoch+1}/{EPOCHS} | Loss: {epoch_loss:.6f}")

# =============================
# 8️⃣ Evaluation per company
# =============================
model.eval()
preds, actuals = [], []

with torch.no_grad():
    for i, (xb, yb) in enumerate(test_loader):
        xb, yb = xb.to(device), yb.to(device)
        out = model(xb)
        preds.extend(out.squeeze().cpu().tolist())
        actuals.extend(yb.squeeze().cpu().tolist())

# Inverse transform
preds = scaler_y.inverse_transform(np.array(preds).reshape(-1,1)).squeeze()
actuals = scaler_y.inverse_transform(np.array(actuals).reshape(-1,1)).squeeze()

# Create DataFrame
test_df = pd.DataFrame({
    "Company": companies_test,
    "Actual": actuals,
    "Predicted": preds,
    "Absolute_Error": np.abs(actuals - preds)
})

# =============================
# 9️⃣ Save per-company predictions & metrics
# =============================
os.makedirs("predictions", exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
company_stats = []

for company, grp in test_df.groupby("Company"):
    c_rmse = np.sqrt(mean_squared_error(grp["Actual"], grp["Predicted"]))
    c_mae = mean_absolute_error(grp["Actual"], grp["Predicted"])
    c_r2 = r2_score(grp["Actual"], grp["Predicted"])
    
    csv_path = f"predictions/{company}_predictions_{timestamp}.csv"
    grp.to_csv(csv_path, index=False)
    
    company_stats.append({
        "Company": company,
        "RMSE": c_rmse,
        "MAE": c_mae,
        "R2": c_r2,
        "Records": len(grp)
    })
    print(f"📁 Saved predictions for {company} → {csv_path}")

# Save summary metrics
summary_df = pd.DataFrame(company_stats)
summary_path = f"predictions/company_metrics_summary_{timestamp}.csv"
summary_df.to_csv(summary_path, index=False)
print(f"\n📊 Company metrics summary saved to: {summary_path}")

# =============================
# 🔟 Save model
# =============================
os.makedirs("models", exist_ok=True)
model_path = f"models/transformer_stock_model_{timestamp}.pth"

torch.save({
    'model_state_dict': model.state_dict(),
    'scaler_X': scaler_X,
    'scaler_y': scaler_y
}, model_path)

print(f"✅ Model saved to {model_path}")

Train shape: (7960, 8, 8), Test shape: (1990, 8, 8)
Epoch 1/20 | Loss: 0.120755
Epoch 2/20 | Loss: 0.025985
Epoch 3/20 | Loss: 0.017158
Epoch 4/20 | Loss: 0.013801
Epoch 5/20 | Loss: 0.011686
Epoch 6/20 | Loss: 0.010528
Epoch 7/20 | Loss: 0.009550
Epoch 8/20 | Loss: 0.009175
Epoch 9/20 | Loss: 0.008468
Epoch 10/20 | Loss: 0.008095
Epoch 11/20 | Loss: 0.007810
Epoch 12/20 | Loss: 0.007515
Epoch 13/20 | Loss: 0.007120
Epoch 14/20 | Loss: 0.006812
Epoch 15/20 | Loss: 0.006907
Epoch 16/20 | Loss: 0.006593
Epoch 17/20 | Loss: 0.006381
Epoch 18/20 | Loss: 0.006196
Epoch 19/20 | Loss: 0.006114
Epoch 20/20 | Loss: 0.006035
📁 Saved predictions for AAPL_stock_gdelt_final_merged → predictions/AAPL_stock_gdelt_final_merged_predictions_20251022_203613.csv
📁 Saved predictions for AMZN_stock_gdelt_final_merged → predictions/AMZN_stock_gdelt_final_merged_predictions_20251022_203613.csv
📁 Saved predictions for GOOG_stock_gdelt_final_merged → predictions/GOOG_stock_gdelt_final_merged_predictions_2025102

Miscellanous

In [None]:
import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.utils import shuffle

# =============================
# Config
# =============================
RAW_FOLDER = "processed_datasets"       # Folder with your processed CSVs
OUTPUT_FOLDER = "processed_splits"      # Where to save datasets
SEQ_LEN = 5                             # Number of past days in a sequence
BATCH_SIZE = 64
TARGET_COL = "Close"                     # Or "Daily_Return" if you prefer

FEATURE_COLS = ["Open","High","Low","Close","Volume","EMA_7","EMA_21","sentiment_score"]

os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# =============================
# 1️⃣ Load all CSVs
# =============================
dfs = []
for file in os.listdir(RAW_FOLDER):
    if file.endswith(".csv"):
        df = pd.read_csv(os.path.join(RAW_FOLDER, file))
        df["Company"] = file.split(".")[0]
        df = df.sort_values("Date").reset_index(drop=True)
        dfs.append(df)

data = pd.concat(dfs, ignore_index=True)
print("✅ Combined dataset shape:", data.shape)

# =============================
# 2️⃣ Create sequences per company
# =============================
X_seq_all, y_seq_all = [], []

for company in data['Company'].unique():
    df_c = data[data['Company']==company].sort_values("Date")
    df_c = df_c.dropna(subset=FEATURE_COLS + [TARGET_COL])
    
    X_c = df_c[FEATURE_COLS].values.astype(np.float32)
    y_c = df_c[TARGET_COL].values.astype(np.float32).reshape(-1,1)
    
    for i in range(len(X_c) - SEQ_LEN):
        X_seq_all.append(X_c[i:i+SEQ_LEN])
        y_seq_all.append(y_c[i+SEQ_LEN])

X_seq_all = np.array(X_seq_all)
y_seq_all = np.array(y_seq_all)

print(f"✅ Sequences created: X_seq_all={X_seq_all.shape}, y_seq_all={y_seq_all.shape}")

# =============================
# 3️⃣ Shuffle sequences
# =============================
X_seq_all, y_seq_all = shuffle(X_seq_all, y_seq_all, random_state=42)
print("✅ Sequences shuffled")

# =============================
# 4️⃣ Train-test split
# =============================
split_idx = int(0.8 * len(X_seq_all))
X_train, y_train = X_seq_all[:split_idx], y_seq_all[:split_idx]
X_test, y_test = X_seq_all[split_idx:], y_seq_all[split_idx:]

# =============================
# 5️⃣ Convert to PyTorch tensors & DataLoaders
# =============================
train_ds = TensorDataset(torch.tensor(X_train, dtype=torch.float32),
                         torch.tensor(y_train, dtype=torch.float32))
test_ds = TensorDataset(torch.tensor(X_test, dtype=torch.float32),
                        torch.tensor(y_test, dtype=torch.float32))

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

print("✅ DataLoaders created successfully")

# =============================
# 6️⃣ Save datasets
# =============================
torch.save(train_ds, os.path.join(OUTPUT_FOLDER, "train_dataset.pt"))
torch.save(test_ds, os.path.join(OUTPUT_FOLDER, "test_dataset.pt"))

# Optional CSV for inspection: flatten sequences
train_flat = X_train.reshape(X_train.shape[0], -1)
test_flat = X_test.reshape(X_test.shape[0], -1)

train_df = pd.DataFrame(train_flat, columns=[f"{c}_t{i}" for i in range(SEQ_LEN) for c in FEATURE_COLS])
train_df[TARGET_COL] = y_train
test_df = pd.DataFrame(test_flat, columns=[f"{c}_t{i}" for i in range(SEQ_LEN) for c in FEATURE_COLS])
test_df[TARGET_COL] = y_test

train_df.to_csv(os.path.join(OUTPUT_FOLDER, "train_data.csv"), index=False)
test_df.to_csv(os.path.join(OUTPUT_FOLDER, "test_data.csv"), index=False)

print(f"📁 Saved datasets in '{OUTPUT_FOLDER}' folder:")
print(" ├── train_dataset.pt")
print(" ├── test_dataset.pt")
print(" ├── train_data.csv")
print(" └── test_data.csv")
import torch
import torch.nn as nn

# =============================
# 4️⃣ Model definition
# =============================
class StockSentimentModel(nn.Module):
    def __init__(self, input_dim, hidden_dim=512):
        super().__init__()

        self.network = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Linear(hidden_dim // 2, 1)
        )

    def forward(self, x):
        return self.network(x)

input_dim = X_train.shape[1]
model = StockSentimentModel(input_dim=input_dim)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
import os
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import torch

# =============================
# 1️⃣ Run predictions
# =============================
model.eval()
preds, actuals = [], []

with torch.no_grad():
    for xb, yb in test_loader:
        xb_flat = xb.view(xb.size(0), -1)  # flatten before passing to model
        out = model(xb_flat)
        preds.extend(out.squeeze().tolist())
        actuals.extend(yb.squeeze().tolist())

preds = np.array(preds)
actuals = np.array(actuals)

# =============================
# 2️⃣ Overall metrics
# =============================
rmse = np.sqrt(mean_squared_error(actuals, preds))
mae = mean_absolute_error(actuals, preds)
r2 = r2_score(actuals, preds)

print(f"\n✅ Overall RMSE: {rmse:.4f}")
print(f"✅ Overall MAE: {mae:.4f}")
print(f"✅ Overall R² Score: {r2:.4f}")

# =============================
# 3️⃣ Prepare test DataFrame for per-company metrics
# =============================
# Create a record for each test sequence
test_records = []

# Optional: if you tracked 'Company' and 'Date' when creating sequences, you can fill them here
for i in range(len(X_test)):
    last_day_features = X_test[i, -1, :]  # last day in the sequence
    # If you encoded Company as numeric, you can reverse map or just keep numeric
    company_val = last_day_features[0] if last_day_features.shape[0] > 0 else "Unknown"
    test_records.append({
        "Company": company_val,
        "Date": "Unknown",  # Replace with actual date if available
        "Actual": actuals[i],
        "Predicted": preds[i],
        "Absolute_Error": abs(actuals[i] - preds[i])
    })

test_df = pd.DataFrame(test_records)

# =============================
# 4️⃣ Per-company metrics & CSV saving
# =============================
pred_dir = "predictions"
os.makedirs(pred_dir, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

company_stats = []
for company, grp in test_df.groupby("Company"):
    c_rmse = np.sqrt(mean_squared_error(grp["Actual"], grp["Predicted"]))
    c_mae = mean_absolute_error(grp["Actual"], grp["Predicted"])
    c_r2 = r2_score(grp["Actual"], grp["Predicted"])

    # Save CSV for this company
    csv_path = os.path.join(pred_dir, f"{company}_predictions_{timestamp}.csv")
    grp.to_csv(csv_path, index=False)

    company_stats.append({
        "Company": company,
        "RMSE": c_rmse,
        "MAE": c_mae,
        "R2": c_r2,
        "Records": len(grp)
    })
    print(f"📁 Saved predictions for {company} → {csv_path}")

# Save summary CSV
summary_df = pd.DataFrame(company_stats)
summary_path = os.path.join(pred_dir, f"company_metrics_summary_{timestamp}.csv")
summary_df.to_csv(summary_path, index=False)
print(f"\n📊 Per-company metrics saved to: {summary_path}")

# =============================
# 5️⃣ Save the trained model
# =============================
model_dir = "models"
os.makedirs(model_dir, exist_ok=True)

model_path = os.path.join(model_dir, f"stock_sentiment_model_{timestamp}.pth")
torch.save({
    'model_state_dict': model.state_dict(),
    'rmse': rmse,
    'mae': mae,
    'r2': r2,
    'input_dim': X_test.shape[2]  # number of features per day
}, model_path)

print(f"✅ Model saved successfully to: {model_path}")

KeyboardInterrupt: 