In [None]:
# 1. Setup and Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Deep Learning
try:
    import torch
    import torch.nn as nn
    import torch.optim as optim
    from torch.utils.data import DataLoader, TensorDataset
    print(f"‚úÖ PyTorch {torch.__version__} loaded")
    print(f"   CUDA available: {torch.cuda.is_available()}")
except ImportError:
    print("‚ö†Ô∏è Installing PyTorch...")
    import subprocess
    import sys
    # Run pip through the interpreter backing this kernel so the package lands
    # in the environment the notebook is actually executing in.
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'torch'], check=True)
    # Re-import everything the rest of the notebook uses; importing only
    # `torch` and `nn` here would leave `optim`, `DataLoader` and
    # `TensorDataset` undefined after a fresh install.
    import torch
    import torch.nn as nn
    import torch.optim as optim
    from torch.utils.data import DataLoader, TensorDataset

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

%matplotlib inline
plt.style.use('seaborn-v0_8-darkgrid')
print("\n‚úÖ All libraries loaded!")

In [None]:
# 2. Configuration
# NOTE(review): this is a substring test on the full working-directory path, so
# any cwd whose path merely contains "notebooks" resolves BASE_DIR to the cwd's
# parent -- confirm the notebook is always launched from the repo root or from
# its `notebooks/` folder.
BASE_DIR = Path.cwd().parent if 'notebooks' in str(Path.cwd()) else Path.cwd()
DATA_DIR = BASE_DIR / "data" / "exported_data" / "per_asset"
OUTPUT_DIR = BASE_DIR / "models" / "lstm"
RESULTS_DIR = BASE_DIR / "results" / "lstm"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Reproducibility: seed NumPy and PyTorch once, before any model is built or
# any DataLoader shuffles, so Restart & Run All gives repeatable results.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Assets to train
ASSETS = ["AAPL", "NVDA", "TSLA", "MSFT", "GOOGL", "AMZN", "META",
          "SPY", "QQQ", "EFA", "IEF", "HYG", "BIL", "INTC", "AMD"]

# LSTM Configuration
LSTM_CONFIG = {
    'sequence_length': 20,      # 20 days of history
    'hidden_size': 64,          # LSTM hidden units
    'num_layers': 2,            # LSTM layers
    'dropout': 0.2,             # Dropout rate
    'bidirectional': False,     # Bidirectional LSTM
    'attention': True,          # Use attention mechanism
    'batch_size': 32,
    'learning_rate': 0.001,
    'epochs': 50,
    'early_stopping_patience': 10
}

# Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"üìÇ Output directory: {OUTPUT_DIR}")
print(f"üñ•Ô∏è Device: {device}")
print(f"\n‚öôÔ∏è LSTM Configuration:")
for k, v in LSTM_CONFIG.items():
    print(f"   {k}: {v}")

## 3. Data Loading and Preprocessing

In [None]:
def load_asset_data(asset, data_dir=None):
    """Load the pre-split train/val/test features and targets for one asset.

    Reads six CSV files (``X_train``, ``X_val``, ``X_test``, ``y_train``,
    ``y_val``, ``y_test``) from ``<data_dir>/<asset>/``. The first column of
    every file is used as the index and parsed as dates.

    Args:
        asset: Name of the asset sub-directory (e.g. ``"AAPL"``).
        data_dir: Directory holding the per-asset folders. Defaults to the
            notebook-level ``DATA_DIR`` when omitted.

    Returns:
        Tuple ``(X_train, X_val, X_test, y_train, y_val, y_test)``. The ``X_*``
        items are DataFrames; each ``y_*`` is a Series when its file holds a
        single value column.

    Raises:
        FileNotFoundError: if any of the six CSV files is missing.
    """
    asset_dir = Path(DATA_DIR if data_dir is None else data_dir) / asset

    def _read(stem):
        # One place for the shared read_csv options.
        return pd.read_csv(asset_dir / f"{stem}.csv", index_col=0, parse_dates=True)

    splits = ("train", "val", "test")
    X_train, X_val, X_test = (_read(f"X_{split}") for split in splits)
    # Squeeze along the column axis only: a bare .squeeze() would collapse a
    # one-row target file to a scalar, which breaks `.values` downstream.
    y_train, y_val, y_test = (_read(f"y_{split}").squeeze("columns") for split in splits)

    return X_train, X_val, X_test, y_train, y_val, y_test

def create_sequences(X, y, seq_length):
    """Slice a feature matrix into overlapping windows for LSTM input.

    Window ``i`` holds rows ``i .. i + seq_length - 1`` of ``X`` and is paired
    with ``y[i + seq_length]``, so ``len(X) - seq_length`` windows are produced.

    NOTE(review): the paired target is the row *after* the window's last row;
    confirm this matches how ``y`` is aligned to ``X`` in the exported files.

    Args:
        X: Array-like of shape ``(n_rows, ...)``; converted with ``np.asarray``.
        y: Array-like with at least ``n_rows`` entries, indexed positionally.
        seq_length: Number of consecutive rows per window.

    Returns:
        ``(X_seq, y_seq)`` as NumPy arrays. ``X_seq`` has shape
        ``(n_windows, seq_length, ...)``. When ``X`` has too few rows for a
        single window, both arrays are empty but keep their trailing
        dimensions, so downstream shape checks still work.

    Raises:
        IndexError: if ``y`` is shorter than ``X``.
    """
    # Positional arrays: a pandas Series with a non-integer index would
    # otherwise be label-indexed by `y[i + seq_length]`.
    X_arr = np.asarray(X)
    y_arr = np.asarray(y)

    n_windows = len(X_arr) - seq_length
    if n_windows <= 0:
        empty_X = np.empty((0, seq_length) + X_arr.shape[1:], dtype=X_arr.dtype)
        empty_y = np.empty((0,) + y_arr.shape[1:], dtype=y_arr.dtype)
        return empty_X, empty_y

    X_seq = np.stack([X_arr[start:start + seq_length] for start in range(n_windows)])
    # Fancy indexing (not slicing) so a too-short `y` still raises IndexError.
    y_seq = y_arr[np.arange(seq_length, seq_length + n_windows)]

    return X_seq, y_seq

# Smoke-test the loader on one asset and report the shape of each feature split
asset = 'AAPL'
X_train, X_val, X_test, y_train, y_val, y_test = load_asset_data(asset)
print(f"‚úÖ {asset} data loaded:")
for split_label, split_frame in (("Train", X_train), ("Val", X_val), ("Test", X_test)):
    print(f"   {split_label}: {split_frame.shape}")

## 4. LSTM Model with Attention

In [None]:
class Attention(nn.Module):
    """Learned softmax-weighted pooling over the sequence axis.

    A two-layer scorer (Linear -> Tanh -> Linear) assigns one scalar per
    timestep; the scores are normalised with a softmax over dim 1 and used to
    average the inputs along that same dimension.
    """

    def __init__(self, hidden_size):
        super().__init__()
        scorer_layers = [
            nn.Linear(hidden_size, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, 1),
        ]
        self.attention = nn.Sequential(*scorer_layers)

    def forward(self, lstm_output):
        """Pool ``lstm_output`` of shape (batch, seq_len, hidden_size).

        Returns:
            ``(context, attention_weights)`` with shapes (batch, hidden_size)
            and (batch, seq_len, 1); the weights sum to 1 over ``seq_len``.
        """
        scores = self.attention(lstm_output)            # (batch, seq_len, 1)
        attention_weights = scores.softmax(dim=1)       # normalise over time

        # Broadcast the per-timestep weight across the hidden dimension,
        # then collapse the time axis.
        weighted = attention_weights * lstm_output
        context = weighted.sum(dim=1)                   # (batch, hidden_size)

        return context, attention_weights


class LSTMPredictor(nn.Module):
    """LSTM regressor that maps a feature sequence to one scalar per sample.

    Architecture: (stacked, optionally bidirectional) LSTM -> sequence summary
    -> two-layer MLP head. The summary is either attention pooling over all
    timesteps (``use_attention=True``) or the final hidden state of the top
    LSTM layer.

    Args:
        input_size: Number of features per timestep.
        hidden_size: LSTM hidden units per direction; also the width of the
            MLP head's hidden layer.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability between LSTM layers (only applied when
            ``num_layers > 1``) and inside the MLP head.
        bidirectional: Run the LSTM in both directions.
        use_attention: Pool timesteps with ``Attention`` instead of taking the
            final hidden state.

    Attributes:
        attention_weights: Detached attention weights from the most recent
            forward pass, shape (batch, seq_len, 1); ``None`` until a forward
            pass with attention has run.
    """

    def __init__(self, input_size, hidden_size=64, num_layers=2,
                 dropout=0.2, bidirectional=False, use_attention=True):
        super().__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.use_attention = use_attention

        # LSTM layers
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            # nn.LSTM only applies dropout between layers, so it is
            # meaningless for a single layer.
            dropout=dropout if num_layers > 1 else 0,
            bidirectional=bidirectional
        )

        # Output size after LSTM
        lstm_output_size = hidden_size * (2 if bidirectional else 1)

        # Attention (optional)
        if use_attention:
            self.attention = Attention(lstm_output_size)

        # Fully connected layers
        self.fc = nn.Sequential(
            nn.Linear(lstm_output_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, 1)
        )

        self.attention_weights = None  # Store for visualization

    def forward(self, x):
        """Predict one value per sequence.

        Args:
            x: Tensor of shape (batch, seq_len, input_size).

        Returns:
            Tensor of shape (batch,).
        """
        lstm_out, (h_n, _) = self.lstm(x)  # lstm_out: (batch, seq_len, hidden*directions)

        if self.use_attention:
            context, attn = self.attention(lstm_out)
            # Detach the stored copy: it is only read for plotting and must
            # not keep the autograd graph alive between training steps.
            self.attention_weights = attn.detach()
        elif self.bidirectional:
            # The backward direction finishes at t=0, so lstm_out[:, -1, :]
            # would hold a backward state that has seen only one timestep.
            # h_n[-2] / h_n[-1] are the top layer's final forward / backward
            # hidden states.
            context = torch.cat([h_n[-2], h_n[-1]], dim=1)
        else:
            # Use last hidden state
            context = lstm_out[:, -1, :]

        # Prediction
        output = self.fc(context)
        return output.squeeze(-1)

# Test model: build one instance from LSTM_CONFIG as a smoke test.
# Uses every column of the X_train loaded above as input features.
n_features = X_train.shape[1]
model = LSTMPredictor(
    input_size=n_features,
    hidden_size=LSTM_CONFIG['hidden_size'],
    num_layers=LSTM_CONFIG['num_layers'],
    dropout=LSTM_CONFIG['dropout'],
    # Forward the configured direction setting; previously this key was
    # ignored and the model was always unidirectional.
    bidirectional=LSTM_CONFIG['bidirectional'],
    use_attention=LSTM_CONFIG['attention']
).to(device)

print(f"\n‚úÖ LSTM Model created:")
print(f"   Input features: {n_features}")
print(f"   Total parameters: {sum(p.numel() for p in model.parameters()):,}")
print(model)

## 5. Training Functions

In [None]:
def train_lstm_model(asset, config=LSTM_CONFIG, verbose=True):
    """Train and evaluate an LSTM model for a single asset.

    Pipeline: load the asset's pre-split data, keep the highest-variance
    feature columns, standardise them with statistics fitted on the training
    split only, window every split into fixed-length sequences, train with
    Adam + MSE loss (gradient-norm clipping, LR reduction on plateau), stop
    early on validation loss, restore the best weights and score the test
    split.

    Args:
        asset: Asset name passed to ``load_asset_data``.
        config: Hyper-parameter dict. Required keys: ``sequence_length``,
            ``hidden_size``, ``num_layers``, ``dropout``, ``attention``,
            ``batch_size``, ``learning_rate``, ``epochs``,
            ``early_stopping_patience``. Optional keys: ``bidirectional``
            (default ``False``) and ``n_features`` (number of top-variance
            columns to keep, default ``50``).
        verbose: Print the losses every 10 epochs and the early-stopping epoch.

    Returns:
        dict with keys ``model`` (best weights loaded), ``scaler``,
        ``features`` (selected column names), ``train_losses``,
        ``val_losses``, ``test_pred``, ``test_true`` and ``metrics``
        (``rmse``, ``mae``, ``r2``, ``dir_acc``, ``corr``).

    Raises:
        ValueError: if any split is too short to yield at least one sequence.
    """

    # Load data
    X_train, X_val, X_test, y_train, y_val, y_test = load_asset_data(asset)

    # Select subset of features for LSTM (top-N by variance on the training
    # split; N defaults to 50).
    # NOTE(review): variance is computed on unscaled columns, so the ranking
    # depends on each feature's units -- confirm this is intended.
    n_top = config.get('n_features', 50)
    feature_var = X_train.var().sort_values(ascending=False)
    top_features = feature_var.head(n_top).index.tolist()

    X_train_sub = X_train[top_features].values
    X_val_sub = X_val[top_features].values
    X_test_sub = X_test[top_features].values

    # Scale features (fit on train only, so val/test statistics do not leak)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_sub)
    X_val_scaled = scaler.transform(X_val_sub)
    X_test_scaled = scaler.transform(X_test_sub)

    # Create sequences
    seq_len = config['sequence_length']
    X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train.values, seq_len)
    X_val_seq, y_val_seq = create_sequences(X_val_scaled, y_val.values, seq_len)
    X_test_seq, y_test_seq = create_sequences(X_test_scaled, y_test.values, seq_len)

    # Fail early with a clear message instead of a ZeroDivisionError or a
    # shape error deeper in the training loop.
    for split_name, split_seq in (('train', X_train_seq), ('val', X_val_seq), ('test', X_test_seq)):
        if len(split_seq) == 0:
            raise ValueError(
                f"{asset}: {split_name} split has too few rows for sequence_length={seq_len}"
            )

    # Convert to tensors
    X_train_t = torch.FloatTensor(X_train_seq).to(device)
    y_train_t = torch.FloatTensor(y_train_seq).to(device)
    X_val_t = torch.FloatTensor(X_val_seq).to(device)
    y_val_t = torch.FloatTensor(y_val_seq).to(device)
    X_test_t = torch.FloatTensor(X_test_seq).to(device)

    # DataLoaders
    train_dataset = TensorDataset(X_train_t, y_train_t)
    train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)

    # Model
    model = LSTMPredictor(
        input_size=len(top_features),
        hidden_size=config['hidden_size'],
        num_layers=config['num_layers'],
        dropout=config['dropout'],
        bidirectional=config.get('bidirectional', False),
        use_attention=config['attention']
    ).to(device)

    # Loss and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'])
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5)

    def _snapshot_weights():
        # state_dict() returns references to the live parameter tensors, and
        # dict.copy() is shallow; clone each tensor so later optimizer steps
        # cannot overwrite the saved best weights.
        return {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}

    # Training loop
    best_val_loss = float('inf')
    # Start from the initial weights so a best state always exists, even if
    # the validation loss never improves (e.g. NaN) or `epochs` is 0.
    best_model_state = _snapshot_weights()
    patience_counter = 0
    train_losses = []
    val_losses = []

    for epoch in range(config['epochs']):
        # Training
        model.train()
        epoch_loss = 0
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            epoch_loss += loss.item()

        train_loss = epoch_loss / len(train_loader)
        train_losses.append(train_loss)

        # Validation (whole split in a single forward pass)
        model.eval()
        with torch.no_grad():
            val_pred = model(X_val_t)
            val_loss = criterion(val_pred, y_val_t).item()
        val_losses.append(val_loss)

        scheduler.step(val_loss)

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            best_model_state = _snapshot_weights()
        else:
            patience_counter += 1

        if verbose and (epoch + 1) % 10 == 0:
            print(f"  Epoch {epoch+1}/{config['epochs']}: Train Loss={train_loss:.6f}, Val Loss={val_loss:.6f}")

        if patience_counter >= config['early_stopping_patience']:
            if verbose:
                print(f"  Early stopping at epoch {epoch+1}")
            break

    # Load best model
    model.load_state_dict(best_model_state)

    # Evaluate on test set
    model.eval()
    with torch.no_grad():
        test_pred = model(X_test_t).cpu().numpy()

    test_true = y_test_seq

    # Metrics
    rmse = np.sqrt(mean_squared_error(test_true, test_pred))
    mae = mean_absolute_error(test_true, test_pred)
    r2 = r2_score(test_true, test_pred)
    # Share of samples where prediction and target have the same sign
    dir_acc = np.mean(np.sign(test_true) == np.sign(test_pred))
    corr = np.corrcoef(test_true, test_pred)[0, 1]

    return {
        'model': model,
        'scaler': scaler,
        'features': top_features,
        'train_losses': train_losses,
        'val_losses': val_losses,
        'test_pred': test_pred,
        'test_true': test_true,
        'metrics': {
            'rmse': rmse,
            'mae': mae,
            'r2': r2,
            'dir_acc': dir_acc,
            'corr': corr
        }
    }

print("‚úÖ Training function defined")

## 6. Train Models for All Assets

In [None]:
# Fit one LSTM per asset; a failure on one asset is reported and skipped so
# the remaining assets still train.
header_rule = "=" * 70
print(header_rule)
print("LSTM TRAINING - All Assets")
print(header_rule)

lstm_results = dict()

for asset in ASSETS:
    print(f"\nüîÑ Training {asset}...")
    try:
        result = train_lstm_model(asset, verbose=False)
        lstm_results[asset] = result

        metrics = result['metrics']
        print(f"  ‚úÖ {asset}: Dir Acc={metrics['dir_acc']:.2%}, Corr={metrics['corr']:.4f}, RMSE={metrics['rmse']:.6f}")
    except Exception as exc:
        # Best-effort by design: only the error message is shown.
        print(f"  ‚ùå {asset}: {exc}")

print(f"\n‚úÖ Trained {len(lstm_results)} LSTM models")

## 7. Compare LSTM vs XGBoost

In [None]:
# Load XGBoost results for comparison
xgb_results_path = BASE_DIR / "results" / "xgboost_walkforward" / "walkforward_summary.csv"

if xgb_results_path.exists():
    xgb_summary = pd.read_csv(xgb_results_path)

    # Create comparison DataFrame: one row per asset present in BOTH result sets
    comparison = []

    for asset, result in lstm_results.items():
        lstm_metrics = result['metrics']

        # Find XGBoost metrics
        xgb_row = xgb_summary[xgb_summary['Asset'] == asset]
        if len(xgb_row) > 0:
            xgb_metrics = xgb_row.iloc[0]

            comparison.append({
                'Asset': asset,
                'LSTM_DirAcc': lstm_metrics['dir_acc'],
                'XGB_DirAcc': xgb_metrics['Test_DirAcc'],
                'LSTM_Corr': lstm_metrics['corr'],
                'XGB_Corr': xgb_metrics['Test_Corr'],
                'LSTM_RMSE': lstm_metrics['rmse'],
                'XGB_RMSE': xgb_metrics['Test_RMSE']
            })

    # Explicit columns so the frame has its schema even with zero rows.
    comparison_df = pd.DataFrame(
        comparison,
        columns=['Asset', 'LSTM_DirAcc', 'XGB_DirAcc', 'LSTM_Corr', 'XGB_Corr',
                 'LSTM_RMSE', 'XGB_RMSE']
    )

    if comparison_df.empty:
        # No overlap (e.g. every LSTM run failed): skip the report instead of
        # printing NaN averages and saving an empty chart.
        print("‚ö†Ô∏è No assets in common between LSTM and XGBoost results; comparison skipped")
    else:
        print("\n" + "="*80)
        print("MODEL COMPARISON: LSTM vs XGBoost")
        print("="*80)
        print("\nüìã Performance by Asset:")
        print(comparison_df.to_string(index=False))

        print("\nüìä Average Performance:")
        print(f"   LSTM Dir Accuracy:   {comparison_df['LSTM_DirAcc'].mean():.2%}")
        print(f"   XGB Dir Accuracy:    {comparison_df['XGB_DirAcc'].mean():.2%}")
        print(f"   LSTM Correlation:    {comparison_df['LSTM_Corr'].mean():.4f}")
        print(f"   XGB Correlation:     {comparison_df['XGB_Corr'].mean():.4f}")

        # Visualization: side-by-side bars per asset, plus a win-count panel
        fig, axes = plt.subplots(1, 3, figsize=(15, 5))

        x = np.arange(len(comparison_df))
        width = 0.35

        # Directional Accuracy
        axes[0].bar(x - width/2, comparison_df['LSTM_DirAcc'], width, label='LSTM', color='blue', alpha=0.7)
        axes[0].bar(x + width/2, comparison_df['XGB_DirAcc'], width, label='XGBoost', color='green', alpha=0.7)
        axes[0].axhline(y=0.5, color='red', linestyle='--', label='Random')
        axes[0].set_xticks(x)
        axes[0].set_xticklabels(comparison_df['Asset'], rotation=45)
        axes[0].set_ylabel('Directional Accuracy')
        axes[0].set_title('Directional Accuracy Comparison')
        axes[0].legend()

        # Correlation
        axes[1].bar(x - width/2, comparison_df['LSTM_Corr'], width, label='LSTM', color='blue', alpha=0.7)
        axes[1].bar(x + width/2, comparison_df['XGB_Corr'], width, label='XGBoost', color='green', alpha=0.7)
        axes[1].axhline(y=0, color='black', linestyle='-', linewidth=0.5)
        axes[1].set_xticks(x)
        axes[1].set_xticklabels(comparison_df['Asset'], rotation=45)
        axes[1].set_ylabel('Correlation')
        axes[1].set_title('Prediction Correlation Comparison')
        axes[1].legend()

        # Winner count (strict comparisons, so ties count for neither model)
        lstm_wins_dir = (comparison_df['LSTM_DirAcc'] > comparison_df['XGB_DirAcc']).sum()
        xgb_wins_dir = (comparison_df['XGB_DirAcc'] > comparison_df['LSTM_DirAcc']).sum()
        lstm_wins_corr = (comparison_df['LSTM_Corr'] > comparison_df['XGB_Corr']).sum()
        xgb_wins_corr = (comparison_df['XGB_Corr'] > comparison_df['LSTM_Corr']).sum()

        # Tabular form of the win counts; not displayed or saved by this cell.
        wins = pd.DataFrame({
            'Metric': ['Dir Accuracy', 'Correlation'],
            'LSTM Wins': [lstm_wins_dir, lstm_wins_corr],
            'XGB Wins': [xgb_wins_dir, xgb_wins_corr]
        })

        axes[2].bar(['Dir Acc\nLSTM', 'Dir Acc\nXGB', 'Corr\nLSTM', 'Corr\nXGB'],
                    [lstm_wins_dir, xgb_wins_dir, lstm_wins_corr, xgb_wins_corr],
                    color=['blue', 'green', 'blue', 'green'], alpha=0.7)
        axes[2].set_ylabel('Number of Assets Won')
        axes[2].set_title('Model Wins by Metric')

        plt.tight_layout()
        plt.savefig(RESULTS_DIR / 'lstm_vs_xgboost_comparison.png', dpi=150, bbox_inches='tight')
        plt.show()

        comparison_df.to_csv(RESULTS_DIR / 'lstm_vs_xgboost.csv', index=False)
        print(f"\n‚úÖ Comparison saved: {RESULTS_DIR / 'lstm_vs_xgboost_comparison.png'}")
else:
    print("‚ö†Ô∏è XGBoost results not found for comparison")

## 8. Attention Visualization

In [None]:
# Plot the attention weights the trained model assigns to each timestep of the
# most recent test windows, then the average over those windows.
print("üìä Attention Weight Visualization")
print("="*60)

# Asset whose trained model is inspected (skipped silently if it did not train)
viz_asset = 'AAPL'
if viz_asset in lstm_results:
    result = lstm_results[viz_asset]
    model = result['model']
    seq_len = LSTM_CONFIG['sequence_length']

    # Rebuild the test windows with the feature list and scaler saved at training time
    X_train, X_val, X_test, y_train, y_val, y_test = load_asset_data(viz_asset)
    X_test_sub = X_test[result['features']].values
    X_test_scaled = result['scaler'].transform(X_test_sub)
    X_test_seq, y_test_seq = create_sequences(X_test_scaled, y_test.values, seq_len)
    X_test_t = torch.FloatTensor(X_test_seq).to(device)

    # A forward pass on the last 10 windows populates model.attention_weights
    model.eval()
    with torch.no_grad():
        _ = model(X_test_t[-10:])

    if model.attention_weights is None:
        print("‚ö†Ô∏è Attention weights not available")
    else:
        attn_weights = model.attention_weights.cpu().numpy()  # (batch, seq_len, 1)
        # x-axis: offsets from -(seq_len - 1) up to 0 (last step of the window)
        days = range(-seq_len + 1, 1)

        # One panel per window; zip stops at whichever runs out first, so
        # surplus axes stay blank when fewer than 10 windows exist.
        fig, axes = plt.subplots(2, 5, figsize=(20, 8))
        for i, (ax, sample_weights) in enumerate(zip(axes.flatten(), attn_weights)):
            weights = sample_weights.squeeze()

            ax.bar(days, weights, color='blue', alpha=0.7)
            ax.set_xlabel('Days Ago')
            ax.set_ylabel('Attention Weight')
            ax.set_title(f'Sample {i+1}')
            ax.set_ylim(0, max(weights)*1.2)

        plt.suptitle(f'{viz_asset} - LSTM Attention Weights\n(Which past days the model focuses on)', fontsize=14)
        plt.tight_layout()
        plt.savefig(RESULTS_DIR / f'{viz_asset}_attention_weights.png', dpi=150, bbox_inches='tight')
        plt.show()

        # Mean weight per timestep across the plotted windows
        avg_attention = attn_weights.mean(axis=0).squeeze()

        plt.figure(figsize=(12, 4))
        plt.bar(days, avg_attention, color='green', alpha=0.7)
        plt.xlabel('Days Ago')
        plt.ylabel('Average Attention Weight')
        # NOTE(review): the subtitle below is hard-coded, not derived from avg_attention.
        plt.title(f'{viz_asset} - Average Attention Pattern\n(Model focuses more on recent days)')
        plt.grid(axis='y', alpha=0.3)
        plt.savefig(RESULTS_DIR / f'{viz_asset}_avg_attention.png', dpi=150, bbox_inches='tight')
        plt.show()

        print(f"‚úÖ Attention visualizations saved")

## 9. Save Models and Results

In [None]:
# Save LSTM models and results
print("üíæ Saving LSTM Models and Results...")
print("="*60)

summary_data = []

for asset, result in lstm_results.items():
    # Save model weights together with what is needed to rebuild the network:
    # the selected feature names and the hyper-parameter dict.
    model_path = OUTPUT_DIR / f'{asset}_lstm_model.pt'
    torch.save({
        'model_state_dict': result['model'].state_dict(),
        'features': result['features'],
        'config': LSTM_CONFIG
    }, model_path)

    # Save scaler (fitted on this asset's training split)
    scaler_path = OUTPUT_DIR / f'{asset}_scaler.joblib'
    joblib.dump(result['scaler'], scaler_path)

    # Summary
    metrics = result['metrics']
    summary_data.append({
        'Asset': asset,
        'Dir_Accuracy': metrics['dir_acc'],
        'Correlation': metrics['corr'],
        'RMSE': metrics['rmse'],
        'MAE': metrics['mae'],
        'R2': metrics['r2']
    })

    print(f"  ‚úÖ {asset} saved")

# Save summary. Columns are given explicitly so the CSV keeps its header and
# the lookups below cannot raise KeyError when no model was trained.
summary_df = pd.DataFrame(
    summary_data,
    columns=['Asset', 'Dir_Accuracy', 'Correlation', 'RMSE', 'MAE', 'R2']
)
summary_df.to_csv(RESULTS_DIR / 'lstm_summary.csv', index=False)

print(f"\n‚úÖ All models saved to: {OUTPUT_DIR}")
print(f"‚úÖ Summary saved to: {RESULTS_DIR / 'lstm_summary.csv'}")

print("\n" + "="*60)
print("LSTM TRAINING COMPLETE!")
print("="*60)
if summary_df.empty:
    # Every asset failed or the training cell was skipped; say so explicitly.
    print("\n‚ö†Ô∏è No LSTM models were trained; no performance summary available")
else:
    print(f"\nüìä Overall Performance:")
    print(f"   Avg Directional Accuracy: {summary_df['Dir_Accuracy'].mean():.2%}")
    print(f"   Avg Correlation: {summary_df['Correlation'].mean():.4f}")
    print(f"   Avg RMSE: {summary_df['RMSE'].mean():.6f}")