# LSTM Training on Kaggle

This notebook trains LSTM models for stock price prediction using Kaggle's GPU resources.

## Setup
- **GPU**: T4/P100 recommended
- **Time**: 8-12 hours for all stocks
- **Batch Size**: 128-512, selected automatically from the detected GPU (T4: 256, P100: 384, V100: 512, otherwise 128)
- **Output**: Trained models saved to Google Drive

In [None]:
# Cell 1: Setup Environment
!git clone https://github.com/yourusername/trading-system.git
%cd trading-system
!pip install -r requirements.txt

# Check GPU availability
import tensorflow as tf
print("GPU Available: ", tf.config.list_physical_devices('GPU'))

# Configure GPU memory growth
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"Configured {len(gpus)} GPU(s) for memory growth")
    except RuntimeError as e:
        print(f"GPU configuration error: {e}")

# Enable mixed precision for faster training
policy = tf.keras.mixed_precision.Policy('mixed_float16')
tf.keras.mixed_precision.set_global_policy(policy)
print("Mixed precision enabled")

In [None]:
# Cell 2: Mount Google Drive and Setup Paths
import os
import sys
import pandas as pd
import numpy as np
from datetime import datetime

# Google Drive can only be mounted from a Colab runtime. Importing or mounting
# unconditionally aborts the whole cell elsewhere (e.g. on Kaggle), so fall
# back to local storage when Drive is unavailable.
try:
    from google.colab import drive
    drive.mount('/content/drive')
    base_path = '/content/drive/MyDrive/TradingSystem'
except Exception as e:  # boundary: any import/mount failure means "no Drive"
    # TRADING_SYSTEM_DIR lets the user point at another location
    # (for example a directory populated from an attached dataset).
    base_path = os.environ.get('TRADING_SYSTEM_DIR', '/kaggle/working/TradingSystem')
    print(f"Google Drive not available ({e}); using {base_path}")

# Add source code to path.
# The setup cell changes into the cloned repository, so <cwd>/src covers
# platforms where the clone does not live under /kaggle/working.
for src_dir in ('/kaggle/working/trading-system/src', os.path.join(os.getcwd(), 'src')):
    if src_dir not in sys.path:
        sys.path.append(src_dir)

# Define paths
data_path = os.path.join(base_path, 'data', 'processed')
models_path = os.path.join(base_path, 'models', 'lstm')
logs_path = os.path.join(base_path, 'logs')

# Create output directories if they don't exist (data_path is an input and is
# deliberately not created here).
os.makedirs(models_path, exist_ok=True)
os.makedirs(logs_path, exist_ok=True)

print(f"Data path: {data_path}")
print(f"Models path: {models_path}")
print(f"Logs path: {logs_path}")

In [None]:
# Cell 3: Load Processed Data
# Discovers '<symbol>_processed.csv' files under data_path and loads every one
# with more than 100 rows into training_data, keyed by symbol.
PROCESSED_SUFFIX = '_processed.csv'

# Defined up front so the loading loop below is a no-op (instead of a
# NameError) when data_path does not exist.
processed_files = []

# Check available processed files
if os.path.isdir(data_path):
    # Sorted so the load/training order is reproducible between runs.
    processed_files = sorted(
        name for name in os.listdir(data_path) if name.endswith(PROCESSED_SUFFIX)
    )
    print(f"Found {len(processed_files)} processed files")

    # Display first few files
    for i, filename in enumerate(processed_files[:10]):
        print(f"{i+1}. {filename}")

    if len(processed_files) > 10:
        print(f"... and {len(processed_files) - 10} more files")
else:
    print(f"Data path does not exist: {data_path}")
    print("Please upload your processed data to Google Drive first")

# Load data for training
training_data = {}
failed_loads = []

for filename in processed_files:
    try:
        # Strip only the trailing suffix; str.replace would also remove the
        # same text if it appeared earlier in the file name.
        symbol = filename[:-len(PROCESSED_SUFFIX)]
        df = pd.read_csv(os.path.join(data_path, filename))

        if len(df) > 100:  # Only include stocks with sufficient data
            training_data[symbol] = df
            print(f"✓ Loaded {symbol}: {len(df)} records")
        else:
            print(f"⚠ Skipped {symbol}: insufficient data ({len(df)} records)")

    except Exception as e:
        # Per-file boundary: one unreadable CSV must not abort the whole load.
        failed_loads.append((filename, str(e)))
        print(f"✗ Failed to load {filename}: {e}")

print(f"\nSuccessfully loaded {len(training_data)} stocks for training")
if failed_loads:
    print(f"Failed to load {len(failed_loads)} files")

In [None]:
# Cell 4: Import Trading System Modules
# The Keras/sklearn names and create_lstm_model are used unconditionally by
# the data-preparation and training cells visible in this notebook, so they
# are defined regardless of whether the project modules import successfully.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

try:
    from models.lstm_model import LSTMModel
    from models.lstm_trainer import LSTMTrainer
    print("✓ Successfully imported trading system modules")
except ImportError as e:
    print(f"✗ Import error: {e}")
    print("Falling back to simplified LSTM implementation")


def create_lstm_model(input_shape):
    """Create optimized LSTM model for Kaggle training.

    Args:
        input_shape: ``(timesteps, n_features)`` shape of one input sequence.

    Returns:
        A compiled Keras ``Sequential`` binary classifier: three stacked LSTM
        layers (128/64/32 units) with 20% dropout after each, a 16-unit ReLU
        layer, and a single sigmoid output. Compiled with Adam,
        binary cross-entropy loss and an accuracy metric.
    """
    model = Sequential([
        LSTM(128, return_sequences=True, input_shape=input_shape),
        Dropout(0.2),
        LSTM(64, return_sequences=True),
        Dropout(0.2),
        LSTM(32),
        Dropout(0.2),
        Dense(16, activation='relu'),
        # Keep the output in float32: under a mixed_float16 global policy a
        # float16 sigmoid/loss can be numerically unstable.
        Dense(1, activation='sigmoid', dtype='float32')
    ])

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    return model


print("✓ Simplified LSTM implementation ready")

In [None]:
# Cell 5: Training Configuration
# Kaggle-optimized training parameters
TRAINING_CONFIG = {
    'batch_size': 256,  # Placeholder; overwritten below from the detected device
    'epochs': 50,       # Reduced for faster training
    'sequence_length': 60,
    'validation_split': 0.2,
    'patience': 10,     # Early stopping patience
    'lr_patience': 5,   # Learning rate reduction patience
}

# Batch size per GPU model; entries are matched in order against the device name.
GPU_BATCH_SIZES = (
    ('V100', 512),
    ('P100', 384),
    ('T4', 256),
)

# Detect GPU type and adjust batch size
if gpus:
    # The details dict is not guaranteed to contain every key, so avoid a
    # KeyError when 'device_name' is missing.
    gpu_details = tf.config.experimental.get_device_details(gpus[0])
    gpu_name = gpu_details.get('device_name', 'Unknown GPU')
else:
    gpu_name = 'CPU'
print(f"Detected GPU: {gpu_name}")

for gpu_tag, tuned_batch_size in GPU_BATCH_SIZES:
    if gpu_tag in gpu_name:
        TRAINING_CONFIG['batch_size'] = tuned_batch_size
        print(f"Optimized for {gpu_tag}: batch_size = {tuned_batch_size}")
        break
else:
    # CPU or an unrecognized GPU model.
    TRAINING_CONFIG['batch_size'] = 128
    print("Conservative batch_size = 128")

print(f"Training configuration: {TRAINING_CONFIG}")

In [None]:
# Cell 6: Data Preparation Function
def prepare_lstm_data(df, sequence_length=60, test_size=0.2):
    """Prepare data for LSTM training.

    Builds sliding windows of ``sequence_length`` consecutive rows from the
    known feature columns present in ``df``. The window covering rows
    ``[i - sequence_length, i)`` is paired with ``target[i]``. When ``df`` has
    no ``'target'`` column, the label is 1 if the next row's ``'close'`` is
    greater than the current row's, else 0.

    The split is chronological (the last ``test_size`` fraction of windows is
    the test set) and the scaler is fit only on rows used by training windows.
    Shuffling overlapping windows, or scaling with statistics of the whole
    series, would leak test-period information into training.

    Args:
        df: DataFrame of one symbol's rows, assumed to be in time order.
        sequence_length: Number of rows per input window.
        test_size: Fraction of the windows (taken from the end) used for testing.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as NumPy arrays, or
        ``(None, None, None, None)`` when the data cannot be prepared (no
        feature columns, too few rows, or any error, which is printed).
    """
    try:
        # Select features (adjust based on your processed data columns)
        feature_columns = [
            'close', 'volume', 'rsi_14', 'sma_50', 'sma_200',
            'macd', 'macd_signal', 'bb_upper', 'bb_lower', 'atr_14'
        ]

        # Filter available columns
        available_features = [col for col in feature_columns if col in df.columns]

        if not available_features:
            print(f"No feature columns found. Available columns: {df.columns.tolist()}")
            return None, None, None, None

        # Prepare features and target
        # NOTE(review): rows containing NaN are passed through unchanged;
        # confirm the upstream processing already drops them.
        features = df[available_features].values
        if 'target' in df.columns:
            target = df['target'].values
        else:
            # The last row has no "next close", so its label is dropped.
            target = (df['close'].shift(-1) > df['close']).astype(int).values[:-1]

        # Windows end at i = sequence_length .. len(features) - 2.
        n_samples = len(features) - 1 - sequence_length
        if n_samples < 2:
            print(f"Not enough rows ({len(features)}) for sequence_length={sequence_length}")
            return None, None, None, None

        n_test = max(1, int(round(n_samples * test_size)))
        n_train = n_samples - n_test
        if n_train < 1:
            print(f"Not enough samples ({n_samples}) for test_size={test_size}")
            return None, None, None, None

        # Imported here so the function does not depend on which branch of the
        # module-import cell ran.
        from sklearn.preprocessing import MinMaxScaler

        # Scale features using statistics of the training period only: the
        # training windows touch the first sequence_length + n_train - 1 rows.
        scaler = MinMaxScaler()
        scaler.fit(features[:sequence_length + n_train - 1])
        features_scaled = scaler.transform(features)

        # Create sequences
        X = np.stack([
            features_scaled[start:start + sequence_length]
            for start in range(n_samples)
        ])
        y = np.asarray(target[sequence_length:sequence_length + n_samples])

        # Chronological train-test split
        return X[:n_train], X[n_train:], y[:n_train], y[n_train:]

    except Exception as e:
        # Contract with the training loop: report and signal failure with Nones.
        print(f"Error preparing data: {e}")
        return None, None, None, None

print("Data preparation function ready")

In [None]:
# Cell 7: Training Loop
# Trains one model per loaded symbol, saves it under models_path and records
# either its metrics or the failure reason in training_results.
training_results = {}
training_start_time = datetime.now()
n_stocks = len(training_data)

print(f"Starting LSTM training for {n_stocks} stocks...")
print(f"Training started at: {training_start_time}")

for i, (symbol, df) in enumerate(training_data.items()):
    print(f"\n{'='*60}")
    print(f"Training {i+1}/{n_stocks}: {symbol}")
    print(f"{'='*60}")

    # Pre-bind so the cleanup in `finally` is valid on every path.
    model = history = None
    X_train = X_test = y_train = y_test = None

    try:
        # Prepare data
        X_train, X_test, y_train, y_test = prepare_lstm_data(df, TRAINING_CONFIG['sequence_length'])

        if X_train is None:
            # Record the skip so every symbol appears in the results and the
            # success/failure counts add up to the number of stocks.
            print(f"⚠ Skipping {symbol}: data preparation failed")
            training_results[symbol] = {'error': 'data preparation failed'}
        else:
            print(f"Data shape - Train: {X_train.shape}, Test: {X_test.shape}")

            # Create model
            model = create_lstm_model((X_train.shape[1], X_train.shape[2]))

            # Setup callbacks
            callbacks = [
                EarlyStopping(
                    monitor='val_loss',
                    patience=TRAINING_CONFIG['patience'],
                    restore_best_weights=True,
                    verbose=1
                ),
                ReduceLROnPlateau(
                    monitor='val_loss',
                    patience=TRAINING_CONFIG['lr_patience'],
                    factor=0.5,
                    verbose=1
                )
            ]

            # Train model
            history = model.fit(
                X_train, y_train,
                batch_size=TRAINING_CONFIG['batch_size'],
                epochs=TRAINING_CONFIG['epochs'],
                validation_split=TRAINING_CONFIG['validation_split'],
                callbacks=callbacks,
                verbose=1
            )

            # Evaluate model
            test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)

            # Save model
            model_path = os.path.join(models_path, f'{symbol}_model.keras')
            model.save(model_path)

            # Store results
            training_results[symbol] = {
                'test_accuracy': float(test_accuracy),
                'test_loss': float(test_loss),
                'epochs_trained': len(history.history['loss']),
                'best_val_accuracy': float(max(history.history['val_accuracy'])),
                'model_path': model_path
            }

            print(f"✓ {symbol} - Test Accuracy: {test_accuracy:.4f}, Loss: {test_loss:.4f}")

    except Exception as e:
        # Per-symbol boundary: one failing stock must not stop the whole run.
        print(f"✗ Error training {symbol}: {e}")
        training_results[symbol] = {'error': str(e)}

    finally:
        # Clear memory on every path. `history` also references the model, so
        # it is released together with the arrays before resetting Keras state.
        model = history = None
        X_train = X_test = y_train = y_test = None
        tf.keras.backend.clear_session()

    # Progress update
    elapsed_time = datetime.now() - training_start_time
    avg_time_per_stock = elapsed_time / (i + 1)
    remaining_stocks = n_stocks - (i + 1)
    estimated_remaining = avg_time_per_stock * remaining_stocks

    print(f"Progress: {i+1}/{n_stocks} ({(i+1)/n_stocks*100:.1f}%)")
    print(f"Elapsed: {elapsed_time}, Estimated remaining: {estimated_remaining}")

training_end_time = datetime.now()
total_training_time = training_end_time - training_start_time

print(f"\n{'='*60}")
print("TRAINING COMPLETED!")
print(f"{'='*60}")
print(f"Total time: {total_training_time}")
print(f"Successfully trained: {sum(1 for r in training_results.values() if 'error' not in r)} models")
print(f"Failed: {sum(1 for r in training_results.values() if 'error' in r)} models")

In [None]:
# Cell 8: Save Training Results
# Writes a JSON summary of the run to logs_path and prints accuracy statistics
# over the models that trained without error.
import json

# Split the per-symbol outcomes once; an 'error' key marks a failed entry.
successful_results = [res for res in training_results.values() if 'error' not in res]
failed_count = sum(1 for res in training_results.values() if 'error' in res)

# Prepare summary
summary = {
    'training_start': training_start_time.isoformat(),
    'training_end': training_end_time.isoformat(),
    'total_duration': str(total_training_time),
    'total_stocks': len(training_data),
    'successful_models': len(successful_results),
    'failed_models': failed_count,
    'training_config': TRAINING_CONFIG,
    'gpu_info': gpu_name,
    'results': training_results
}

# Save to Google Drive; the file name carries the run's start timestamp.
run_stamp = training_start_time.strftime("%Y%m%d_%H%M%S")
results_file = os.path.join(logs_path, 'lstm_training_results_' + run_stamp + '.json')
with open(results_file, 'w') as out:
    json.dump(summary, out, indent=2)

print(f"Training results saved to: {results_file}")

# Display summary statistics
if successful_results:
    accuracies = [res['test_accuracy'] for res in successful_results]
    print("\nAccuracy Statistics:")
    for label, statistic in (("Mean", np.mean), ("Std", np.std), ("Min", np.min), ("Max", np.max)):
        print(f"{label}: {statistic(accuracies):.4f}")

print(f"\nAll models saved to: {models_path}")
print("Training completed successfully! 🎉")