# Collect and Process Data for Stock Trends from yFinance

In [8]:
import yfinance as yf

In [9]:
yf.__version__

'0.2.61'

In [11]:
df_ = yf.download(['AAPL','TSM'], start="2024-01-01")

[*********************100%***********************]  2 of 2 completed


In [12]:
df_

Price,Close,Close,High,High,Low,Low,Open,Open,Volume,Volume
Ticker,AAPL,TSM,AAPL,TSM,AAPL,TSM,AAPL,TSM,AAPL,TSM
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
2024-01-02,184.290421,99.724220,187.070068,100.794838,182.553143,98.800936,185.789438,100.431416,82488700,9020900
2024-01-03,182.910522,98.388412,184.528677,99.449206,182.096477,97.887489,182.880742,98.555387,58414500,6650600
2024-01-04,180.587555,97.366905,181.758969,98.653611,179.565044,97.357088,180.825800,97.759792,71983600,7996700
2024-01-05,179.862839,97.838371,181.431354,98.850051,178.860187,97.042780,180.666963,97.239220,62303300,7344900
2024-01-08,184.210999,100.421593,184.250716,100.961814,180.180517,98.712542,180.766224,98.840227,59144500,12455600
...,...,...,...,...,...,...,...,...,...,...
2025-05-22,201.360001,196.190002,202.750000,196.830002,199.699997,191.339996,200.710007,191.339996,46742400,11825000
2025-05-23,195.270004,191.979996,197.699997,192.800003,193.460007,190.029999,193.669998,192.020004,78432900,9403100
2025-05-27,200.210007,197.679993,200.740005,198.309998,197.429993,193.699997,198.300003,194.100006,56288500,12228200
2025-05-28,200.419998,196.139999,202.729996,198.070007,199.899994,195.479996,200.589996,196.089996,45339700,17787600


In [17]:
import get_all_tickers

# A real list of 300 would require extensive data scraping and cleaning.

top_global_tickers = [
    # US Tech Giants
    'AAPL', 'MSFT', 'GOOG', 'GOOGL', 'AMZN', 'NVDA', 'META', 'TSLA', 'ADBE', 'CRM', 'NFLX', 'INTC', 'CSCO',
    # US Blue Chips/Diversified
    'JPM', 'XOM', 'JNJ', 'PG', 'V', 'MA', 'UNH', 'HD', 'KO', 'PEP', 'DIS', 'BAC', 'WMT', 'BRK-B', 'LLY', 'AVGO', 'ORCL',
    # European Giants (examples - remember exchange suffixes)
    'ASML.AS', 'SAP.DE', 'RMS.PA', 'SIE.DE', 'HSBA.L', 'BP.L', 'SHEL.L', 'VOD.L', 'DAI.DE', 'BNP.PA', 'OR.PA', 'NOVN.SW', 'ROG.SW',
    'LVMUY', # LVMH Moët Hennessy Louis Vuitton SE (ADR)
    'NVO', # Novo Nordisk A/S (ADR)
    'SMNEY', # Siemens AG (ADR)
    # Asian Giants (examples - remember exchange suffixes)
    '7203.T', '9984.T', '8058.T', '0005.HK', '0700.HK', '0001.HK', '2330.TW', '2454.TW', '005930.KS', '000660.KS',
    'RELIANCE.NS', 'TCS.NS', 'BABA', 'TCEHY',
    '2222.SR', # Saudi Aramco
    'ICBC.HK', # Industrial and Commercial Bank of China
    '601398.SS', # ICBC (Shanghai)
    # Canadian (examples)
    'RY', 'TD', 'ENB',
    # Australian (examples)
    'CBA.AX', 'BHP.AX', 'ANZ.AX',
    # Add more from various regions and sectors to reach 300
    # ... this list would be much longer ...
]

# You would extend top_global_tickers significantly to reach 300.
# A programmatic way using 'get-all-tickers' would be more efficient:

try:
    from get_all_tickers import get_biggest_n_tickers, Region, SectorConstants
    # This might give you a good starting point for the top N global companies
    # Note: 'get_biggest_n_tickers' might prioritize US listings or those readily available.
    # You might need to combine it with regional searches.
    top_300_from_library = get_biggest_n_tickers(300)
    print(f"Fetched {len(top_300_from_library)} tickers using get_biggest_n_tickers.")
    print(top_300_from_library[:10]) # Print first 10 for review

    # Example of getting tickers by region (you'd need to combine these)
    # asia_tickers = get_tickers_by_region(Region.ASIA)
    # print(f"Fetched {len(asia_tickers)} tickers from Asia.")

    # Let's use the programmatically generated list for fetching data
    final_tickers_to_fetch = top_300_from_library

except ImportError:
    print("The 'get-all-tickers' library is not installed. Please install it using 'pip install get-all-tickers'.")
    print("Proceeding with a very small, hardcoded example list for demonstration.")
    final_tickers_to_fetch = ['AAPL', 'MSFT', 'GOOG', '2330.TW', 'HSBA.L'] # Fallback for demonstration

# Now, fetch data using yfinance
print(f"\nAttempting to fetch data for {len(final_tickers_to_fetch)} tickers using yfinance...")

data = {}
for ticker_symbol in final_tickers_to_fetch:
    try:
        ticker = yf.Ticker(ticker_symbol)
        # Fetch current price or a small amount of history to confirm it works
        hist = ticker.history(period="1d") # Get last day's data
        if not hist.empty:
            data[ticker_symbol] = hist
            print(f"Successfully fetched data for {ticker_symbol}")
        else:
            print(f"No data returned for {ticker_symbol}. It might be an invalid ticker or no recent trading.")
    except Exception as e:
        print(f"Error fetching data for {ticker_symbol}: {e}")

print("\n--- Summary of fetched data ---")
for ticker, df in data.items():
    print(f"Ticker: {ticker}, Data Points: {len(df)}")

The 'get-all-tickers' library is not installed. Please install it using 'pip install get-all-tickers'.
Proceeding with a very small, hardcoded example list for demonstration.

Attempting to fetch data for 5 tickers using yfinance...
Successfully fetched data for AAPL
Successfully fetched data for MSFT
Successfully fetched data for GOOG
Successfully fetched data for 2330.TW
Successfully fetched data for HSBA.L

--- Summary of fetched data ---
Ticker: AAPL, Data Points: 1
Ticker: MSFT, Data Points: 1
Ticker: GOOG, Data Points: 1
Ticker: 2330.TW, Data Points: 1
Ticker: HSBA.L, Data Points: 1


# Time-Series Model

((619025, 15, 4), (619025,))

(array([[0.00658545, 0.00649954, 0.0064565 , 0.01359914],
        [0.00649732, 0.0064463 , 0.00627456, 0.01436664],
        [0.00628189, 0.00620433, 0.00619588, 0.01314381],
        [0.00620844, 0.00641243, 0.00626964, 0.01659475],
        [0.0065218 , 0.00642211, 0.00573365, 0.05156577],
        [0.00602728, 0.00625272, 0.00611228, 0.02527831],
        [0.00622313, 0.00622852, 0.00618604, 0.01836575],
        [0.00614479, 0.00608334, 0.00572873, 0.02381803],
        [0.0058755 , 0.00593331, 0.00560579, 0.01928401],
        [0.00585102, 0.00576393, 0.00575823, 0.0098205 ],
        [0.00586571, 0.00584136, 0.00565497, 0.01162401],
        [0.00564048, 0.00567681, 0.00550745, 0.01523524],
        [0.00570903, 0.0057736 , 0.00574348, 0.01195414],
        [0.00581185, 0.00577844, 0.00584675, 0.00993728],
        [0.00575309, 0.00593331, 0.00581232, 0.01193198]]),
 np.float64(0.006012474296794487))

# Add Early Stopping, Model Saving, and Model Checkpointing

In [163]:
import torch
import torch.nn as nn
import numpy as np
import os
from datetime import datetime

# Early Stopping Class
class EarlyStopping:
    def __init__(self, patience=5, min_delta=0.01, restore_best_weights=True):
        self.patience = patience
        self.min_delta = min_delta
        self.restore_best_weights = restore_best_weights
        self.best_loss = None
        self.counter = 0
        self.best_weights = None

    def __call__(self, val_loss, model):
        if self.best_loss is None:
            self.best_loss = val_loss
            self.save_checkpoint(model)
        elif val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
            self.save_checkpoint(model)
        else:
            self.counter += 1

        if self.counter >= self.patience:
            if self.restore_best_weights:
                model.load_state_dict(self.best_weights)
            return True
        return False

    def save_checkpoint(self, model):
        self.best_weights = model.state_dict().copy()



In [164]:
# Model Saving Utilities
def save_model(model, optimizer, epoch, loss, filepath):
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
        'timestamp': datetime.now().isoformat()
    }
    torch.save(checkpoint, filepath)
    print(f"Model saved to {filepath}")

def load_model(model, optimizer, filepath):
    checkpoint = torch.load(filepath)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    loss = checkpoint['loss']
    print(f"Model loaded from {filepath}")
    print(f"Resumed from epoch {epoch}, loss: {loss:.6f}")
    return epoch, loss

In [165]:
def make_prediction(model, X_data, scaler, features, target, sequence_length=30):
    """
    Make a prediction using the trained model.

    Parameters:
    - model: Trained LSTM model
    - X_data: Input data for prediction
    - scaler: Scaler used for normalization
    - features: List of feature names
    - target: Target variable name
    - sequence_length: Length of the input sequence

    Returns:
    - Predicted value for the target variable
    """
    model.eval()  # Set model to evaluation mode
    with torch.no_grad():
        if len(X_data.shape) == 3:
            # If X_data is already in the correct shape, no need to reshape
            last_sequence = torch.tensor(X_data[-1], dtype=torch.float32).unsqueeze(0)
        else:
            # If X_data is 2D, reshape it to match the expected input shape
            last_sequence = torch.tensor(X_data[-sequence_length:], dtype=torch.float32).unsqueeze(0)

        print(f"Input shape for prediction: {last_sequence.shape}")

        # Maker prediction
        # Prepare input data
        predicted_norm = model(last_sequence).item()
        print(f"Predicted norm for prediction: {predicted_norm}")


        # Inverse normalize the predicted value
        # Create a dummy array to hold the last sequence and the predicted value
        dummy = np.zeros((1, len(features) + 1)) # +1 for the target column
        dummy[0, :-1] = last_sequence.numpy().flatten() # fill with the last sequence values

        # Inverse transform the predicted value
        predicted_close = scaler.inverse_transform(dummy)[0, -1]

        return predicted_close, predicted_norm

In [None]:
# Enhannced training with early stopping and model savling
def train_model_with_enhancements(model, train_loader, val_loader, X_data, scaler, features, num_epochs=100, patience=10, save_dir="models"):
    # create save directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)
    # Initialize training components
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
    scheduler = torch.optim.lr_scheduler.RedduceLROnPlateau(
        optimizer, mode="min", factor=0.5, patience=3
    )
    # Early stopping
    early_stopping = EarlyStopping(patience=patience, min_delta=1e-6)

    # Training loop
    train_losses = []
    val_losses = []

    print("Starting enhanced training with early stopping...")
    print(f"Model will be saved to {save_dir}")

    best_val_loss = float("inf")

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_epoch_losses = []
        train_nan_batches = 0

        for batch_idx, (xb, yb) in enumerate(train_loader):
            # Use safe training step
            loss_value = safe_training_step(model, criterion, optimizer, xb, yb)

            if loss_value is None:
                train_nan_batches += 1
                continue
            train_epoch_losses.append(loss_value)

        # Validation phase
        model.val()
        val_epoch_losses = []
        val_nan_batches = 0

        with torch.no_grad():
            for xb, yb in val_loader:
                if torch.isnan(xb).any() or torch.isnan(yb).any():
                    val_nan_batches += 1
                    continue


                pred = model(xb)
                if torch.isnan(pred).any():
                    val_nan_batches += 1
                    continue

                loss = criterion(pred, yb)
                if not torch.isnan(loss):
                    val_epoch_losses.append(loss.item())

        # Calculate average losses
        if train_epoch_losses and val_epoch_losses:
            avg_train_loss = np.mean(train_epoch_losses)
            avg_val_loss = np.mean(val_epoch_losses)

            train_losses.append(avg_train_loss)
            val_losses.append(avg_val_loss)

            # Update learning rate
            scheduler.step(avg_val_loss)
            current_lr = optimizer.param_groups[0]['lr']

            # print progress
            print(f"Epoch {epoch+1}/{num_epochs}")
            print(f" Train Loss: {avg_train_loss: .6f}")
            print(f" Val Loss: {avg_val_loss: .6f}")
            print(f" Learning Rate: {current_lr:.2e}")

            if train_nan_batches > 0:
                print(f" Skipped train batches: {train_nan_batches}")
            if val_nan_batches > 0:
                print(f" Skipped val batches: {val_nan_batches}")

            # Save the best model
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                best_model_path = os.path.join(save_dir, "best_model.pth")
                save_model(model, optimizer, epoch, avg_val_loss, best_model_path)

                # Make prediction with best model
                try:
                    predicted_price, norm_pred = make_prediction(model, X_data, scaler, features)
                    print(f" Prediction with best model: ${predicted_price:.2f}")
                except Exception as e:
                    print(f" Prediction failed: {e}")

            # Early stopping check
            if early_stopping(avg_val_loss, model):
                print(f"\nEarly stopping triggered after {epoch+1} epochs")
                print(f"Best validation loss: {best_val_loss:.6f}")
                break
        else:
            print(f"Epoch {epoch+1}/{num_epochs}: All batches failed.")
            break

        print("-"*60)

    # Save final model
    final_model_path = os.path.join(save_dir, "final_model.pth")
    if train_losses:
        save_model(model, optimizer, epoch, train_losses[-1], final_model_path)

    if os.path.exists(os.path.join(save_dir, "best_model.pth")):
        load_model(model, optimizer, os.path.join(save_dir, "best_model.pth"))

    return train_losses, val_losses, best_val_loss



In [173]:
import torch
import torch.nn as nn
import numpy as np
import os
from datetime import datetime

# Early Stopping Class
class EarlyStopping:
    def __init__(self, patience=7, min_delta=0, restore_best_weights=True):
        self.patience = patience
        self.min_delta = min_delta
        self.restore_best_weights = restore_best_weights
        self.best_loss = None
        self.counter = 0
        self.best_weights = None

    def __call__(self, val_loss, model):
        if self.best_loss is None:
            self.best_loss = val_loss
            self.save_checkpoint(model)
        elif val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
            self.save_checkpoint(model)
        else:
            self.counter += 1

        if self.counter >= self.patience:
            if self.restore_best_weights:
                model.load_state_dict(self.best_weights)
            return True
        return False

    def save_checkpoint(self, model):
        """Save model when validation loss decreases."""
        self.best_weights = model.state_dict().copy()

# Model saving utilities
def save_model(model, optimizer, epoch, loss, filepath, scheduler=None):
    """Save complete model checkpoint"""
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
        'timestamp': datetime.now().isoformat()
    }
    if scheduler is not None:
        checkpoint['scheduler_state_dict'] = scheduler.state_dict()

    torch.save(checkpoint, filepath)
    print(f"✅ Model saved to {filepath}")

def load_model(model, optimizer, filepath, scheduler=None):
    """Load model checkpoint"""
    checkpoint = torch.load(filepath)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    if scheduler is not None and 'scheduler_state_dict' in checkpoint:
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

    epoch = checkpoint['epoch']
    loss = checkpoint['loss']
    print(f"✅ Model loaded from {filepath}")
    print(f"   Resumed from epoch {epoch}, loss: {loss:.6f}")
    return epoch, loss

# Create model with proper parameter order
model = LSTMModel(
    input_size=len(features),
    output_size=1,  # Specify output_size explicitly
    hidden_size=64,
    num_layers=2,
    dropout=0.2  # Optional: add dropout for better stability
)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

# Add learning rate scheduler for better training stability
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=3
)

# Initialize early stopping
early_stopping = EarlyStopping(patience=7, min_delta=1e-6)

# Create directories for saving models
save_dir = 'model_checkpoints'
os.makedirs(save_dir, exist_ok=True)

# Training history tracking
train_losses = []
best_loss = float('inf')
start_epoch = 0

# Check if there's a checkpoint to resume from
checkpoint_path = os.path.join(save_dir, 'latest_checkpoint.pth')
if os.path.exists(checkpoint_path):
    print(f"Found checkpoint at {checkpoint_path}")
    try:
        start_epoch, _ = load_model(model, optimizer, checkpoint_path, scheduler)
        start_epoch += 1  # Start from next epoch
        print(f"Resuming training from epoch {start_epoch}")
    except Exception as e:
        print(f"Failed to load checkpoint: {e}")
        print("Starting training from scratch...")
        start_epoch = 0

print("Starting enhanced training with NaN-safe LSTM model...")
print(f"Models will be saved to: {save_dir}")
print(f"Training from epoch {start_epoch + 1} to {start_epoch + 10}")

for epoch in range(start_epoch, start_epoch + 10):
    model.train()  # Set model to training mode
    epoch_losses = []
    nan_batches = 0

    print(f"\n📊 Epoch {epoch+1}/{start_epoch + 10}")

    for batch_idx, (xb, yb) in enumerate(dataloader):
        # Input validation (the model will also check this)
        if torch.isnan(xb).any() or torch.isnan(yb).any():
            print(f"⚠️  Warning: NaN in input data at batch {batch_idx}, skipping...")
            nan_batches += 1
            continue

        # Use the safe training step function
        loss_value = safe_training_step(model, criterion, optimizer, xb, yb)

        if loss_value is None:
            print(f"⚠️  Warning: Training step failed at batch {batch_idx}")
            nan_batches += 1
            continue

        epoch_losses.append(loss_value)

        # Optional: Print progress every 100 batches
        if batch_idx % 100 == 0 and batch_idx > 0:
            current_avg = sum(epoch_losses[-100:]) / min(100, len(epoch_losses))
            print(f"   Batch {batch_idx}: Current Loss: {loss_value:.6f}, Avg(last 100): {current_avg:.6f}")

    # Calculate average loss for the epoch
    if epoch_losses:
        avg_loss = sum(epoch_losses) / len(epoch_losses)
        train_losses.append(avg_loss)

        print(f"✅ Epoch {epoch+1} completed:")
        print(f"   Average Loss: {avg_loss:.6f}")
        print(f"   Processed batches: {len(epoch_losses)}")
        if nan_batches > 0:
            print(f"   Skipped batches (NaN): {nan_batches}")

        # Update learning rate scheduler
        scheduler.step(avg_loss)
        current_lr = optimizer.param_groups[0]['lr']
        print(f"   Current Learning Rate: {current_lr:.2e}")

        # Save best model
        if avg_loss < best_loss:
            best_loss = avg_loss
            best_model_path = os.path.join(save_dir, 'best_model.pth')
            save_model(model, optimizer, epoch, avg_loss, best_model_path, scheduler)
            print(f"   🏆 New best model! Loss: {avg_loss:.6f}")

            # Try to make a prediction with the best model
            try:
                model.eval()
                with torch.no_grad():
                    # Get a sample from the dataloader for prediction demo
                    sample_batch = next(iter(dataloader))
                    sample_x, sample_y = sample_batch
                    if not torch.isnan(sample_x).any():
                        pred = model(sample_x[:1])  # Predict for first sample
                        print(f"   🎯 Sample prediction: {pred.item():.6f} (target: {sample_y[0].item():.6f})")
                model.train()
            except Exception as e:
                print(f"   ⚠️  Prediction demo failed: {e}")

        # Save checkpoint every epoch
        checkpoint_path = os.path.join(save_dir, 'latest_checkpoint.pth')
        save_model(model, optimizer, epoch, avg_loss, checkpoint_path, scheduler)

        # Check model health
        if not check_model_weights(model):
            print("❌ ERROR: Model weights corrupted with NaN/Inf. Stopping training.")
            break

        # Early stopping check
        if early_stopping(avg_loss, model):
            print(f"\n🛑 Early stopping triggered after epoch {epoch+1}")
            print(f"   Best loss achieved: {best_loss:.6f}")
            break

    else:
        print(f"❌ Epoch {epoch+1}: All batches failed! Stopping training.")
        break

    print("-" * 60)

print("\n🎉 Training completed!")

# Load best model for final evaluation
best_model_path = os.path.join(save_dir, 'best_model.pth')
if os.path.exists(best_model_path):
    print("\n🔄 Loading best model for final evaluation...")
    load_model(model, optimizer, best_model_path, scheduler)

# Final model evaluation
print("\n📈 Final model evaluation...")
model.eval()
with torch.no_grad():
    total_loss = 0
    num_batches = 0
    predictions = []
    targets = []

    for xb, yb in dataloader:
        if torch.isnan(xb).any() or torch.isnan(yb).any():
            continue

        pred = model(xb)
        if not torch.isnan(pred).any():
            loss = criterion(pred, yb)
            if not torch.isnan(loss):
                total_loss += loss.item()
                num_batches += 1

                # Store some predictions for analysis
                if len(predictions) < 10:
                    predictions.extend(pred.cpu().numpy().flatten()[:5])
                    targets.extend(yb.cpu().numpy().flatten()[:5])

    if num_batches > 0:
        final_avg_loss = total_loss / num_batches
        print(f"✅ Final average validation loss: {final_avg_loss:.6f}")
        print(f"   Evaluated on {num_batches} batches")

        # Show some sample predictions vs targets
        if predictions and targets:
            print("\n📊 Sample Predictions vs Targets:")
            for i, (pred, target) in enumerate(zip(predictions[:5], targets[:5])):
                diff = abs(pred - target)
                print(f"   Sample {i+1}: Pred={pred:.6f}, Target={target:.6f}, Diff={diff:.6f}")
    else:
        print("❌ No valid batches for final evaluation")

# Check final model state
if check_model_weights(model):
    print("✅ Final model weights are healthy")
else:
    print("❌ Final model weights contain NaN/Inf")

# Training summary
print(f"\n📋 Training Summary:")
print(f"   Total epochs completed: {len(train_losses)}")
print(f"   Best training loss: {min(train_losses) if train_losses else 'N/A':.6f}")
print(f"   Final training loss: {train_losses[-1] if train_losses else 'N/A':.6f}")
print(f"   Models saved in: {save_dir}")

# Save final model
final_model_path = os.path.join(save_dir, 'final_model.pth')
if train_losses:
    save_model(model, optimizer, len(train_losses)-1, train_losses[-1], final_model_path, scheduler)

print(f"\n💾 Saved models:")
print(f"   • Best model: {os.path.join(save_dir, 'best_model.pth')}")
print(f"   • Final model: {final_model_path}")
print(f"   • Latest checkpoint: {os.path.join(save_dir, 'latest_checkpoint.pth')}")

# Prediction function for future use
def make_prediction_with_saved_model(model_path, input_data, scaler=None, features=None):
    """
    Make predictions using a saved model

    Args:
        model_path: Path to saved model
        input_data: Input sequence for prediction
        scaler: Optional scaler for inverse transform
        features: Optional feature list for scaler
    """
    # Load model
    checkpoint = torch.load(model_path)

    # Create new model instance (you'll need to adjust parameters as needed)
    pred_model = LSTMModel(
        input_size=input_data.shape[-1] if len(input_data.shape) > 1 else len(features),
        output_size=1,
        hidden_size=64,
        num_layers=2,
        dropout=0.2
    )

    pred_model.load_state_dict(checkpoint['model_state_dict'])
    pred_model.eval()

    with torch.no_grad():
        if not isinstance(input_data, torch.Tensor):
            input_data = torch.tensor(input_data, dtype=torch.float32)

        if len(input_data.shape) == 2:
            input_data = input_data.unsqueeze(0)  # Add batch dimension

        prediction = pred_model(input_data).item()

        # Inverse transform if scaler provided
        if scaler is not None and features is not None:
            dummy = np.zeros((1, len(features) + 1))
            dummy[0, -1] = prediction
            actual_prediction = scaler.inverse_transform(dummy)[0, -1]
            return actual_prediction, prediction

        return prediction

print(f"\n🎯 To make predictions later, use:")
print(f"   prediction = make_prediction_with_saved_model('{best_model_path}', your_input_data)")
print(f"   # Or with inverse scaling:")
print(f"   actual_price, norm_pred = make_prediction_with_saved_model('{best_model_path}', your_input_data, scaler, features)")

Starting enhanced training with NaN-safe LSTM model...
Models will be saved to: model_checkpoints
Training from epoch 1 to 10

📊 Epoch 1/10
   Batch 100: Current Loss: 0.601820, Avg(last 100): 0.589328
   Batch 200: Current Loss: 0.288965, Avg(last 100): 0.483820
   Batch 300: Current Loss: 0.455564, Avg(last 100): 0.467640
   Batch 400: Current Loss: 0.407235, Avg(last 100): 0.411500
   Batch 500: Current Loss: 0.822177, Avg(last 100): 0.405119
   Batch 600: Current Loss: 0.402526, Avg(last 100): 0.410800
   Batch 700: Current Loss: 0.300243, Avg(last 100): 0.363766
   Batch 800: Current Loss: 0.172783, Avg(last 100): 0.323188
   Batch 900: Current Loss: 0.192158, Avg(last 100): 0.314985
   Batch 1000: Current Loss: 0.160059, Avg(last 100): 0.247663
   Batch 1100: Current Loss: 0.392816, Avg(last 100): 0.282599
   Batch 1200: Current Loss: 0.276252, Avg(last 100): 0.225787
   Batch 1300: Current Loss: 0.144767, Avg(last 100): 0.227757
   Batch 1400: Current Loss: 0.057898, Avg(last 10

In [172]:
# 1. Create your model
model = LSTMModel(input_size=len(features), output_size=1, hidden_size=64, num_layers=2)

# 2. Train with all enhancements
train_losses, val_losses, best_loss = train_model_with_enhancements(
    model=model,
    train_loader=train_dataloader,
    val_loader=val_dataloader,
    X_data=X,  # Your full feature array
    scaler=scaler,  # Your fitted scaler
    features=features,  # List of feature names
    num_epochs=100,
    patience=10,  # Stop if no improvement for 10 epochs
    save_dir='models'  # Where to save models
)

# 3. Make predictions
predicted_price, norm_pred = make_prediction(model, X, scaler, features)
print(f"Predicted close price for the next day: ${predicted_price:.2f}")

NameError: name 'train_dataloader' is not defined

In [189]:
input = [0.0623, 0.0359, 0.0427, 0.0682]
input = pd.DataFrame(input)


input_scaled = scaler.fit_transform(input)
input_scaled.shape


(4, 1)

# Predict with trained model

In [186]:
# Load and predict
actual_price, norm_pred = make_prediction_with_saved_model(
    'model_checkpoints/best_model.pth',
    input_sequence,
    scaler,
    features
)
print(f"Predicted close price for the next day: ${actual_price:.2f}")

RuntimeError: Error(s) in loading state_dict for LSTMModel:
	size mismatch for lstm1.weight_ih_l0: copying a param with shape torch.Size([256, 4]) from checkpoint, the shape in current model is torch.Size([256, 1]).

# Make Predictions of the next day

In [None]:
# Predict the next day using the last 30 days of data
last_sequence = torch.tensor(X[-1], dtype=torch.float32)
predicted_norm = model(last_sequence).item()

# Inverse normalize the predicted value
dummy = np.zeros((1, len(features) + 1))
dummy[0, -1] = predicted_norm
predicted_close = scaler.inverse_transform(dummy)[0, -1]
print(f"Predicted close price for the next day: {predicted_close:.2f}")

In [62]:
import requests
from collections import defaultdict

POLYGON_APIKEY = "SnsJtDM9NI3nlgJXmYpcWdeHo7g_u2Xb"
exchange_to_country = {
    "XNAS": "US",
    "XNYS": "US",
    "XTKS": "Japan",
    "XTAI": "Taiwan",
    "XSHG": "China",
    "XHKG": "Hong Kong"
}

# Step 1: Get tickers
url = "https://api.polygon.io/v3/reference/tickers"
params = {
    "apiKey": POLYGON_APIKEY,
    "limit": 1000,  # increase this or use pagination
    "active": "true",
    "market": "stocks"
}

response = requests.get(url, params=params)
tickers = response.json().get("results", [])

# Step 2: Organize by region
region_dict = defaultdict(list)

for ticker in tickers:
    ex = ticker.get("primary_exchange", "")
    country = exchange_to_country.get(ex)
    if country:
        region_dict[country].append(ticker)

# Step 3: Sort by market cap and select top 10
top_10_per_region = {}
for region, companies in region_dict.items():
    sorted_companies = sorted(companies, key=lambda x: x.get("market_cap", 0), reverse=True)
    top_10_per_region[region] = sorted_companies[:10]

# Step 4: Print results
for region, top_10 in top_10_per_region.items():
    print(f"Top 10 in {region}:")
    for company in top_10:
        print(f"{company['ticker']} - {company.get('name', '')} - Market Cap: {company.get('market_cap')}")
    print("-" * 40)


Top 10 in US:
A - Agilent Technologies Inc. - Market Cap: None
AA - Alcoa Corporation - Market Cap: None
AACB - Artius II Acquisition Inc. Class A Ordinary Shares - Market Cap: None
AACBR - Artius II Acquisition Inc. Rights - Market Cap: None
AACBU - Artius II Acquisition Inc. Units - Market Cap: None
AACG - ATA Creativity Global American Depositary Shares - Market Cap: None
AACIU - Armada Acquisition Corp. II Units - Market Cap: None
AACT - Ares Acquisition Corporation II - Market Cap: None
AACT.U - Ares Acquisition Corporation II Units, each consisting of one Class A ordinary share and one-half of one redeemable warrant - Market Cap: None
AACT.WS - Ares Acquisition Corporation II Redeemable Warrants, each whole warrant exercisable for one Class A ordinary share at an exercise price of $11.50 - Market Cap: None
----------------------------------------


In [63]:
top_10_per_region

{'US': [{'ticker': 'A',
   'name': 'Agilent Technologies Inc.',
   'market': 'stocks',
   'locale': 'us',
   'primary_exchange': 'XNYS',
   'type': 'CS',
   'active': True,
   'currency_name': 'usd',
   'cik': '0001090872',
   'composite_figi': 'BBG000C2V3D6',
   'share_class_figi': 'BBG001SCTQY4',
   'last_updated_utc': '2025-05-21T00:00:00Z'},
  {'ticker': 'AA',
   'name': 'Alcoa Corporation',
   'market': 'stocks',
   'locale': 'us',
   'primary_exchange': 'XNYS',
   'type': 'CS',
   'active': True,
   'currency_name': 'usd',
   'cik': '0001675149',
   'composite_figi': 'BBG00B3T3HD3',
   'share_class_figi': 'BBG00B3T3HF1',
   'last_updated_utc': '2025-05-29T00:00:00Z'},
  {'ticker': 'AACB',
   'name': 'Artius II Acquisition Inc. Class A Ordinary Shares',
   'market': 'stocks',
   'locale': 'us',
   'primary_exchange': 'XNAS',
   'type': 'CS',
   'active': True,
   'currency_name': 'usd',
   'cik': '0002034334',
   'last_updated_utc': '2025-05-29T00:00:00Z'},
  {'ticker': 'AACBR',
 

In [64]:
# Fetch OHLCV time series for a given ticker
def fetch_ohlcv(ticker, api_key, timespan="day", from_date="2024-01-01", to_date="2025-05-28"):
    url = f"https://api.polygon.io/v2/aggs/ticker/{ticker}/range/1/{timespan}/{from_date}/{to_date}"
    params = {"adjusted": "true", "sort": "asc", "limit": 50000, "apiKey": api_key}
    res = requests.get(url, params=params).json()
    return res["results"] if "results" in res else []


In [66]:
import pandas as pd
from tqdm import tqdm

api_key = POLYGON_APIKEY


all_data = []

for region, companies in top_10_per_region.items():
    for company in companies:
        ticker

for ticker in tqdm(tickers):
    data = fetch_ohlcv(ticker, api_key)
    for d in data:
        d["ticker"] = ticker
        all_data.append(d)

df = pd.DataFrame(all_data)
df["t"] = pd.to_datetime(df["t"], unit="ms")  # timestamp to datetime
df.rename(columns={"t": "date", "c": "close", "o": "open", "h": "high", "l": "low", "v": "volume"}, inplace=True)
df.to_csv("historical_prices.csv", index=False)


  9%|▉         | 89/1000 [01:08<11:44,  1.29it/s]


JSONDecodeError: Extra data: line 1 column 5 (char 4)

In [38]:
df

Unnamed: 0,volume,vw,open,close,high,low,date,n,ticker
0,0.326792,248.3450,0.326594,0.314881,0.310404,0.331362,2024-01-02 05:00:00,1177663,TSLA
1,0.406343,239.8167,0.311385,0.285368,0.294217,0.305978,2024-01-03 05:00:00,1273469,TSLA
2,0.316933,240.2989,0.294298,0.283828,0.285557,0.310402,2024-01-04 05:00:00,1001611,TSLA
3,0.267884,237.8911,0.287171,0.282526,0.278058,0.301523,2024-01-05 05:00:00,934668,TSLA
4,0.232411,238.5962,0.285024,0.291288,0.281343,0.302778,2024-01-08 05:00:00,970810,TSLA
...,...,...,...,...,...,...,...,...,...
1051,0.211519,454.3728,0.892956,0.867345,0.898238,0.894326,2025-05-21 04:00:00,320670,MSFT
1052,0.190612,456.1561,0.896228,0.887611,0.922018,0.911757,2025-05-22 04:00:00,334105,MSFT
1053,0.170568,451.0242,0.853427,0.846195,0.858862,0.870059,2025-05-23 04:00:00,323259,MSFT
1054,0.242362,458.9856,0.909404,0.939204,0.928757,0.930267,2025-05-27 04:00:00,362158,MSFT


In [37]:
# Basic preprocessing
df = pd.read_csv("historical_prices.csv")
df['date'] = pd.to_datetime(df['date'])

# Normalize per ticker
from sklearn.preprocessing import MinMaxScaler

def scale_ticker(df, columns):
    scalers = {}
    for ticker in df['ticker'].unique():
        scaler = MinMaxScaler()
        mask = df['ticker'] == ticker
        df.loc[mask, columns] = scaler.fit_transform(df.loc[mask, columns])
        scalers[ticker] = scaler
    return df, scalers

df, scalers = scale_ticker(df, ["close", "open", "high", "low", "volume"])
