In [11]:
import copy
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [12]:
# Notebook configuration: the tickers to download and the forecast granularity label.
stock_symbols = [
    "AAPL",
    "GOOG",
    "MSFT",
]
prediction_horizon = "daily"

In [13]:
# Step 2: Collect Data
# The Alpha Vantage key is read from the environment so the credential is not
# stored in (or shared with) the notebook.
api_key = os.environ.get("ALPHAVANTAGE_API_KEY")
if not api_key:
    raise RuntimeError("Set the ALPHAVANTAGE_API_KEY environment variable before running this cell.")

# Raw JSON payload per symbol, exactly as returned by the API.
data = {}
for stock_symbol in stock_symbols:
    response = requests.get(
        "https://www.alphavantage.co/query",
        params={
            "function": "TIME_SERIES_DAILY",
            "symbol": stock_symbol,
            "apikey": api_key,
        },
        timeout=30,  # do not let a stalled connection hang the kernel
    )
    response.raise_for_status()
    payload = response.json()
    # The preprocessing cell reads payload["Time Series (Daily)"]; report early
    # when it is missing. Only the keys are printed because the payload text
    # may echo request details.
    if "Time Series (Daily)" not in payload:
        print(f"No daily series returned for {stock_symbol}; payload keys: {list(payload)}")
    data[stock_symbol] = payload


In [30]:
# Scale and split the downloaded series for each stock symbol.
#
# A separate MinMaxScaler is fitted per symbol, on the training rows only, so
# the test rows do not influence the scaling. Fitted scalers are kept in
# `scalers` so predictions can later be mapped back to prices per symbol.
scalers = {}
scaler = MinMaxScaler()  # rebound below; ends up as the last symbol's fitted scaler
train_data = []
test_data = []

for stock_symbol in stock_symbols:
    try:
        # Transform JSON data to a DataFrame with one row per trading day
        df = pd.DataFrame(data[stock_symbol]["Time Series (Daily)"]).T
        df.columns = ['open', 'high', 'low', 'close', 'volume']
        df = df[['open', 'high', 'low', 'close']].astype(float)
        df.index = pd.to_datetime(df.index)
        df.index.name = "date"
        df = df.sort_index()  # oldest first, so the split below is chronological
    except KeyError as e:
        print(f"Data format error for {stock_symbol}: {e}")
        # Skip the summary below: there is nothing valid to report for this symbol.
        continue

    # Split point: first 80% of rows for training, the rest for testing
    train_size = int(0.8 * len(df))
    if train_size == 0:
        print(f"Not enough rows for {stock_symbol}: {len(df)}")
        continue

    # Fit on the training rows only, then apply the same scaling to every row.
    scaler = MinMaxScaler().fit(df.iloc[:train_size])
    scalers[stock_symbol] = scaler
    df_scaled = scaler.transform(df)

    train_data.append(df_scaled[:train_size])
    test_data.append(df_scaled[train_size:])

    # Print the lengths of the data
    print(f"Length of {stock_symbol} data: {len(df_scaled)}")
    print(f"Train size: {train_size}")
    print(f"Test size: {len(df_scaled) - train_size}")


Length of AAPL data: 100
Train size: 80
Test size: 20
Length of GOOG data: 100
Train size: 80
Test size: 20
Length of MSFT data: 100
Train size: 80
Test size: 20


In [28]:
class StockDataset(Dataset):
    """Sliding-window view over a 2-D array of per-day features.

    Item ``i`` pairs the ``sequence_len`` rows starting at row ``i`` with the
    value in column 3 of the row immediately after that window.

    Raises:
        ValueError: if ``data`` does not have more rows than ``sequence_len``.
    """

    def __init__(self, data, sequence_len):
        self.data, self.sequence_len = data, sequence_len

        # At least one row beyond the window is required to serve as a label.
        if not len(self.data) > self.sequence_len:
            raise ValueError(f"Data length {len(self.data)} is not sufficient for sequence length {self.sequence_len}")

    def __len__(self):
        # One sample per window that still has a following row.
        n_rows = len(self.data)
        return n_rows - self.sequence_len

    def __getitem__(self, idx):
        window_end = idx + self.sequence_len
        window = self.data[idx:window_end]
        target = self.data[window_end, 3]  # Predicting the close price
        return dict(
            sequence=torch.tensor(window, dtype=torch.float32),
            label=torch.tensor(target, dtype=torch.float32),
        )


In [29]:
# Define sequence length and batch size
sequence_len = 30
batch_size = 32

# Create DataLoader for training and testing data
train_datasets = []
test_datasets = []

for stock_data in train_data:
    if len(stock_data) > sequence_len:
        train_datasets.append(StockDataset(stock_data, sequence_len))
    else:
        print(f"Training data length {len(stock_data)} is insufficient for sequence length {sequence_len}")

for stock_data in test_data:
    if len(stock_data) > sequence_len:
        test_datasets.append(StockDataset(stock_data, sequence_len))
    else:
        print(f"Testing data length {len(stock_data)} is insufficient for sequence length {sequence_len}")

train_dataloaders = [DataLoader(dataset, batch_size=batch_size, shuffle=True) for dataset in train_datasets]
test_dataloaders = [DataLoader(dataset, batch_size=batch_size, shuffle=False) for dataset in test_datasets]


Testing data length 30 is insufficient for sequence length 30
Testing data length 30 is insufficient for sequence length 30
Testing data length 30 is insufficient for sequence length 30


In [22]:
class LSTMModel(nn.Module):
    """LSTM encoder followed by a linear head applied to the last time step.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of values produced per input sequence.
        num_layers: Number of stacked LSTM layers (default 1).
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=1):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map a ``(batch, seq_len, input_dim)`` tensor to ``(batch, output_dim)``."""
        # Build the zero initial states from `x` itself so they always share its
        # device and dtype, instead of depending on a notebook-level `device`.
        state_shape = (self.num_layers, x.size(0), self.hidden_dim)
        h0 = x.new_zeros(state_shape)
        c0 = x.new_zeros(state_shape)
        out, _ = self.lstm(x, (h0, c0))
        # Only the output at the final time step feeds the linear head.
        return self.fc(out[:, -1, :])

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Four input features per time step, a 50-unit hidden state, one output value.
model = LSTMModel(input_dim=4, hidden_dim=50, output_dim=1)
model = model.to(device)


In [23]:
# Define loss function and optimizer
criterion = nn.MSELoss().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Training loop
pbar = tqdm(range(100))
train_total_loss = []
test_total_loss = []

for epoch in pbar:
    train_loss = 0
    test_loss = 0
    model.train()
    
    for batch in train_dataloaders[0]:
        optimizer.zero_grad()
        inputs_seq = batch['sequence'].to(device)
        targets = batch['label'].to(device)
        tr_outputs = model(inputs_seq).squeeze()
        tr_loss = criterion(tr_outputs, targets)
        train_loss += tr_loss.item()
        tr_loss.backward()
        optimizer.step()
    
    tr_epoch_loss = train_loss / len(train_dataloaders[0])
    train_total_loss.append(tr_epoch_loss)
    
    model.eval()
    for batch in test_dataloaders[0]:
        with torch.no_grad():
            inputs_seq = batch['sequence'].to(device)
            targets = batch['label'].to(device)
            te_outputs = model(inputs_seq).squeeze()
            te_loss = criterion(te_outputs, targets)
            test_loss += te_loss.item()
    
    te_epoch_loss = test_loss / len(test_dataloaders[0])
    test_total_loss.append(te_epoch_loss)
    pbar.set_description(f"Epoch {epoch+1}, Train Loss: {tr_epoch_loss:.4f}, Test Loss: {te_epoch_loss:.4f}")


  0%|          | 0/100 [00:01<?, ?it/s]


ValueError: __len__() should return >= 0

In [None]:
# Step 7: Fine-Tuning
def fine_tune(model, stock_symbols, data, sequence_len, batch_size, epochs):
    """Continue training ``model`` on each symbol in turn and return the best snapshot.

    Uses the notebook-level ``optimizer`` and ``criterion``, so ``optimizer``
    must have been created from this model's parameters.

    Args:
        model: Network to fine-tune; it is updated in place.
        stock_symbols: Keys of ``data`` to process, in order.
        data: Mapping from symbol to a chronologically ordered 2-D array of
            scaled features with the close price in column 3, which is the
            layout ``StockDataset`` indexes.
        sequence_len: Number of time steps in each input window.
        batch_size: Mini-batch size for both loaders.
        epochs: Number of passes over each symbol's training windows.

    Returns:
        A deep copy of the model taken at the epoch with the lowest mean test
        loss across all symbols, or ``model`` itself if no epoch was evaluated.
    """
    best_model = model
    best_loss = float('inf')
    # Move batches to wherever the model's weights live.
    run_device = next(model.parameters()).device

    for stock_symbol in stock_symbols:
        series = np.asarray(data[stock_symbol], dtype=float)

        # Chronological 80/20 split. The test part also receives up to
        # `sequence_len` preceding rows as history, so a short test split still
        # produces windows while every label stays in the held-out 20%.
        train_size = int(0.8 * len(series))
        train_split = series[:train_size]
        test_split = series[max(train_size - sequence_len, 0):]
        if len(train_split) <= sequence_len or len(test_split) <= sequence_len:
            print(f"Skipping {stock_symbol}: {len(series)} rows is insufficient for sequence length {sequence_len}")
            continue

        train_dataloader = DataLoader(dataset=StockDataset(train_split, sequence_len), batch_size=batch_size, shuffle=True)
        test_dataloader = DataLoader(dataset=StockDataset(test_split, sequence_len), batch_size=batch_size, shuffle=False)

        for epoch in range(epochs):
            model.train()
            total_loss = 0
            for batch in train_dataloader:
                optimizer.zero_grad()
                inputs = batch['sequence'].to(run_device)
                targets = batch['label'].to(run_device)
                # squeeze(-1) keeps the batch dimension even for a single-sample batch.
                outputs = model(inputs).squeeze(-1)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

            model.eval()
            total_test_loss = 0
            with torch.no_grad():
                for batch in test_dataloader:
                    inputs = batch['sequence'].to(run_device)
                    targets = batch['label'].to(run_device)
                    outputs = model(inputs).squeeze(-1)
                    loss = criterion(outputs, targets)
                    total_test_loss += loss.item()

            test_loss = total_test_loss / len(test_dataloader)
            if test_loss < best_loss:
                best_loss = test_loss
                # Snapshot the weights: `model` keeps training after this epoch,
                # so storing a reference would not preserve the best state.
                best_model = copy.deepcopy(model)
            print(f"Epoch {epoch+1}, Stock {stock_symbol}, Train Loss: {total_loss / len(train_dataloader):.4f}, Test Loss: {test_loss:.4f}")
    return best_model