<a href="https://colab.research.google.com/github/smbock42/deeplearningbets/blob/main/Lab_4_1_Dataset_Challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

I chose to build a model that takes in 5 years of stock data and decides whether to buy, sell, or hold that stock position.

I am comparing a baseline Linear NN with two more advanced models to see whether predictions and accuracy improve with more complexity.

In [None]:
# Install the notebook's dependencies; %pip (rather than !pip) targets the running kernel's environment.
%pip install pandas numpy yfinance scikit-learn torch matplotlib


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


# Choose Stock and Download Data

In [None]:
stock_ticker = 'TSLA'  # Stock Ticker to predict buy, sell, or hold
sp500_ticker = '^GSPC'  # S&P 500 index

start_date = '2025-01-01'
end_date = datetime.date.today().strftime('%Y-%m-%d')

# Fetch both series over the SAME date window. The stock used to be downloaded
# without start/end, so its history extended far beyond the index data and
# every row before start_date was left with a NaN SP500_Close (forward-fill
# cannot repair leading gaps). It could also contain a newer final row than
# the index, which forward-fill would then pad with a stale index close.
stock_data = yf.download(stock_ticker, start=start_date, end=end_date)
sp500_data = yf.download(sp500_ticker, start=start_date, end=end_date)

# Keep the OHLCV columns and attach the index close, aligned on the date index.
data = stock_data[['Open', 'High', 'Low', 'Close', 'Volume']].copy()
data['SP500_Close'] = sp500_data['Close']

# Keep only the first level of the column labels ('Open', 'High', ...).
data.columns = data.columns.get_level_values(0)

# Fill interior gaps with the previous row's value (rebinding instead of
# mutating in place keeps the cell safe to re-run).
data = data.ffill()


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [None]:

def add_technical_indicators(df):
    """Return a copy of ``df`` with technical-indicator columns appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Price history containing 'High', 'Low', 'Close' and 'Volume' columns,
        ordered oldest row first.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with these extra columns:
        sma_5, sma_20, ema_5, ema_20, rsi, bb_middle, bb_upper, bb_lower,
        ema_12, ema_26, macd, macd_signal, macd_hist, obv, atr, price_change
        and price_sma20_ratio. Rolling-window columns are NaN until their
        window is full.
    """
    # Work on a copy so the caller's frame is left untouched
    df = df.copy()

    # Moving Averages
    df['sma_5'] = df['Close'].rolling(window=5).mean()
    df['sma_20'] = df['Close'].rolling(window=20).mean()
    df['ema_5'] = df['Close'].ewm(span=5, adjust=False).mean()
    df['ema_20'] = df['Close'].ewm(span=20, adjust=False).mean()

    # RSI (simple 14-row averages of gains and losses)
    delta = df['Close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    df['rsi'] = 100 - (100 / (1 + rs))

    # Bollinger Bands (20-row mean +/- 2 standard deviations)
    df['bb_middle'] = df['sma_20']
    std = df['Close'].rolling(window=20).std()
    df['bb_upper'] = df['bb_middle'] + 2 * std
    df['bb_lower'] = df['bb_middle'] - 2 * std

    # MACD
    df['ema_12'] = df['Close'].ewm(span=12, adjust=False).mean()
    df['ema_26'] = df['Close'].ewm(span=26, adjust=False).mean()
    df['macd'] = df['ema_12'] - df['ema_26']
    df['macd_signal'] = df['macd'].ewm(span=9, adjust=False).mean()
    df['macd_hist'] = df['macd'] - df['macd_signal']

    # Volume indicators: on-balance volume adds volume on up rows and
    # subtracts it on down rows
    df['obv'] = (np.sign(df['Close'].diff()) * df['Volume']).fillna(0).cumsum()

    # Volatility: Average True Range. The true range of a row is the largest
    # of (high - low), |high - previous close| and |low - previous close|, so
    # gaps between sessions are counted. The earlier expression
    # (14-row max High minus 14-row min Low) measured the channel width, which
    # is a different quantity from ATR.
    prev_close = df['Close'].shift(1)
    true_range = pd.concat(
        [
            df['High'] - df['Low'],
            (df['High'] - prev_close).abs(),
            (df['Low'] - prev_close).abs(),
        ],
        axis=1,
    ).max(axis=1)
    df['atr'] = true_range.rolling(14).mean()

    # Price change and momentum
    df['price_change'] = df['Close'].pct_change()
    df['price_sma20_ratio'] = df['Close'] / df['sma_20']

    return df

# Enrich the raw download with the indicator columns, then discard every row
# that still contains a NaN (e.g. rows where a rolling window is incomplete).
stock_data_with_indicators = add_technical_indicators(stock_data).dropna()

# Preview the most recent rows of the enriched frame
print(stock_data_with_indicators.tail())

In [None]:
# Reuse the configured symbol instead of repeating the literal, so the
# fundamentals always describe the same ticker as the price history.
stock = yf.Ticker(stock_ticker)


In [None]:
# Copy selected entries of `stock.info` into `data`. Each entry is a single
# value, so it is repeated on every row; a key missing from `info` yields None.
info = stock.info
fundamental_fields = {
    'P/E': 'forwardPE',
    'P/B': 'priceToBook',
    'Dividend Yield': 'dividendYield',
    'Market Cap': 'marketCap',
    '52 Week High': 'fiftyTwoWeekHigh',
    '52 Week Low': 'fiftyTwoWeekLow',
}
for column_name, info_key in fundamental_fields.items():
    data[column_name] = info.get(info_key)


In [None]:
# Display the assembled frame (prices, index close and the fundamentals columns added above).
data

Price,Open,High,Low,Close,Volume,SP500_Close,P/E,P/B,Dividend Yield,Market Cap,52 Week High,52 Week Low
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2010-06-29,1.266667,1.666667,1.169333,1.592667,281494500,,74.28395,10.615737,,774151995392,488.54,138.8
2010-06-30,1.719333,2.028000,1.553333,1.588667,257806500,,74.28395,10.615737,,774151995392,488.54,138.8
2010-07-01,1.666667,1.728000,1.351333,1.464000,123282000,,74.28395,10.615737,,774151995392,488.54,138.8
2010-07-02,1.533333,1.540000,1.247333,1.280000,77097000,,74.28395,10.615737,,774151995392,488.54,138.8
2010-07-06,1.333333,1.333333,1.055333,1.074000,103003500,,74.28395,10.615737,,774151995392,488.54,138.8
...,...,...,...,...,...,...,...,...,...,...,...,...
2025-03-07,259.320007,266.250000,250.729996,262.670013,102369600,5770.200195,74.28395,10.615737,,774151995392,488.54,138.8
2025-03-10,252.539993,253.369995,220.000000,222.149994,189076900,5614.560059,74.28395,10.615737,,774151995392,488.54,138.8
2025-03-11,225.309998,237.059998,217.020004,230.580002,174896400,5572.069824,74.28395,10.615737,,774151995392,488.54,138.8
2025-03-12,247.220001,251.839996,241.100006,248.089996,142215700,5599.299805,74.28395,10.615737,,774151995392,488.54,138.8


In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): google.colab is presumably only importable inside Colab — confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Setup Dataloaders

In [None]:
# TODO: add technical indicators to features list
features = ['Open', 'High', 'Low', 'Close', 'Volume', 'SP500_Close']

# NOTE(review): no cell shown above creates data['Label'] (the buy/sell/hold
# class). Fail with an explicit message rather than an anonymous KeyError.
if 'Label' not in data.columns:
    raise KeyError(
        "'Label' column not found in data; create the buy/sell/hold labels "
        "before building the dataloaders"
    )

# Pick the train/test rows first. Splitting the row positions with the same
# test_size and random_state yields the same partition as splitting the
# tensors directly.
row_indices = np.arange(len(data))
train_idx, test_idx = train_test_split(row_indices, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, so statistics of the held-out rows
# do not leak into the features the model is trained on.
scaler = StandardScaler()
scaler.fit(data[features].iloc[train_idx])
X = scaler.transform(data[features])
y = data['Label'].values

X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long)  # class indices for a classification loss

X_train, X_test = X_tensor[train_idx], X_tensor[test_idx]
y_train, y_test = y_tensor[train_idx], y_tensor[test_idx]


batch_size = 1024
train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=batch_size, shuffle=False)


In [None]:
stock_ticker = 'MSTR'  # Stock Ticker to predict buy, sell, or hold
sp500_ticker = '^GSPC'  # S&P 500 index

start_date = '2025-01-01'
end_date = datetime.date.today().strftime('%Y-%m-%d')

# Fetch both series over the SAME date window. The stock used to be downloaded
# without start/end, so its history extended far beyond the index data and
# every row before start_date was left with a NaN SP500_Close (forward-fill
# cannot repair leading gaps). It could also contain a newer final row than
# the index, which forward-fill would then pad with a stale index close.
stock_data = yf.download(stock_ticker, start=start_date, end=end_date)
sp500_data = yf.download(sp500_ticker, start=start_date, end=end_date)

# Keep the OHLCV columns and attach the index close, aligned on the date index.
data = stock_data[['Open', 'High', 'Low', 'Close', 'Volume']].copy()
data['SP500_Close'] = sp500_data['Close']

# Keep only the first level of the column labels ('Open', 'High', ...).
data.columns = data.columns.get_level_values(0)

# Fill interior gaps with the previous row's value (rebinding instead of
# mutating in place keeps the cell safe to re-run).
data = data.ffill()

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Compute the `Next_Open`, `Next_Close`, and `Percentage_Change` columns.

In [None]:
# shift(-1) moves each value up one row, so every row sees the following
# row's Open and Close.
next_row = data[['Open', 'Close']].shift(-1)
data['Next_Open'] = next_row['Open']
data['Next_Close'] = next_row['Close']

# Percent move from this row's Close to the following row's Close.
data['Percentage_Change'] = (
    data['Next_Close'].sub(data['Close']).div(data['Close']).mul(100)
)

# Discard rows missing any model input or target (the last row has no
# following row, so its shifted values are NaN).
required_columns = [
    'Open', 'High', 'Low', 'Close', 'Volume', 'SP500_Close',
    'Next_Open', 'Next_Close', 'Percentage_Change',
]
data.dropna(subset=required_columns, inplace=True)

In [None]:
# Display the frame with the Next_Open, Next_Close and Percentage_Change columns added above.
data

Price,Open,High,Low,Close,Volume,SP500_Close,Next_Open,Next_Close,Percentage_Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2025-01-02,300.109985,310.799988,292.320007,300.01001,17933900,5868.549805,303.0,339.660004,13.216224
2025-01-03,303.0,343.399994,301.769989,339.660004,25039000,5942.470215,349.850006,379.089996,11.608665
2025-01-06,349.850006,383.019989,335.5,379.089996,26707300,5975.379883,366.470001,341.429993,-9.934317
2025-01-07,366.470001,371.279999,335.299988,341.429993,24736800,5909.029785,335.25,331.700012,-2.849773
2025-01-08,335.25,344.390015,317.220001,331.700012,18274800,5918.25,330.309998,327.910004,-1.142601
2025-01-10,330.309998,337.0,317.910004,327.910004,16478700,5827.040039,309.950012,328.399994,0.149428
2025-01-13,309.950012,329.600006,303.799988,328.399994,15914500,5836.220215,343.0,342.170013,4.193063
2025-01-14,343.0,354.899994,331.299988,342.170013,17889500,5842.910156,357.809998,360.619995,5.392051
2025-01-15,357.809998,368.420013,353.01001,360.619995,19941300,5949.910156,357.890015,367.0,1.769177
2025-01-16,357.890015,370.26001,345.799988,367.0,14826600,5937.339844,383.279999,396.5,8.038147


In [None]:
features = ['Open', 'High', 'Low', 'Close', 'Volume', 'SP500_Close']

feature_values = data[features].values  # numpy array, one row per date
# Target is the percentage change from this row's Close to the next row's Close
target_values = data['Percentage_Change'].values.reshape(-1, 1)

# Choose the train/test rows once. Splitting the row positions with the same
# test_size and random_state reproduces the partition train_test_split(X, y)
# would give (that call used to be issued twice with identical arguments).
row_indices = np.arange(len(data))
train_idx, test_idx = train_test_split(row_indices, test_size=0.2, random_state=42)

# Fit both scalers on the training rows only, so the mean/variance of the
# held-out rows cannot leak into training.
X_scaler = StandardScaler().fit(feature_values[train_idx])
y_scaler = StandardScaler().fit(target_values[train_idx])
X = X_scaler.transform(feature_values)
y = y_scaler.transform(target_values)

print(y)

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# Convert the data to tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Now, create DataLoader
train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size=1024, shuffle=True)
test_loader = DataLoader(TensorDataset(X_test_tensor, y_test_tensor), batch_size=1024, shuffle=False)

[[ 2.21498951]
 [ 1.94750857]
 [-1.63701885]
 [-0.45822465]
 [-0.17416906]
 [ 0.04081117]
 [ 0.71362982]
 [ 0.91312887]
 [ 0.3103204 ]
 [ 1.35341164]
 [-0.29458977]
 [-0.48822441]
 [-0.16882664]
 [-0.85140686]
 [-0.25456983]
 [-0.5574639 ]
 [ 0.27945327]
 [-0.04061257]
 [-0.24335486]
 [ 0.62725331]
 [ 0.07443283]
 [-0.53866738]
 [-0.53950864]
 [ 0.12330948]
 [ 0.37457221]
 [-0.73788227]
 [ 0.39929158]
 [-0.08078402]
 [ 0.67194004]
 [-0.16929666]
 [-0.74632367]
 [ 0.29007005]
 [-1.22868683]
 [-0.92401477]
 [-1.88179781]
 [ 0.86347089]
 [-1.45158107]
 [ 1.08200481]
 [-0.27783753]
 [ 1.62268243]
 [ 2.03572269]
 [-0.22348499]
 [-0.9103532 ]
 [-2.7599147 ]
 [ 1.4985508 ]
 [ 0.14109546]
 [ 0.0609451 ]]


In [None]:
def create_sequences(data, sequence_length=30):
    """Slice a 2-D array into overlapping windows with next-row targets.

    Parameters
    ----------
    data : array-like, shape (n_rows, n_columns)
        Rows in chronological order; column 0 holds the value to predict.
    sequence_length : int, default 30
        Number of consecutive rows in each input window.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``xs`` with shape (n_windows, sequence_length, n_columns) and ``ys``
        with shape (n_windows,), where ``ys[k]`` is column 0 of the row
        immediately after window ``k``. When ``data`` has no row left after a
        full window, both arrays are empty (``xs`` keeps its trailing
        dimensions).
    """
    data = np.asarray(data)
    # One window per row that still has a following row to use as the target.
    # The target used to be read from i + sequence_length + 1, which skipped
    # the row directly after the window and dropped one usable sample.
    n_windows = len(data) - sequence_length
    if n_windows <= 0:
        return np.empty((0, sequence_length) + data.shape[1:]), np.empty((0,))

    xs = [data[start:start + sequence_length] for start in range(n_windows)]
    ys = [data[start + sequence_length, 0] for start in range(n_windows)]
    return np.array(xs), np.array(ys)

# After calculating technical indicators:
# NOTE(review): the cells shown above never add indicator columns to `data`
# itself — confirm which frame is meant to feed the sequence model.
# Percentage_Change goes first because create_sequences reads the target from
# column 0. It must also be excluded from the remaining columns, otherwise it
# appears twice in the feature matrix.
excluded_columns = {'Next_Close', 'Next_Open', 'Percentage_Change'}
features = ['Percentage_Change'] + [col for col in data.columns if col not in excluded_columns]
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data[features])

# Create sequences
X, y = create_sequences(scaled_data)

# Chronological 80/20 split: the earliest windows train, the latest evaluate
train_size = int(0.8 * len(X))

X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

In [None]:
class ImprovedStockRegressor(nn.Module):
    """Fully connected regressor: input_dim -> 256 -> 128 -> 64 -> 32 -> 1.

    Each of the four hidden stages applies Linear, BatchNorm1d, LeakyReLU and
    Dropout in that order; the last Linear layer emits one raw value per
    sample.
    """

    def __init__(self, input_dim):
        super().__init__()

        # Hidden stages are registered as fc{k}, bn{k}, dropout{k} for k = 1..4
        layer_widths = (input_dim, 256, 128, 64, 32)
        dropout_rates = (0.3, 0.3, 0.2, 0.1)
        for stage, rate in enumerate(dropout_rates, start=1):
            width_in, width_out = layer_widths[stage - 1], layer_widths[stage]
            setattr(self, f"fc{stage}", nn.Linear(width_in, width_out))
            setattr(self, f"bn{stage}", nn.BatchNorm1d(width_out))
            setattr(self, f"dropout{stage}", nn.Dropout(rate))

        # Final output layer with 1 output for regression
        self.fc5 = nn.Linear(32, 1)

        self.activation = nn.LeakyReLU(negative_slope=0.01)

    def forward(self, x):
        """Run ``x`` through the four hidden stages and the output layer."""
        for stage in range(1, 5):
            linear = getattr(self, f"fc{stage}")
            norm = getattr(self, f"bn{stage}")
            drop = getattr(self, f"dropout{stage}")
            x = drop(self.activation(norm(linear(x))))

        # Raw output for regression (no activation on the last layer)
        return self.fc5(x)


# Size the input layer from the tensors the model is actually trained on.
# `features` is reassigned by the sequence-preparation cell to a list of a
# different length, so len(features) no longer matches the loader batches
# when the notebook is run top to bottom.
input_dim = X_train_tensor.shape[1]
improved_model = ImprovedStockRegressor(input_dim)

criterion = nn.L1Loss()  # mean absolute error on the scaled target
optimizer = optim.AdamW(improved_model.parameters(), lr=0.0001, weight_decay=1e-5)
# Halve the learning rate every 50 scheduler steps
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5)

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Mean absolute percentage error, in percent.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100`` over the entries whose
        true value is non-zero. Entries with a true value of exactly 0 are
        skipped because the ratio is undefined there (previously they turned
        the whole result into inf/nan). Returns NaN when no entry is usable.
    """
    y_true, y_pred = np.broadcast_arrays(np.asarray(y_true), np.asarray(y_pred))
    usable = y_true != 0
    if not usable.any():
        return float('nan')
    return np.mean(np.abs((y_true[usable] - y_pred[usable]) / y_true[usable])) * 100

In [None]:
# Sanity check: undo the scaling on each test batch and print the raw values.
for scaled_inputs, scaled_labels in test_loader:
    raw_inputs = X_scaler.inverse_transform(scaled_inputs)
    print(f"input: {raw_inputs}")
    raw_labels = y_scaler.inverse_transform(scaled_labels)
    print(f"labels: {raw_labels}")

input: [[3.15959992e+02 3.31170013e+02 3.14190003e+02 3.26820007e+02
  9.83790003e+06 6.05197021e+03]
 [2.93149993e+02 2.95100004e+02 2.46100005e+02 2.50920000e+02
  3.05756997e+07 5.84972021e+03]
 [3.32980011e+02 3.38500000e+02 3.18700013e+02 3.19459991e+02
  1.20171996e+07 6.06850000e+03]
 [2.98149993e+02 3.10440003e+02 2.81440003e+02 2.87179992e+02
  2.27644999e+07 5.77020020e+03]
 [3.35000000e+02 3.44399994e+02 3.27000000e+02 3.27559997e+02
  1.50497001e+07 6.02599023e+03]
 [2.45390002e+02 2.64459993e+02 2.44009995e+02 2.63269987e+02
  1.98769000e+07 5.95606006e+03]
 [3.84910002e+02 3.90350007e+02 3.71049987e+02 3.77309998e+02
  1.60003000e+07 6.08637012e+03]
 [3.40109985e+02 3.52709991e+02 3.32000000e+02 3.34790009e+02
  1.55660001e+07 6.04052979e+03]
 [3.35250000e+02 3.44390014e+02 3.17220001e+02 3.31700012e+02
  1.82748000e+07 5.91825000e+03]
 [3.33500000e+02 3.40380005e+02 3.28299988e+02 3.34619995e+02
  1.01041003e+07 6.06643994e+03]]
labels: [[ -0.58135787]
 [  9.6564627 ]
 [

In [None]:
# Train the regressor, evaluate on the test loader after every epoch, keep the
# weights with the lowest test loss, and print sample predictions every 10
# epochs.
num_epochs = 1000
best_improved_loss = float("inf")
improved_model_path = "best_improved_model.pth"

for epoch in range(num_epochs):
    # ---- Training phase ----
    improved_model.train()
    total_loss = 0

    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = improved_model(inputs)

        loss = criterion(outputs, labels)

        loss.backward()
        # Cap the gradient norm to keep individual updates bounded
        torch.nn.utils.clip_grad_norm_(improved_model.parameters(), max_norm=1.0)

        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)

    # ---- Evaluation phase ----
    improved_model.eval()
    total_val_loss = 0
    predictions = []
    actuals = []

    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = improved_model(inputs)
            # Store predictions and actual values for comparison
            predictions.append(outputs.detach().numpy())
            actuals.append(labels.numpy())
            val_loss = criterion(outputs, labels)
            total_val_loss += val_loss.item()

    predictions = np.concatenate(predictions, axis=0)
    actuals = np.concatenate(actuals, axis=0)

    # Undo the target scaling: values are percentage changes again (not prices)
    predictions = y_scaler.inverse_transform(predictions)
    actuals = y_scaler.inverse_transform(actuals)

    # Calculate Mean Absolute Percentage Error (MAPE)
    mape = mean_absolute_percentage_error(actuals, predictions)

    avg_val_loss = total_val_loss / len(test_loader)

    # Checkpoint whenever the test loss improves
    if avg_val_loss < best_improved_loss:
        best_improved_loss = avg_val_loss
        torch.save(improved_model.state_dict(), improved_model_path)

    scheduler.step()

    if epoch % 10 == 0:
        # Report every 10th test sample
        for i in range(0, len(predictions), 10):
            predicted_percentage = predictions[i][0]  # Predicted relative percentage
            actual_percentage = actuals[i][0]  # Actual relative percentage

            # Recover the source row from the target value. The value has been
            # through scaling and a float32 round trip, so take the nearest
            # row: an np.isclose match followed by .index[0] raises IndexError
            # when the drift exceeds its tolerance (likely for changes near 0).
            row = (data['Percentage_Change'] - actual_percentage).abs().idxmin()
            close_at_row = data['Close'].loc[row]

            # Convert the percentage changes back into next-day prices
            predicted_price = close_at_row * (1 + (predicted_percentage / 100))
            actual_price = close_at_row * (1 + actual_percentage / 100)

            print(f"Predicted Relative Percentage Change: {predicted_percentage:.4f}%, Actual Relative Percentage Change: {actual_percentage:.4f}%")
            print(f"Predicted Next-Day Price: {predicted_price:.2f}, Actual Next-Day Price: {actual_price:.2f}")

        print(f"Epoch [{epoch+1}/{num_epochs}] | Train Loss: {avg_train_loss:.4f} | "
              f"Test Loss: {avg_val_loss:.4f} | MAPE: {mape:.2f}%\n")

print(f"\nBest improved model saved to: {improved_model_path} with Test Loss: {best_improved_loss:.4f}")

Predicted Relative Percentage Change: -0.3508%, Actual Relative Percentage Change: -0.5814%
Predicted Next-Day Price: 325.67, Actual Next-Day Price: 324.92
Epoch [1/1000] | Train Loss: 0.6395 | Test Loss: 0.8510 | MAPE: 99.03%

Predicted Relative Percentage Change: -0.3509%, Actual Relative Percentage Change: -0.5814%
Predicted Next-Day Price: 325.67, Actual Next-Day Price: 324.92
Epoch [11/1000] | Train Loss: 0.6520 | Test Loss: 0.8469 | MAPE: 99.21%

Predicted Relative Percentage Change: -0.3603%, Actual Relative Percentage Change: -0.5814%
Predicted Next-Day Price: 325.64, Actual Next-Day Price: 324.92
Epoch [21/1000] | Train Loss: 0.7513 | Test Loss: 0.8481 | MAPE: 97.36%

Predicted Relative Percentage Change: -0.1947%, Actual Relative Percentage Change: -0.5814%
Predicted Next-Day Price: 326.18, Actual Next-Day Price: 324.92
Epoch [31/1000] | Train Loss: 0.6739 | Test Loss: 0.8554 | MAPE: 101.13%

Predicted Relative Percentage Change: -0.1262%, Actual Relative Percentage Change: -