# Model Training
Trains a model on past data using multiple tickers.

In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports edited modules
# automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
import settings
import apis.tiingo_api as tiingo

# Fetch the Tiingo API key through settings.get_secret and build the API
# client used by the download cell.
secret_key = settings.get_secret("tiingo-key")
client = tiingo.TiingoAPI(secret_key)

In [None]:
# `datetime` is not imported by any other visible cell, so import it here to
# keep this cell runnable on a fresh kernel.
from datetime import datetime

# NOTE(review): TICKER is not defined in the visible notebook; it must be set
# before this cell runs.
# NOTE(review): each range ends on the date the next one starts. Whether the
# end date is inclusive depends on download_ticker -- confirm that boundary
# rows are not duplicated once the frames are concatenated.
# The trailing arguments (15, False) are forwarded unchanged; their meaning is
# defined by TiingoAPI.download_ticker.
csv_data2020 = client.download_ticker(TICKER, datetime(2020, 1, 1), datetime(2021, 1, 1), 15, False)
csv_data2021 = client.download_ticker(TICKER, datetime(2021, 1, 1), datetime(2022, 1, 1), 15, False)
csv_data2022 = client.download_ticker(TICKER, datetime(2022, 1, 1), datetime(2023, 1, 1), 15, False)
csv_data2023 = client.download_ticker(TICKER, datetime(2023, 1, 1), datetime(2023, 11, 11), 15, False)

In [None]:
# Concatenate list skipping the header row
#csv_data=csv_data2020+ "\n" + csv_data2021[1:-1]+ "\n" + csv_data2022[1:-1]+ "\n" + csv_data2023[1:-1]
#len(csv_data)

In [None]:
import io
import pandas as pd

# Parse every downloaded CSV text into its own DataFrame, then stack them
# row-wise (in chronological order) under a fresh 0..n-1 index.
df2020, df2021, df2022, df2023 = (
    pd.read_csv(io.StringIO(raw_csv))
    for raw_csv in (csv_data2020, csv_data2021, csv_data2022, csv_data2023)
)
df = pd.concat((df2020, df2021, df2022, df2023), ignore_index=True)

In [None]:
# Plain Python list of the 'close' column as floats; the bare name below makes
# the notebook display it.
close_list = df["close"].astype("float64").tolist()
close_list

In [None]:
# Configuration handed to ClassesCalc: the window length and the down/up
# percentage thresholds. One class exists per down threshold, one per up
# threshold, plus one more (see the model's output_features).
# NOTE(review): `classes_calc` is not imported in any visible cell -- confirm
# it is brought into scope before this cell runs.
classes_window = 52
down_pcts = [7]
up_pcts = [7]
calculator = classes_calc.ClassesCalc(
    classes_calc.find_first_up_down,
    classes_window,
    down_pcts,
    up_pcts,
)

In [None]:
# Compute one class label per close price.
classes = calculator.calculate(close_list)
# Show the two entries at positions -classes_window-1 and -classes_window
# (presumably the last labelled entry and the first unlabelled one).
print(classes[-(classes_window + 1):1 - classes_window])

In [None]:
import matplotlib.pyplot as plt

# Histogram of the class labels.
hist_values, bin_edges, _ = plt.hist(classes, bins=21, edgecolor='black')

# Axis labels and title
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Histogram of Data')

# Write the count above each bar. The bin width depends on the data range, so
# the label is centred using the bin's own edges instead of a fixed x offset
# (a fixed +0.5 only lines up when bins are exactly one unit wide).
for count, left_edge, right_edge in zip(hist_values, bin_edges[:-1], bin_edges[1:]):
    plt.text(
        (left_edge + right_edge) / 2,
        count,
        str(int(count)),
        color='black',
        ha='center',
        va='bottom',
    )

# Show the histogram
plt.show()


In [None]:
from collections import Counter

# Count how many ticks carry each class label.
frequency_dict = Counter(classes)

# Report every label with the percentage change it stands for and its share
# of all ticks.
num_ticks = len(classes)
print(f"Stock ticks: {num_ticks}")
for label, count in frequency_dict.items():
    # Labels below len(down_pcts) map (mirrored) onto down_pcts, labels above
    # it map onto up_pcts, and the label equal to len(down_pcts) means 0%.
    offset = label - len(down_pcts)
    if offset < 0:
        pct_change = -down_pcts[-offset - 1]
    elif offset > 0:
        pct_change = up_pcts[offset - 1]
    else:
        pct_change = 0

    print(f"{pct_change}% change ({label}): {count} times {(count/num_ticks*100):0.2f}%")


In [None]:
import importlib

import signals_calc

# Force a re-import of signals_calc so recent edits to the module are used.
importlib.reload(signals_calc)

# Window lengths passed to SignalsCalc (consecutive Fibonacci numbers).
signal_windows = [
    2, 3, 5, 8, 13, 21, 34, 55,
    89, 144, 233, 377, 610, 987, 1597, 2584,
]
signals_calculator = signals_calc.SignalsCalc(signal_windows)

# One result series per window; the bare name below displays it in the notebook.
windows_rolling_avg = signals_calculator.calculate(close_list)
windows_rolling_avg

In [None]:
# Sanity check: print the number of close prices and the length of the series
# computed for the last signal window so the two can be compared by eye.
print(len(close_list))
print(len(windows_rolling_avg[len(signal_windows)-1]))

In [None]:


from src.classificators.proportions_calc import calculate_proportions


# Derive one proportion series per signal window from the close prices and the
# per-window signal series.
proportions = calculate_proportions(close_list, windows_rolling_avg)


In [None]:
# Diagnostics: tail of the raw series, then the boundaries of the usable data.
print(close_list[-10:])
print(windows_rolling_avg[0][-10:])
print(proportions[0][-10:])
current_sum = sum(close_list[-2:])/2
classes_len = len(classes)
signal_windows_len = len(signal_windows)

print(f"Signal window last: {signal_windows[-1]} len: {signal_windows_len}")
print(f"Classes last non-nan: {classes[-classes_window-1:-classes_window+1]} len: {classes_len}")
print(f"Proportions first non-nan: {proportions[signal_windows_len-1][signal_windows[-1]-2:signal_windows[-1]]} len: {len(proportions[signal_windows_len-1])}")

# Min and Max must be taken over the same span. Both now stop at
# -classes_window, the same end used when cutting the training data;
# previously Min stopped one element earlier than Max.
shortest_window_props = proportions[0][signal_windows[0]-1:-classes_window]
longest_window_props = proportions[-1][signal_windows[-1]-1:-classes_window]
print(f"Proportions {signal_windows[0]} Min: {min(shortest_window_props)} Max: {max(shortest_window_props)}")
print(f"Proportions {signal_windows[-1]} Min: {min(longest_window_props)} Max: {max(longest_window_props)}")

In [None]:
# Keep only the span where both sides are usable: start at index
# signal_windows[-1]-1 (first non-nan of the largest window) and drop the last
# classes_window entries (where the classes have no value).
targets = classes[signal_windows[-1] - 1:-classes_window]
print(f"First target: {targets[0]} and last target: {targets[-1]}")
print(f"Classes: {len(classes)} cut to targets: {len(targets)}")

# Apply the same cut to every proportion series, then show the first two
# values of each trimmed series.
inputs = [series[signal_windows[-1] - 1:-classes_window] for series in proportions]
for trimmed in inputs:
    print(trimmed[0:2])

print(f"Inputs {len(inputs[len(signal_windows)-1])}")
print(f"Distinct targets: {list(set(targets))}")


In [None]:
import torch

# Only the printed representation is limited to 4 decimals (torch's default
# print precision); the values are stored as float32. To display more digits
# use torch.set_printoptions(precision=...).
# torch.tensor with an explicit dtype replaces the legacy torch.Tensor
# constructor and yields the same float32 data.
# `inputs` holds one series per signal window, so the transpose puts samples
# on rows and one feature per window on columns.
X = torch.tensor(inputs, dtype=torch.float32).T
y = torch.tensor(targets, dtype=torch.float32)

In [None]:
# Ordered 80/20 split without shuffling: the first 80% of the rows are used
# for training and the remaining rows for testing.
train_split = int(0.8 * len(X))
X_train, X_test = X[:train_split], X[train_split:]
y_train, y_test = y[:train_split], y[train_split:]

In [None]:
# Show the shapes of both splits and the training inputs for a visual check.
print(f"X_train shape: {X_train.shape} y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape} y_test shape: {y_test.shape}")
print(f"X_train: {X_train}")

In [None]:
# Use the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Move every split to the chosen device; the targets are additionally cast to
# int64 so they can serve as class indices for the loss.
X_train = X_train.to(device)
y_train = y_train.to(device).to(torch.int64)
X_test = X_test.to(device)
y_test = y_test.to(device).to(torch.int64)

print(f"X_train shape: {X_train.shape} type: {X_train.dtype} y_train shape: {y_train.shape} type: {y_train.dtype}")
print(f"X_test shape: {X_test.shape} y_test shape: {y_test.shape}")
print(f"X_train: {X_train}")
print(f"y_train: {y_train}")

In [None]:
import torch
from torch import nn

class StockModelV0(nn.Module):
    """Fully connected multi-class classification model.

    The network has three hidden ``Linear`` + ``ReLU`` stages of width
    ``3 * hidden_units``, ``2 * hidden_units`` and ``hidden_units``, followed
    by a final ``Linear`` layer with ``output_features`` outputs (raw logits,
    no activation).
    """

    def __init__(self, input_features, output_features, hidden_units):
        """Build the layer stack.

        Args:
            input_features: Number of values in each input sample.
            output_features: Number of classes (logits) produced.
            hidden_units: Base width used to size the hidden layers.
        """
        super().__init__()
        widths = [input_features, hidden_units * 3, hidden_units * 2, hidden_units]
        layers = []
        # Hidden stages are created in the same order as they are applied.
        for fan_in, fan_out in zip(widths, widths[1:]):
            layers.append(nn.Linear(in_features=fan_in, out_features=fan_out))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(in_features=hidden_units, out_features=output_features))
        self.linear_layer_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the sequential layer stack and return the logits."""
        return self.linear_layer_stack(x)

# Instantiate the classifier: one input per signal window and one output per
# class (each down threshold, each up threshold, plus one more), then move it
# to the selected device.
model_0 = StockModelV0(
    input_features=len(signal_windows),
    output_features=len(down_pcts) + 1 + len(up_pcts),
    hidden_units=10,
)
model_0 = model_0.to(device)


In [None]:
from torchmetrics import Accuracy

# Cross-entropy loss on the raw logits, plain SGD over all model parameters,
# and a multiclass accuracy metric sized to the model's number of classes.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model_0.parameters(), lr=0.5)
accuracy_fn = Accuracy(
    task="multiclass",
    num_classes=len(down_pcts) + 1 + len(up_pcts),
).to(device)

In [None]:
# Inspect what the still-untrained model predicts for the training inputs.
print(f"y_train: {y_train}")
print(f"y_train.dtype: {y_train.dtype}")
model_0.eval()
with torch.inference_mode():
    y_logits = model_0(X_train)
    print(y_logits)
    # logits -> class probabilities -> index of the most likely class
    y_pred = torch.softmax(y_logits, dim=1).argmax(dim=1)
    print("y_pred: ", ", ".join(map(str, y_pred.tolist())))
    print(f"y_pred.dtype: {y_pred.dtype}")



In [None]:
epochs = 10000
# The test set is only evaluated on epochs that are reported, plus the final
# epoch so that test_pred/test_loss/test_accuracy reflect the fully trained
# model for the cells below. Evaluating on every epoch only added work.
log_every = 100

for epoch in range(epochs):
    # --- Training step (full batch) ---
    model_0.train()

    # Forward pass
    y_logits = model_0(X_train)

    # Turn logits -> prediction probabilities -> prediction labels
    y_pred = torch.argmax(torch.softmax(y_logits, dim=1), dim=1)

    # Loss is computed on the raw logits; accuracy on the predicted labels
    loss = loss_fn(y_logits, y_train)
    accuracy = accuracy_fn(y_pred, y_train)

    # Reset gradients, backpropagate, and apply the gradient descent update
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # --- Evaluation (reported epochs and the last epoch only) ---
    is_log_epoch = epoch % log_every == 0
    if not (is_log_epoch or epoch == epochs - 1):
        continue

    model_0.eval()
    with torch.inference_mode():
        # Predict for test data
        test_logits = model_0(X_test)
        test_pred = torch.argmax(torch.softmax(test_logits, dim=1), dim=1)

        # Calculate test loss/accuracy
        test_loss = loss_fn(test_logits, y_test)
        test_accuracy = accuracy_fn(test_pred, y_test)

    if is_log_epoch:
        print(f"Epoch: {epoch} | Loss: {loss:.5f} Acc: {accuracy*100:.2f}% | Test loss: {test_loss:.5f} Test acc: {test_accuracy*100:.2f}%")



In [None]:
# Compare which class labels occur in the test targets with which labels the
# model actually predicted on the test set.
print(f"Distinct y_test: {list(set(y_test.tolist()))}")
print(f"Distinct test_pred: {list(set(test_pred.tolist()))}")

In [None]:
from torchmetrics import ConfusionMatrix
# plot_confusion_matrix renders the matrix in a nicer way
from mlxtend.plotting import plot_confusion_matrix

# Confusion-matrix metric sized to the model's number of classes.
confmat = ConfusionMatrix(
    task="multiclass",
    num_classes=len(down_pcts) + 1 + len(up_pcts),
)

# Compare the test predictions with the true test labels; both are moved to
# the CPU first.
confmat_tensor = confmat(preds=test_pred.cpu(), target=y_test.cpu())

# Draw the matrix from its NumPy form.
fig, ax = plot_confusion_matrix(conf_mat=confmat_tensor.numpy(), figsize=(10, 7))


In [None]:
try:
  import torchinfo
except:
  !pip install torchinfo
  import torchinfo

from torchinfo import summary  


In [None]:
# Show torchinfo's summary of the model for an input with one value per
# signal window.
summary(model_0, input_size=[len(signal_windows)])