# Model Test
Uses a pre-trained model to make predictions for dates that occur after the last date included in the training data.

In [None]:
# Enables project modules auto reloading when changed:
# with mode 2, imported modules are reloaded before each cell executes, so edits
# to the project's .py files take effect without restarting the kernel
%load_ext autoreload
%autoreload 2

In [None]:
# Picks the compute device: "cuda" when PyTorch can see a GPU, otherwise "cpu"
import torch

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Device: {device}")

In [None]:
# Registers the project root on sys.path so the project's packages can be
# imported from this notebook.
# Assumes the kernel's working directory is the notebook's folder and that this
# folder sits 2 levels below the project root folder.
import sys
from pathlib import Path

current_path = Path.cwd()
project_root = str(current_path.parent.parent)
# Only register once, so re-running the cell does not add duplicate entries
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)
print("Updated PATH: ", sys.path)

In [None]:
import settings
import apis.tiingo_api as tiingo

# Reads the Tiingo API key from the project settings (kept out of the notebook)
# and builds the API client used by the download cells
secret_key= settings.get_secret("tiingo-key")
client = tiingo.TiingoAPI(secret_key)

### SET PARAMETERS:

In [None]:
from datetime import datetime
import utils.text_utils as tu
import utils.list_utils as lu

# File name of the trained model to test. The tu.extract_* / tu.is_predict_up
# helpers below read the ticker, horizon and percentages back out of this name,
# so the test configuration always follows the model being loaded.
# (Plain string: it contains no placeholders, so no f-prefix is needed.)
MODEL_NAME = "2024-10-24-1549-TSLA-predictUP-dates20190102-20231229-days3-down300-up300-in16-hid12-pos_weight9186-prec9580pct-fp25tp570-high_prec.pth"

# Date range requested by the "new load" cell (TiingoRepo).
# Exactly one START_DATE may be active; the alternatives are kept as comments
# instead of being assigned and immediately overwritten.
# START_DATE = datetime(2023, 8, 15)   # Train END_DATE = (2024, 1, 1) - PRICE_AVGS = 2584
START_DATE = datetime(2023, 10, 14)   # Train END_DATE = (2024, 1, 1) - PRICE_AVGS = 2584
# START_DATE = datetime(2023, 4, 21)   # PRICE_AVGS = 4181
# START_DATE = datetime(2023, 1, 2)   # PRICE_AVGS = 6765
END_DATE = datetime(2024, 7, 5)

# The ticker is taken from the model name; the manual list is kept for reference only
# TICKERS = ["NTLA", "BEAM", "CRSP", "PACB", "EDIT", "VERV", "PRME"]
TICKERS = [tu.extract_company(MODEL_NAME)]
DATA_INTERVAL_MINUTES = 15   # (Set to 5 or 15)
DATA_AFTER_HOURS = False

# Prediction horizon (days) and down/up percentages encoded in the model name
DAYS_PREDICT = tu.extract_days(MODEL_NAME)
DOWN_PCTS_PREDICT = tu.extract_pcts(MODEL_NAME, "down")
UP_PCTS_PREDICT = tu.extract_pcts(MODEL_NAME, "up")

# Averaging windows for the price signals (consecutive Fibonacci numbers).
# The largest window is the amount of history consumed before the first usable sample.
PRICE_AVGS = [
    2, 3, 5, 8, 13, 21, 34, 55,
    89, 144, 233, 377, 610, 987, 1597, 2584,
    # 4181, # 255 working days
    # 6765 # 364 working days
]

# Averaging windows for the volume signals
VOLUME_AVGS = [2, 3, 5, 8, 13, 21, 34]

# Direction the loaded model was trained to predict (read from the model name)
PREDICT_UP = tu.is_predict_up(MODEL_NAME)
# INDEX_KEEP is the class treated as the positive target;
# INDEX_REMOVE_A / INDEX_REMOVE_B are the two remaining classes.
if PREDICT_UP:
    INDEX_KEEP, INDEX_REMOVE_A, INDEX_REMOVE_B = 2, 0, 1
else:
    INDEX_KEEP, INDEX_REMOVE_A, INDEX_REMOVE_B = 0, 1, 2

# Quotes per day for each supported interval: 26 with 15-minute quotes, 78 with 5-minute quotes.
# An unsupported interval is rejected instead of silently being treated as 5-minute data.
TICKS_IN_DAY_BY_INTERVAL = {15: 26, 5: 78}
if DATA_INTERVAL_MINUTES not in TICKS_IN_DAY_BY_INTERVAL:
    raise ValueError(
        f"Unsupported DATA_INTERVAL_MINUTES={DATA_INTERVAL_MINUTES}; "
        f"expected one of {sorted(TICKS_IN_DAY_BY_INTERVAL)}"
    )
TICKS_IN_DAY = TICKS_IN_DAY_BY_INTERVAL[DATA_INTERVAL_MINUTES]
# Number of data ticks inspected to decide whether the price moved up or down by the percentage
TICKS_PREDICT = TICKS_IN_DAY * DAYS_PREDICT
# Reach used to derive the EWA alpha for the prediction window
REACH_PCT = 0.95

# Base width of the network; must match the width the saved model was trained with
HIDDEN_UNITS = 12

# Sigmoid output above which a sample is predicted positive
TEST_THRESHOLD = 0.99



In [None]:
# Displays the ticker list derived from MODEL_NAME as a quick sanity check
TICKERS

# Load training data (!!!ONLY TO VERIFY):
Use this section to verify results: compare them with the results obtained during training.
If you do not need to verify against the training data, skip to the step that loads the test data.

In [None]:

### TRAINING DATA
def _download_year(year):
    """Downloads quotes for TICKERS[0] from Jan 1 of `year` to Jan 1 of the following year.

    Uses the notebook's DATA_INTERVAL_MINUTES and DATA_AFTER_HOURS settings and
    returns whatever client.download_ticker returns (CSV text, parsed by the next cell).
    """
    return client.download_ticker(
        TICKERS[0],
        datetime(year, 1, 1),
        datetime(year + 1, 1, 1),
        DATA_INTERVAL_MINUTES,
        DATA_AFTER_HOURS,
    )


# One download per training year, kept in separate variables for the parsing cell
csv_data2019 = _download_year(2019)
csv_data2020 = _download_year(2020)
csv_data2021 = _download_year(2021)
csv_data2022 = _download_year(2022)
csv_data2023 = _download_year(2023)
# csv_data2024= client.download_ticker(TICKERS[0], datetime(2024, 1, 1), datetime(2024,2,1), DATA_INTERVAL_MINUTES, DATA_AFTER_HOURS)

In [None]:
import io
import pandas as pd

# ### TRAINING DATA
# Parses each year's CSV text into its own DataFrame
df2019, df2020, df2021, df2022, df2023 = (
    pd.read_csv(io.StringIO(csv_text))
    for csv_text in (csv_data2019, csv_data2020, csv_data2021, csv_data2022, csv_data2023)
)

# Starts from the first year that has rows; only 2019 and 2020 are checked,
# and 2021 is the fallback start regardless of its content
frames_by_year = {2019: df2019, 2020: df2020, 2021: df2021, 2022: df2022, 2023: df2023}
first_year = next(
    (year for year in (2019, 2020) if not frames_by_year[year].empty),
    2021,
)
print(f"Concatenating from {first_year}")
df = pd.concat(
    [frame for year, frame in frames_by_year.items() if year >= first_year],
    axis=0,
    ignore_index=True,
)

# if not df2024.empty:
#     print("Concatenating from 2024")
#     df = pd.concat([df, df2024], axis=0, ignore_index=True)

## LOADING TEST DATA

In [None]:

# The first prediction needs the largest averaging window of history:
# 2584 ticks / 26 ticks per day ~ 100 trading days -> 100 * 7 / 5 / 30 ~ 4.6 calendar months
# NOTE(review): these dates are hard-coded and independent of START_DATE / END_DATE - confirm that is intended
csv_data2023 = client.download_ticker(
    TICKERS[0],
    datetime(2023, 8, 1),
    datetime(2023, 12, 31),
    DATA_INTERVAL_MINUTES,
    DATA_AFTER_HOURS,
)
csv_data2024 = client.download_ticker(
    TICKERS[0],
    datetime(2024, 1, 1),
    datetime(2024, 5, 25),
    DATA_INTERVAL_MINUTES,
    DATA_AFTER_HOURS,
)


In [None]:
import io
import pandas as pd

### TEST PREDICTIONS ON RECENT DATA
# Parses both CSV downloads, then stacks them (2023 first) under a fresh row index
df2023, df2024 = (
    pd.read_csv(io.StringIO(csv_text)) for csv_text in (csv_data2023, csv_data2024)
)
df = pd.concat([df2023, df2024], axis=0, ignore_index=True)


In [None]:
# Checks the concatenation result: dates must be ascending and free of duplicates
date_column = df["date"]
if not (date_column.is_monotonic_increasing and date_column.is_unique):
    print("Error: DataFrame is not in ascending order.")
else:
    print("Correct: DataFrame is in ascending order.")


In [None]:
# Shows the first and last rows so the covered date range can be checked by eye
date_close = df[['date', 'close']]
print(f"Test data first:\n{date_close.head(5)}")
print(f"Test data last:\n{date_close.tail(5)}")

In [None]:
import classifiers.up_down_classifier as udc
import classifiers.ewa_classifier as ec

# EWA smoothing factor derived from the prediction window and the reach percentage
alpha = ec.calculate_ewa_alpha(TICKS_PREDICT, REACH_PCT)
print(f"alpha: {alpha:.4f} for window: {TICKS_PREDICT} and reach: {REACH_PCT}")

classes_calc = udc.UpsDownsClassifier(TICKS_PREDICT, DOWN_PCTS_PREDICT, UP_PCTS_PREDICT)

# Close prices as plain floats, then their exponentially weighted averages
close_prices = df['close'].astype(float).tolist()
input_data = ec.calculate_ewas(close_prices, alpha)

# One class label per tick, computed on the smoothed prices
classes = classes_calc.classify(input_data)
print(f"Check correct '-1' point (window={TICKS_PREDICT}): {classes[-TICKS_PREDICT-1:-TICKS_PREDICT+1]}")

# Fixed sample window printed for visual inspection
inspect_window = slice(2650, 3000)
print(f"prices vs input_data: {list(zip(close_prices[inspect_window], input_data[inspect_window]))}")
print(f"input_data vs classes: {list(zip(input_data[inspect_window], classes[inspect_window]))}")


In [None]:
# Histogram of the class labels, with each bar's count written at its top

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
hist_values, bin_edges, _ = ax.hist(classes, bins=4, edgecolor='black')

ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of Data')

# Label each bar at its left edge and at the height of its count
for edge, value in zip(bin_edges[:-1], hist_values):
    ax.text(float(edge), float(value), str(int(value)), color='black')

plt.show()

In [None]:
# Show percentages of each class value
# (computed over all classes, i.e. before the "-1" entries are trimmed in a later cell)
lu.display_frequency_classes(classes, DOWN_PCTS_PREDICT, UP_PCTS_PREDICT)

In [None]:

# Line plot of the class label over the most recent ticks (last 500)
graph_ticks = 500
recent_classes = classes[-graph_ticks:]
x = range(len(recent_classes))

plt.figure(figsize=(20,5))
plt.plot(x, recent_classes, linestyle='-')

plt.title('Plot of Classes')
plt.xlabel('Index')
plt.ylabel('Class')

plt.show()

In [None]:

# Calculate the signals as input for the neural network as proportions
import preprocessing.proportions_calc as proportions

# Calculator configured with every averaging window in PRICE_AVGS
signals_calculator = proportions.ProportionsCalc(PRICE_AVGS)

# The following cells index the result per window first
# (proportions_avg[0] is reported together with PRICE_AVGS[0]) -
# presumably one series per window; verify against ProportionsCalc
proportions_avg = signals_calculator.calculate(close_prices)


In [None]:
# Sanity checks on lengths and on the valid (non "-1") region of classes and proportions
print(f"Prices length: {len(close_prices)}")
print(f"Proportions length: {len(proportions_avg[-1])}")

print(f"Last 10 close: {close_prices[-10:]}")
print(f"Last 10 proportions(avg={PRICE_AVGS[0]}): {proportions_avg[0][-10:]}")

print(f"Proportions avgs: Length: {len(PRICE_AVGS)} Last: {PRICE_AVGS[-1]}")
# At the end of the data there are fewer ticks than the prediction window, so no prediction is possible: "-1"
print(f"Classes last non-negative-1: {classes[-TICKS_PREDICT-1:-TICKS_PREDICT+1]} len: {len(classes)}")
print(f"Proportions first non-negative-1(avg={PRICE_AVGS[-1]}): {proportions_avg[-1][PRICE_AVGS[-1]-2:PRICE_AVGS[-1]]} len: {len(proportions_avg[-1])}")
# Min and Max are taken over the same slice: from index (window - 1) up to, but excluding,
# the last TICKS_PREDICT ticks. Previously Min stopped one element earlier than Max.
print(f"Proportions (avg={PRICE_AVGS[0]}) Min: {min(proportions_avg[0][PRICE_AVGS[0]-1:-TICKS_PREDICT])} Max: {max(proportions_avg[0][PRICE_AVGS[0]-1:-TICKS_PREDICT])}")
print(f"Proportions (avg={PRICE_AVGS[-1]}) Min: {min(proportions_avg[-1][PRICE_AVGS[-1]-1:-TICKS_PREDICT])} Max: {max(proportions_avg[-1][PRICE_AVGS[-1]-1:-TICKS_PREDICT])}")

In [None]:
# Trims the "-1" padding from classes and proportions:
#   - at the beginning, the first PRICE_AVGS[-1] - 1 entries lack enough history for the largest average
#   - at the end, the last TICKS_PREDICT entries have a prediction period that is too short
first_valid = PRICE_AVGS[-1] - 1
targets = classes[first_valid:-TICKS_PREDICT]
inputs = [proportion[first_valid:-TICKS_PREDICT] for proportion in proportions_avg]
# First two values of every trimmed series, for a visual check
for proportion_cut in inputs:
    print(proportion_cut[:2])

print(f"First target: {targets[0]} and last target: {targets[-1]}")
print(f"Classes: {len(classes)} after cut to targets: {len(targets)}")
print(f"Inputs len: {len(inputs[len(PRICE_AVGS)-1])}")
print(f"Distinct targets: {list(set(targets))}")


In [None]:
from collections import Counter

print("Test data: '-1' removed from begining and end")
lu.display_frequency_classes(targets, DOWN_PCTS_PREDICT, UP_PCTS_PREDICT)

targets_frequency = Counter(targets)
print("VALIDATE removing should be POSITIVE?")
# For each non-kept class: its count minus the kept-class count, plus half the kept-class count (floored)
kept_count = targets_frequency[INDEX_KEEP]
kept_minus_half = kept_count - kept_count // 2
count_remove_a = targets_frequency[INDEX_REMOVE_A] - kept_minus_half
count_remove_b = targets_frequency[INDEX_REMOVE_B] - kept_minus_half
print(f"Removing {INDEX_REMOVE_A}: {count_remove_a}")
print(f"Removing {INDEX_REMOVE_B}: {count_remove_b}")


In [None]:
###################
##### SET PARAMETERS
###################
# Optional balancing step: every index listed in indexes_remove is dropped from
# the targets and from each input series. Both lists are currently left empty,
# so nothing is removed.

# DECISION SET: REMOVING?
indexes_remove_a = []
# 2024-03-01 Do not remove anything
# if count_remove_a > 0:
#     indexes_remove_a = get_indexes_value(targets, index_remove_a, count_remove_a)

# DECISION SET: REMOVING?
indexes_remove_b = []
# 2024-03-01 Do not remove anything
# if count_remove_b > 0:
#     indexes_remove_b = get_indexes_value(targets, index_remove_b, count_remove_b)

indexes_remove = indexes_remove_a + indexes_remove_b
targets_clean = lu.remove_indexes(targets, indexes_remove)

lu.display_frequency_classes(targets_clean, DOWN_PCTS_PREDICT, UP_PCTS_PREDICT)
print(f"Targets len: {len(targets)} Targets clean: {len(targets_clean)} Difference: {len(targets)-len(targets_clean)}")

# Apply the same removal to every input series so inputs stay aligned with the targets
inputs_clean = [lu.remove_indexes(input_series, indexes_remove) for input_series in inputs]

# .index() raises ValueError when the kept class never occurs in the data,
# so check membership first instead of aborting the notebook on a diagnostic print
if INDEX_KEEP in targets_clean:
    first_keep = targets_clean.index(INDEX_KEEP)
    last_keep_from_end = targets_clean[::-1].index(INDEX_KEEP)
    print(f"targets_clean positions(Keep={INDEX_KEEP})(First:{first_keep},Last:-{last_keep_from_end})")
else:
    print(f"targets_clean positions(Keep={INDEX_KEEP}): class not present in targets_clean")


# HERE NEW LOAD
## Loads data for multiple tickers across multiple years with a single call

In [None]:
from apis.tiingo_api import TiingoAPI
from data_sources.data_loader import DataLoader
from data_sources.tiingo_repo import TiingoRepo

# Builds the Tiingo client -> repository -> loader chain and loads every ticker
# in TICKERS for the START_DATE..END_DATE range configured in the parameters cell
tiingo_client = TiingoAPI(secret_key)
tiingo_repo= TiingoRepo(
    tiingo_client, 
    START_DATE, 
    END_DATE, 
    DATA_INTERVAL_MINUTES, 
    DATA_AFTER_HOURS,
    # presumably a pause between API requests; unit not visible here - verify against TiingoRepo
    wait_time=5)
data_loader= DataLoader(tiingo_repo)
# The following cells index the result per ticker: new_data[0] is the frame for TICKERS[0]
new_data= data_loader.load_data(TICKERS)

In [None]:
# Checks the first loaded frame: dates must be ascending and free of duplicates
loaded_frame = new_data[0]
loaded_dates = loaded_frame["date"]
if not (loaded_dates.is_monotonic_increasing and loaded_dates.is_unique):
    print("Error: DataFrame is not in ascending order.")
else:
    print("Correct: DataFrame is in ascending order.")

# First/last rows for a visual check; the full frame is also written to output.csv
loaded_date_close = loaded_frame[['date', 'close']]
print(f"Data first:\n{loaded_date_close.head(5)}")
print(f"Data last:\n{loaded_date_close.tail(5)}")
loaded_frame.to_csv('output.csv', index=False)
print(f"Len struct_data[0]: {len(loaded_frame)}")

duplicate_count = loaded_dates.duplicated().sum()
print(f"Number of duplicate values: {duplicate_count}")

In [None]:
from preprocessing.points_features_extractor_price_time import PointsFeaturesExtractorPriceTime
from preprocessing.points_target_extractor import PointsTargetExtractor
from preprocessing.features_targets_pair import FeaturesTargetsPair

# Feature extractor configured with the price averaging windows
points_features_extractor = PointsFeaturesExtractorPriceTime(PRICE_AVGS)
# Target extractor configured with the prediction window, EWA reach and down/up percentages
points_target_extractor = PointsTargetExtractor(TICKS_PREDICT, REACH_PCT, DOWN_PCTS_PREDICT, UP_PCTS_PREDICT)
features_targets_pair = FeaturesTargetsPair(points_features_extractor, points_target_extractor)
# NOTE: rebinds `targets`; from here on it holds the targets produced for new_data,
# not the list built in the earlier "-1" trimming cell
features, targets = features_targets_pair.align(new_data)

In [None]:
# Sets 'index_keep' as target = 1 and rest of indexes to target=0
targets_binary = lu.convert_binary(targets, INDEX_KEEP)
# .index(True) raises ValueError when the period contains no positive target,
# so report that case explicitly instead of aborting the notebook
if True in targets_binary:
    print(f"targets_binary First {targets_binary.index(True)} and Last(counting from end) {targets_binary[::-1].index(True)} position with True")
else:
    print("targets_binary has no position with True (no positive targets in this period)")
print(f"targets_binary len: {len(targets_binary)} Features first: {len(features[0])} Features last: {len(features[-1])}")

In [None]:
import torch

# `features` holds one sequence per feature; the transpose puts samples on rows
# and features on columns, which is the layout the model's first layer is sized from
features_by_series = torch.Tensor(features)
print(f"inputs_tensor: {features_by_series.shape}")
features_tensor = features_by_series.T
print(f"inputs_tensor: {features_tensor.shape}")

# Binary targets as a float tensor
targets_tensor = torch.Tensor(targets_binary)
print(f"inputs_clean len0 x len1: {len(features)} x {len(features[0])} -> inputs_tensor.shape: {features_tensor.shape}")
print(f"targets_binary.shape: {len(targets_binary)} -> targets_tensor.shape: {targets_tensor.shape}")
print(f"inputs_tensor: {features_tensor}")
print(f"targets_tensor: {targets_tensor}")

In [None]:

# The complete loaded dataset is used as the test set (no split is made in this notebook)
features_tensor_test, targets_tensor_test= features_tensor, targets_tensor

# Frequency of each target value in the test set
print("Test dataset frequencies:")
lu.display_frequency_values(targets_tensor_test.tolist())



In [None]:
from typing import Tuple
import torch
from torch.utils.data import Dataset

class StockDataset(Dataset):
    """Map-style dataset that pairs each input row with the target at the same index.

    Args:
        inputs: Indexable collection of input vectors (a tensor with one row per sample here).
        targets: Indexable collection of targets with the same length as `inputs`.

    Raises:
        ValueError: If `inputs` and `targets` have different lengths.
    """

    def __init__(
        self,
        inputs,
        targets):

        # A length mismatch would silently misalign samples or fail later inside a batch
        if len(inputs) != len(targets):
            raise ValueError(
                f"inputs and targets must have the same length, got {len(inputs)} and {len(targets)}"
            )

        self.inputs = inputs
        self.targets = targets

    def __len__(self) -> int:
        """Returns the number of samples."""
        return len(self.targets)

    def __getitem__(self, index: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Returns the (input, target) pair stored at `index`."""
        return self.inputs[index], self.targets[index]


In [None]:
# Imported under an alias: the notebook already imports the project's own `DataLoader`
# (data_sources.data_loader) in the data-loading cell, and importing torch's class
# under the same name would rebind it for the rest of the session
from torch.utils.data import DataLoader as TorchDataLoader

# TODO: When executing only using 33-38% GPU - Try different BATCH_SIZE see if parallelism increases? Learning decreases because less batches?
BATCH_SIZE = 32

test_dataset = StockDataset(
    features_tensor_test,
    targets_tensor_test
)

print(f"First input vector:\n{test_dataset[0]}")

# shuffle=False keeps the samples in dataset order
test_dataloader = TorchDataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False
)

# Peek at the first batch to check shapes and values
test_input0, test_target0 = next(iter(test_dataloader))
print(f"Dataloader batch={BATCH_SIZE}\nInput:\n{test_input0}\nTargets:\n{test_target0}")


In [None]:
# EXECUTE FROM THIS STEP To CREATE A NETWORK WITH RANDOM WEIGHTS

import torch
from torch import nn

class StockModelBinaryV0(nn.Module):
    """Feed-forward binary classifier that outputs one raw logit per sample.

    The hidden layers narrow from hidden_units*16 to hidden_units (x16, x8, x4, x1),
    each followed by LeakyReLU(negative_slope=0.1); the last layer maps to a single output.
    No sigmoid is applied here, so callers apply it to obtain probabilities.
    """

    def __init__(self, input_features, hidden_units):
        """Builds the layer stack.

        Args:
            input_features: Number of features in each input vector.
            hidden_units: Base width; the hidden layers use 16x, 8x, 4x and 1x this value.
        """
        super().__init__()
        widths = [input_features] + [hidden_units * factor for factor in (16, 8, 4, 1)]
        layers = []
        # Layers are created in the same order and positions as a hand-written stack,
        # so the state_dict keys (linear_layer_stack.0, .2, .4, .6, .8) stay the same
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(in_features=fan_in, out_features=fan_out))
            layers.append(nn.LeakyReLU(negative_slope=0.1))
        layers.append(nn.Linear(in_features=hidden_units, out_features=1))
        self.linear_layer_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Applies the sequential stack to `x` and returns the logits."""
        return self.linear_layer_stack(x)

# Create an instance of the model:
# the input width is the number of columns of features_tensor, and the
# freshly initialised network is moved to the selected device
model_0 = StockModelBinaryV0(
  input_features=features_tensor.shape[1],
  hidden_units=HIDDEN_UNITS).to(device)


In [None]:
# Loads the trained weights from models/<MODEL_NAME> into model_0
from pathlib import Path

# Create directory, if it doesn't exist, to store models
MODEL_PATH = Path("models")
MODEL_PATH.mkdir(parents=True, exist_ok=True)
MODEL_SAVE_PATH = MODEL_PATH / MODEL_NAME

# map_location remaps the stored tensors onto the current device; without it a
# checkpoint saved on a GPU cannot be loaded on a CPU-only machine.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
model_0.load_state_dict(torch.load(f=MODEL_SAVE_PATH, map_location=device))
model_0.to(device)

print(f"Test model loaded: {MODEL_NAME}")

In [None]:
from torchmetrics import ConfusionMatrix, Accuracy, Precision
from mlxtend.plotting import plot_confusion_matrix

# Runs the model over the whole test set and turns the outputs into 0/1 predictions
model_0.eval()
with torch.inference_mode():
    X = features_tensor_test.to(device)
    y = targets_tensor_test.to(device)

    # One logit per sample -> probability -> positive only above TEST_THRESHOLD
    test_logits = model_0(X).view(-1)
    sigmoid_output = test_logits.sigmoid()
    test_pred = sigmoid_output.gt(TEST_THRESHOLD).float()

# Confusion matrix of the predictions against the test targets (computed on CPU)
confmat = ConfusionMatrix(task='binary')
confmat_tensor = confmat(preds=test_pred.cpu(), target=targets_tensor_test.cpu())

# Plot confusion matrix
fig, ax = plot_confusion_matrix(conf_mat=confmat_tensor.numpy(), figsize=(10, 7))

# Metrics are evaluated on the same device as the predictions
accuracy_fn = Accuracy(task='binary').to(device)
precision_fn = Precision(task='binary').to(device)
test_accuracy = accuracy_fn(test_pred, y)
test_precision = precision_fn(test_pred, y)

# Cell [0, 1] is read as false positives and cell [1, 1] as true positives
false_positives = confmat_tensor[0, 1].item()
true_positives = confmat_tensor[1, 1].item()

print(f"Test threshold: {TEST_THRESHOLD}")
print(f"Test confusion matrix:\n{confmat_tensor}")
print(f"Test Accuracy: {test_accuracy*100:.2f}%")
print(f"Test Precision: {test_precision*100:.2f}%")
print(f"Test false_positives: {false_positives} true_positives: {true_positives}")

In [None]:
# Example of threshold adjustment after model training
from sklearn.metrics import precision_recall_curve
import numpy as np

# Scores the test set again and searches the precision/recall curve for a threshold
model_0.eval()
with torch.no_grad():
    X = features_tensor_test.to(device)
    y = targets_tensor_test.to(device)

    # The model emits one logit per sample with shape (N, 1); flatten to (N,) so the
    # scores are 1-D like the targets, matching the evaluation cell and the 1-D
    # score array that precision_recall_curve documents
    logits = model_0(X).view(-1)
    probs = torch.sigmoid(logits).cpu().numpy()
    y_cpu = y.cpu().numpy()

precision, recall, thresholds = precision_recall_curve(y_cpu, probs)

# Find the threshold that gives the highest precision.
# precision has one more entry than thresholds (a final value with no threshold), hence [:-1]
optimal_idx = np.argmax(precision[:-1])
optimal_threshold = thresholds[optimal_idx]
print("Optimal threshold: ", optimal_threshold)

#### Summary

In [None]:
# Prints the full configuration and results of this run in one place
print("======TEST:")
print(f"Trained model: {MODEL_NAME}")
print(f"Ticker: {TICKERS}")

# Positional access for both ends: ['date'][0] is a label lookup and would fail
# (or pick another row) if the frame's index does not start at 0
print(f"Data start: {new_data[0]['date'].iloc[0]} end: {new_data[0]['date'].iloc[-1]}")
print(f"Data Interval: {DATA_INTERVAL_MINUTES} - After Hours: {DATA_AFTER_HOURS}")

print(f"EWA Reach: {REACH_PCT}")
print(f"Price Averages: {PRICE_AVGS}")
print(f"points_features_extractor: {points_features_extractor.__class__.__name__}")
print(f"points_target_extractor: {points_target_extractor.__class__.__name__}")
print(f"Predict {'UP' if PREDICT_UP else 'DOWN'} - days: {DAYS_PREDICT} Down pcts: {DOWN_PCTS_PREDICT} Up pcts: {UP_PCTS_PREDICT}")

# `targets` here is the multi-class list returned by the features/targets alignment cell
print("Targets Frequencies:")
lu.display_frequency_classes(targets, DOWN_PCTS_PREDICT, UP_PCTS_PREDICT)

print("Test dataset frequencies:")
lu.display_frequency_values(targets_tensor_test.tolist())

print("--Inference")
print(f"Network hidden units: {HIDDEN_UNITS}")
print(f"== Test threshold: {TEST_THRESHOLD}")

print("--Inference Results")
print(f"Test confusion matrix:\n{confmat_tensor}")
print(f"Test accuracy: {test_accuracy*100:.2f}%")
print(f"Test precision: {test_precision*100:.2f}%")
print(f"Test false_positives: {false_positives} true_positives: {true_positives}")
