In [45]:
from utils.returns_data_class import ReturnsData
from utils.pointwise_context import get_contextual_indices

# Constants and configurations
PERIODS = [1, 5, 10]  # each value is passed to ReturnsData.change_returns_period
TRAIN_PCT = 1  # passed to ReturnsData.train_test_split
LOAD_IDX_COMBINATIONS = False  # NOTE(review): not read anywhere in the visible cells
# Matches the 20-epoch budget used by the training cell, so the E{epochs} tag
# in save_path describes the run that is actually performed.
EPOCHS = 20
EMBEDDING_DIM = 20  # width of each embedding vector (see the model cell)
CONTEXT_SIZE = 32  # forwarded to get_contextual_indices as context_size
SAVE_MODEL = False  # NOTE(review): not read anywhere in the visible cells
SAVE_PATH_TEMPLATE = "embeddings/abs_diff_E{epochs}_C{context_size}_D{embedding_dim}_P{periods}_train{train_pct}.pt"
save_path = SAVE_PATH_TEMPLATE.format(
    epochs=EPOCHS,
    context_size=CONTEXT_SIZE,
    embedding_dim=EMBEDDING_DIM,
    periods="-".join(map(str, PERIODS)),
    train_pct=TRAIN_PCT,
)

# Input files, named once so they are easy to repoint.
DAILY_RETURNS_PATH = "Data/returns_df_611.csv"
EXTRAS_PATH = "Data/historical_stocks.csv"

# Pool the (target, context) index combinations produced for every period.
idx_combinations = []
for period in PERIODS:
    # A fresh ReturnsData is loaded for each period before the period change
    # is applied. NOTE(review): presumably change_returns_period mutates the
    # object, which is why it is not hoisted out of the loop -- confirm.
    data = ReturnsData(
        daily_returns_path=DAILY_RETURNS_PATH,
        extras_path=EXTRAS_PATH,
    )
    data.change_returns_period(period)
    data.train_test_split(TRAIN_PCT)

    idx_combinations.extend(
        get_contextual_indices(
            data.train_returns_df,
            context_size=CONTEXT_SIZE,
            verbose=True,
            iqr_noise_reduction=True,
        )
    )

# `data` from the final iteration stays in scope; later cells use it for
# ticker lookups (ticker2idx / idx2ticker / tickers / sectors).

No change made because period entered is 1
16 627 152.75 458.25


100%|██████████| 611/611 [00:07<00:00, 85.72it/s] 


16 627 152.75 458.25


100%|██████████| 611/611 [00:01<00:00, 477.71it/s]


16 627 152.75 458.25


100%|██████████| 611/611 [00:00<00:00, 1556.14it/s]


In [50]:
import numpy as np
import pandas as pd

# Sanity check: which tickers appear most often in the contexts of one target?
QUERY_TICKER = "JPM"
TOP_K = 10

query_idx = data.ticker2idx[QUERY_TICKER]

# Each entry of idx_combinations is indexed as (target index, context indices);
# gather every context index recorded for the query ticker into one flat array.
context_idx = np.array(
    [combo[1] for combo in idx_combinations if combo[0] == query_idx]
).flatten()

# value_counts() orders by descending frequency, so the first TOP_K index
# labels are the most frequent context members; only those are mapped back
# to ticker symbols.
top_context_idx = pd.Series(context_idx).value_counts().index[:TOP_K]
[data.idx2ticker[idx] for idx in top_context_idx]

['C', 'BAC', 'STI', 'WFC', 'PNC', 'HBAN', 'ZION', 'STT', 'KEY', 'CMA']

In [46]:
import torch
import torch.nn as nn
from models.base_model import BaseModel


class ClassificationEmbeddings(BaseModel):
    """
    CBOW-style (Word2Vec-like) embedding model adapted for stock modelling.

    One embedding matrix is used twice: its rows are looked up and averaged to
    form the hidden representation, and the same matrix then scores that
    representation against every time series.
    """

    def __init__(self, n_time_series: int, embedding_dim: int):
        """
        Args:
            n_time_series: number of rows in the embedding table.
            embedding_dim: length of each embedding vector.
        """
        super().__init__()
        self.embeddings = nn.Embedding(n_time_series, embedding_dim)

    def forward(self, inputs):
        """
        Score every time series against the averaged context embedding.

        Args:
            inputs: integer index tensor of context members.
                Assumes a 2-D (batch, context) layout -- TODO confirm with the
                training helper.

        Returns:
            Log-softmax scores normalised over dim 1. For 2-D inputs the
            result has shape (batch, n_time_series, 1). Per the original
            author's note, the output is meant to be fed to NLLLoss.
        """
        # Row lookup in the embedding matrix; equivalent to W^T x_i in
        # "word2vec Parameter Learning Explained".
        context_vectors = self.embeddings(inputs)

        # Hidden layer: plain mean over the context axis, with a trailing
        # singleton axis added so it can be matrix-multiplied below.
        pooled = context_vectors.mean(dim=1).unsqueeze(2)

        # Dot product of the hidden vector with every embedding row
        # (the weight matrix broadcasts over the batch axis).
        scores = self.embeddings.weight @ pooled

        return nn.functional.log_softmax(scores, dim=1)

In [47]:
# Seed torch's global RNG so the random initialisation of the embedding table
# is identical on every Restart & Run All.
# NOTE(review): any randomness inside the training helper is not visible from
# this notebook; confirm whether it needs its own seeding.
torch.manual_seed(0)

# Untrained model with one embedding row per ticker.
model = ClassificationEmbeddings(
    n_time_series=len(data.tickers), embedding_dim=EMBEDDING_DIM
)

In [48]:
from utils.training_helpers import train_embeddings_from_idx_combinations

# Train on the pooled index combinations. The epoch budget comes from the
# EPOCHS config constant, not a local literal, so the run always agrees with
# the E{epochs} tag encoded in save_path. The recorded output shows the helper
# can stop before the budget is exhausted (early stopping).
# NOTE(review): `model` is both input and output, so re-running this cell
# without re-creating the model presumably continues from the trained weights.
model, losses = train_embeddings_from_idx_combinations(
    n_time_series=len(data.tickers),
    idx_combinations=idx_combinations,
    model=model,
    epochs=EPOCHS,
    verbose=True,
)

Training embeddings...


  5%|▌         | 1/20 [00:40<12:58, 40.96s/it]

Epoch 0: Loss = 0.10374109499065473


 10%|█         | 2/20 [01:21<12:14, 40.83s/it]

Epoch 1: Loss = 0.10121887932745742


 15%|█▌        | 3/20 [02:02<11:32, 40.76s/it]

Epoch 2: Loss = 0.0995641483254468


 20%|██        | 4/20 [02:43<10:51, 40.74s/it]

Epoch 3: Loss = 0.09868489761125318


 25%|██▌       | 5/20 [03:23<10:11, 40.76s/it]

Epoch 4: Loss = 0.09819312226299613


 30%|███       | 6/20 [04:05<09:32, 40.89s/it]

Epoch 5: Loss = 0.09788414273028445


 35%|███▌      | 7/20 [04:46<08:52, 40.97s/it]

Epoch 6: Loss = 0.0976704512347089


 35%|███▌      | 7/20 [05:27<10:07, 46.76s/it]

Epoch 7: Loss = 0.09751443594345513
Early stopping at epoch 7 due to minimal loss reduction.





In [53]:
from utils.sector_classification import get_sector_score

# Evaluate the learned embeddings against the known sector labels.
# detach() drops the autograd graph; cpu() is a no-op for CPU weights but
# keeps the numpy conversion working if the model was trained on another device.
embedding_matrix = model.embeddings.weight.detach().cpu().numpy()
get_sector_score(embedding_matrix, sectors=data.sectors)

Precision Score: 0.6
Recall Score: 0.58
F1 Score: 0.59
Accuracy Score: 0.58


  _warn_prf(average, modifier, msg_start, len(result))
