In [1]:
from utils.returns_data_class import ReturnsData
from utils.pointwise_context import get_pointwise_tgt_context_sets

# Experiment configuration shared by the cells below.
PERIODS = [1, 5, 10]  # each value is passed to ReturnsData.change_returns_period
TRAIN_PCT = 1  # value handed to ReturnsData.train_test_split
CONTEXT_SIZE = 32  # context_size forwarded to get_pointwise_tgt_context_sets

# Accumulate the target/context sets generated for every period into one list.
tgt_context_sets = []
for period in PERIODS:
    print(f"Running for period {period}")

    # The returns data is reloaded from disk on every pass -- presumably because
    # change_returns_period modifies the instance (TODO confirm). Note that `data`
    # stays bound to the instance of the LAST period once the loop finishes.
    data = ReturnsData(
        daily_returns_path="Data/returns_df_611.csv",
        extras_path="Data/historical_stocks.csv",
    )
    data.change_returns_period(period)
    data.train_test_split(TRAIN_PCT)

    period_sets = get_pointwise_tgt_context_sets(
        data.train_returns_df,
        context_size=CONTEXT_SIZE,
        verbose=True,
        iqr_noise_reduction=True,
    )
    tgt_context_sets.extend(period_sets)
    print("=" * 20)

total_sets = len(tgt_context_sets)
print(f"Total number of target context sets: {total_sets}")

Running for period 1
No change made because period entered is 1


100%|██████████| 611/611 [00:05<00:00, 107.21it/s]


Running for period 5


100%|██████████| 611/611 [00:01<00:00, 501.73it/s] 


Running for period 10


100%|██████████| 611/611 [00:00<00:00, 1594.46it/s]

Total number of target context sets: 1860195





In [2]:
import pandas as pd
import numpy as np

# Sanity check: list the tickers that appear most often in the context sets
# whose target is a chosen ticker.
ticker = "JPM"
i = data.ticker2idx[ticker]

# Each entry is indexed as [0] -> target index, [1] -> context indices; keep
# only the contexts belonging to the chosen target and flatten them.
matching_contexts = [entry[1] for entry in tgt_context_sets if entry[0] == i]
temp = np.array(matching_contexts).flatten()

# value_counts() sorts by descending frequency, so the first five index labels
# are the five most frequent context indices.
top_indices = pd.Series(temp).value_counts().index[:5]
top_tickers = [data.idx2ticker[idx] for idx in top_indices]
print(
    f"The most commonly co-occurring stocks with {ticker} are: {top_tickers}"
)

The most commonly co-occurring stocks with JPM are: ['C', 'BAC', 'STI', 'WFC', 'PNC']


In [3]:
from models.embedding_models import ClassificationEmbeddings

# Size passed as embedding_dim to the embedding model.
EMBEDDING_DIM = 20

# n_time_series is set to the number of tickers in the loaded returns data.
n_tickers = len(data.tickers)
model = ClassificationEmbeddings(n_time_series=n_tickers, embedding_dim=EMBEDDING_DIM)

In [7]:
from utils.training_helpers import train_embeddings_from_tgt_context_sets

EPOCHS = 15

# Arguments for the training helper. embedding_dim is intentionally not passed
# (it was commented out of the call); the model handed in below was already
# constructed with EMBEDDING_DIM.
training_kwargs = dict(
    n_time_series=len(data.tickers),
    tgt_context_sets=tgt_context_sets,
    model=model,
    epochs=EPOCHS,
    batch_size=64,
    early_stopping=False,
    device="cpu",
    verbose=True,
)

# NOTE(review): `model` is both an input and the rebound output, so re-running
# this cell feeds the previously returned model back in -- presumably continuing
# training from its current weights rather than starting fresh (confirm against
# the helper). Re-create the model first if a clean EPOCHS-epoch run is wanted.
model, losses = train_embeddings_from_tgt_context_sets(**training_kwargs)

Training embeddings...


  7%|▋         | 1/15 [00:47<11:02, 47.34s/it]

Epoch 0: Loss = 0.0978617759630066


 13%|█▎        | 2/15 [01:34<10:15, 47.32s/it]

Epoch 1: Loss = 0.09764747479845606


 20%|██        | 3/15 [02:21<09:25, 47.14s/it]

Epoch 2: Loss = 0.09749775905474192


 27%|██▋       | 4/15 [03:09<08:43, 47.62s/it]

Epoch 3: Loss = 0.0973901599122677


 33%|███▎      | 5/15 [04:02<08:13, 49.33s/it]

Epoch 4: Loss = 0.09731013429681672


 40%|████      | 6/15 [04:55<07:34, 50.54s/it]

Epoch 5: Loss = 0.09724889776340845


 47%|████▋     | 7/15 [05:44<06:42, 50.30s/it]

Epoch 6: Loss = 0.09720112957853716


 53%|█████▎    | 8/15 [06:36<05:54, 50.70s/it]

Epoch 7: Loss = 0.09716317890157147


 60%|██████    | 9/15 [07:32<05:14, 52.39s/it]

Epoch 8: Loss = 0.09713257643450728


 67%|██████▋   | 10/15 [08:25<04:21, 52.39s/it]

Epoch 9: Loss = 0.09710746751289123


 73%|███████▎  | 11/15 [09:14<03:25, 51.48s/it]

Epoch 10: Loss = 0.09708641883958008


 80%|████████  | 12/15 [10:04<02:33, 51.09s/it]

Epoch 11: Loss = 0.09706870245529538


 87%|████████▋ | 13/15 [10:56<01:42, 51.21s/it]

Epoch 12: Loss = 0.09705345810982358


 93%|█████████▎| 14/15 [11:48<00:51, 51.59s/it]

Epoch 13: Loss = 0.09704018618566083


100%|██████████| 15/15 [12:37<00:00, 50.51s/it]

Epoch 14: Loss = 0.09702865603010932





In [11]:
from utils.sector_classification import get_sector_score

# Evaluate the learned embeddings against the sector labels; get_sector_score
# prints precision / recall / F1 / accuracy for the given embedding matrix.
# .cpu() is a no-op for weights already on the CPU, but keeps the NumPy
# conversion working if training is ever run on another device (calling
# .numpy() on a non-CPU tensor raises).
embedding_matrix = model.embeddings.weight.detach().cpu().numpy()
get_sector_score(embedding_matrix, sectors=data.sectors)

Precision Score: 0.7
Recall Score: 0.68
F1 Score: 0.69
Accuracy Score: 0.68
Accuracy Score: 0.68


In [None]:
# Flag for persisting the trained model; it is not read within these lines.
SAVE_MODEL = False
SAVE_PATH_TEMPLATE = "embeddings/abs_diff_E{epochs}_C{context_size}_D{embedding_dim}_P{periods}_train{train_pct}.pt"

# Values substituted into the template so the file name records the run's
# hyper-parameters; the periods are joined with "-" (e.g. [1, 5, 10] -> "1-5-10").
path_fields = {
    "epochs": EPOCHS,
    "context_size": CONTEXT_SIZE,
    "embedding_dim": EMBEDDING_DIM,
    "periods": "-".join(str(p) for p in PERIODS),
    "train_pct": TRAIN_PCT,
}
save_path = SAVE_PATH_TEMPLATE.format(**path_fields)