In [1]:
from utils.returns_data_class import ReturnsData
from utils.pointwise_context import get_contextual_indices

# Experiment configuration
PERIODS = [1, 5, 10]  # each value is passed to ReturnsData.change_returns_period
TRAIN_PCT = 1  # argument handed to ReturnsData.train_test_split
CONTEXT_SIZE = 32  # context_size argument of get_contextual_indices

# Pool the target/context sets produced for every period into a single list.
tgt_context_sets = []
for period in PERIODS:
    # A fresh ReturnsData is loaded for each period before it is re-periodised
    # and split, so no state is carried over between iterations.
    data = ReturnsData(
        daily_returns_path="Data/returns_df_611.csv",
        extras_path="Data/historical_stocks.csv",
    )
    data.change_returns_period(period)
    data.train_test_split(TRAIN_PCT)

    period_sets = get_contextual_indices(
        data.train_returns_df,
        context_size=CONTEXT_SIZE,
        verbose=True,
        iqr_noise_reduction=True,
    )
    tgt_context_sets.extend(period_sets)

# NOTE: `data` intentionally stays bound to the last period's ReturnsData;
# later cells read its ticker mappings and sector labels.
print(f"Number of target context sets: {len(tgt_context_sets)}")

No change made because period entered is 1


100%|██████████| 611/611 [00:05<00:00, 108.83it/s]
100%|██████████| 611/611 [00:01<00:00, 478.52it/s] 
100%|██████████| 611/611 [00:00<00:00, 1749.53it/s]


In [2]:
import pandas as pd
import numpy as np
# Inspect which stocks most often appear in the context sets of one ticker.
ticker = "JPM"
N_TOP = 5  # how many co-occurring tickers to report

# NOTE: `data` is the ReturnsData instance left bound by the last iteration of
# the period loop; only its ticker <-> index mappings are used here.
ticker_idx = data.ticker2idx[ticker]

# Every entry of tgt_context_sets is indexed as entry[0] (target index) and
# entry[1] (its context indices). Collect all context indices recorded for the
# chosen target into one flat array.
context_idxs = np.array(
    [entry[1] for entry in tgt_context_sets if entry[0] == ticker_idx]
).flatten()

# value_counts() orders by descending frequency, so the first N_TOP index
# values are the most frequent context members. Slice before mapping back to
# ticker symbols so only the reported entries are looked up.
top_idxs = pd.Series(context_idxs).value_counts().index[:N_TOP]
top_tickers = [data.idx2ticker[idx] for idx in top_idxs]

# Use the `ticker` variable in the message so the label always matches the
# stock that was actually analysed.
print(f"The most commonly co-occurring stocks with {ticker} are: {top_tickers}")

Top stocks most commonly cooccurring:


['C', 'BAC', 'STI', 'WFC', 'PNC', 'HBAN', 'ZION', 'STT', 'KEY', 'CMA']

In [3]:
from models.embedding_models import ClassificationEmbeddings
# Size of each embedding vector handed to ClassificationEmbeddings.
EMBEDDING_DIM = 20

# One embedding per ticker known to the ReturnsData instance.
n_tickers = len(data.tickers)
model = ClassificationEmbeddings(n_time_series=n_tickers, embedding_dim=EMBEDDING_DIM)

In [4]:
from utils.training_helpers import train_embeddings_from_tgt_context_sets
# Requested number of epochs; the logged run for this cell stopped early at
# epoch 7, so this is not necessarily the number of epochs actually executed.
EPOCHS = 20

# Train the embeddings on the pooled target/context sets. The helper hands
# back a (model, losses) pair, and `model` is rebound to the returned object.
model, losses = train_embeddings_from_tgt_context_sets(
    model=model,
    tgt_context_sets=tgt_context_sets,
    n_time_series=len(data.tickers),
    epochs=EPOCHS,
    verbose=True,
)

Training embeddings...


  5%|▌         | 1/20 [00:45<14:33, 45.98s/it]

Epoch 0: Loss = 0.10381229708691216


 10%|█         | 2/20 [01:31<13:46, 45.94s/it]

Epoch 1: Loss = 0.10121270574125595


 15%|█▌        | 3/20 [02:22<13:37, 48.11s/it]

Epoch 2: Loss = 0.09957956481662815


 20%|██        | 4/20 [03:09<12:39, 47.44s/it]

Epoch 3: Loss = 0.09869415337357938


 25%|██▌       | 5/20 [03:55<11:45, 47.02s/it]

Epoch 4: Loss = 0.09817352312788913


 30%|███       | 6/20 [04:41<10:55, 46.79s/it]

Epoch 5: Loss = 0.09784624309921643


 35%|███▌      | 7/20 [05:28<10:07, 46.73s/it]

Epoch 6: Loss = 0.0976350455015062


 35%|███▌      | 7/20 [06:14<11:35, 53.53s/it]

Epoch 7: Loss = 0.09749177410088655
Early stopping at epoch 7 due to minimal loss reduction.





In [6]:
from utils.sector_classification import get_sector_score

# Export the learned embedding weights via .detach().numpy() and score them
# against the sector labels held by `data`.
embedding_matrix = model.embeddings.weight.detach().numpy()
get_sector_score(embedding_matrix, sectors=data.sectors)

Precision Score: 0.64
Recall Score: 0.61
F1 Score: 0.61
Accuracy Score: 0.61
Accuracy Score: 0.61


In [None]:
# Toggle for persisting the trained embeddings (not consumed within this cell).
SAVE_MODEL = False
SAVE_PATH_TEMPLATE = "embeddings/abs_diff_E{epochs}_C{context_size}_D{embedding_dim}_P{periods}_train{train_pct}.pt"

# Encode the run configuration in the file name; periods are dash-joined,
# e.g. [1, 5, 10] -> "1-5-10".
# NOTE(review): EPOCHS is the configured epoch count, not the number of epochs
# actually run when early stopping triggers — confirm this is the intended tag.
periods_tag = "-".join(str(p) for p in PERIODS)
save_path = SAVE_PATH_TEMPLATE.format(
    epochs=EPOCHS,
    context_size=CONTEXT_SIZE,
    embedding_dim=EMBEDDING_DIM,
    periods=periods_tag,
    train_pct=TRAIN_PCT,
)