In [15]:
from utils.returns_data_class import ReturnsData
from utils.window_context import get_target_context_sets, Euclidean, ICCBRMetric

# Constants and configurations
TRAIN_PCT = 1  # passed to data.train_test_split below
CONTEXT_SIZE = 32  # forwarded to get_target_context_sets as context_size
GRANULARITY = 1 # 1=daily, 5=weekly, etc.
WINDOW_LENGTHS = [20]  # one get_target_context_sets pass is run per entry
STRIDE = None  # forwarded unchanged as the `stride` argument

# Load the returns data once (the CSVs were previously read twice by two
# identical ReturnsData(...) constructions).
data = ReturnsData(
    daily_returns_path="Data/returns_df_611.csv",
    extras_path="Data/historical_stocks.csv",
)
data.change_returns_period(GRANULARITY)
data.train_test_split(TRAIN_PCT)

# NOTE(review): the full `data.returns_df` is used below, not a train-only
# frame. That is presumably equivalent while TRAIN_PCT == 1 — confirm before
# lowering TRAIN_PCT.
tgt_context_sets = []
for window_length in WINDOW_LENGTHS:
    print(f"Running for period {window_length}")

    # Accumulate the sets produced for this window length onto the shared list.
    tgt_context_sets += get_target_context_sets(
        X=data.returns_df.values.T,
        metric_class=ICCBRMetric(),
        window_length=window_length,
        stride=STRIDE,
        # Use the configured constant instead of a duplicated literal so that
        # editing CONTEXT_SIZE actually changes the run.
        context_size=CONTEXT_SIZE,
        verbose=True,
    )
    print("="*20)
print(f"Total number of target context sets: {len(tgt_context_sets)}")

No change made because period entered is 1
Running for period 20


100%|██████████| 234/234 [16:27<00:00,  4.22s/it]

Total number of target context sets: 142974





In [16]:
import pandas as pd
import numpy as np

ticker = "JPM"
target_idx = data.ticker2idx[ticker]

# Each entry is indexed as entry[0] / entry[1] — presumably (target index,
# context indices); keep entry[1] wherever entry[0] matches this ticker.
matching_contexts = [
    entry[1] for entry in tgt_context_sets if entry[0] == target_idx
]
context_indices = np.array(matching_contexts).flatten()

# value_counts() sorts by descending frequency by default, so the leading
# index entries are the most frequent context indices.
ranked_indices = pd.Series(context_indices).value_counts().index
top_tickers = [data.idx2ticker[idx] for idx in ranked_indices][:5]
print(
    f"The most commonly co-occurring stocks with {ticker} are: {top_tickers}"
)

The most commonly co-occurring stocks with JPM are: ['C', 'BAC', 'WFC', 'USB', 'PNC']


In [17]:
from models.embedding_models import ClassificationEmbeddings

# Width of the embedding passed to the model as `embedding_dim`.
EMBEDDING_DIM = 20

# Size the model from the number of tickers held by `data`.
model = ClassificationEmbeddings(
    embedding_dim=EMBEDDING_DIM,
    n_time_series=len(data.tickers),
)

In [20]:
from utils.training_helpers import train_embeddings_from_tgt_context_sets


# Passed to the training helper as `epochs`.
EPOCHS = 10

# NOTE(review): `model` is both an input and a result of this call, so
# re-running the cell passes in whatever `model` currently holds — presumably
# the already-trained one. Confirm that is intended, or rebuild the model first.
model, losses = train_embeddings_from_tgt_context_sets(
    model=model,
    tgt_context_sets=tgt_context_sets,
    n_time_series=len(data.tickers),
    epochs=EPOCHS,
    batch_size=64,
    device="cpu",
    early_stopping=False,
    verbose=True,
)

Training embeddings...


 10%|█         | 1/10 [00:03<00:31,  3.52s/it]

Epoch 0: Loss = 0.1053209229102596


 20%|██        | 2/10 [00:06<00:27,  3.39s/it]

Epoch 1: Loss = 0.10465304311273133


 30%|███       | 3/10 [00:10<00:23,  3.41s/it]

Epoch 2: Loss = 0.10399851166262415


 40%|████      | 4/10 [00:13<00:20,  3.40s/it]

Epoch 3: Loss = 0.10333443982914538


 50%|█████     | 5/10 [00:17<00:17,  3.43s/it]

Epoch 4: Loss = 0.10264748797000815


 60%|██████    | 6/10 [00:20<00:13,  3.37s/it]

Epoch 5: Loss = 0.10193282356790187


 70%|███████   | 7/10 [00:23<00:10,  3.35s/it]

Epoch 6: Loss = 0.10118863349918074


 80%|████████  | 8/10 [00:26<00:06,  3.31s/it]

Epoch 7: Loss = 0.1004202896391283


 90%|█████████ | 9/10 [00:30<00:03,  3.30s/it]

Epoch 8: Loss = 0.09963655647305805


100%|██████████| 10/10 [00:33<00:00,  3.34s/it]

Epoch 9: Loss = 0.09884889987729026





In [22]:
from utils.sector_classification import get_sector_score

# Evaluate the learned embedding weights against the sector labels in `data`
# (the recorded output shows precision / recall / F1 / accuracy).
# .detach() drops autograd tracking; .cpu() keeps the NumPy conversion working
# if the model ends up on a non-CPU device, and returns the same tensor when
# the weights are already on the CPU.
embedding_weights = model.embeddings.weight.detach().cpu().numpy()
get_sector_score(embedding_weights, sectors=data.sectors)

  _warn_prf(average, modifier, msg_start, len(result))


Precision Score: 0.27
Recall Score: 0.28
F1 Score: 0.27
Accuracy Score: 0.28


In [None]:
# Flag for persisting the trained model; the code that reads it is not visible here.
SAVE_MODEL = False

# Output path pattern with named placeholders (epochs, context_size,
# embedding_dim, periods, train_pct) — presumably filled via str.format.
# Adjacent literals are concatenated at compile time, so the value is one string.
SAVE_PATH_TEMPLATE = (
    "embeddings/abs_diff_E{epochs}_C{context_size}"
    "_D{embedding_dim}_P{periods}_train{train_pct}.pt"
)