### This Notebook Contains the Code for Learning Stock Embeddings
This approach was described in:
- The original preprint: [Stock Embeddings: Learning Distributed Representations for Financial Assets](https://arxiv.org/pdf/2202.08968.pdf)
- The more recent conference publication, which can be found [here](https://www.mdpi.com/2673-4591/39/1/30/pdf?version=1695109017)

For any questions, reach out to me at dolphrian@gmail.com

In [1]:
from utils.returns_data_class import ReturnsData
from utils.pointwise_context import get_pointwise_tgt_context_sets

# Tunable settings for building the target/context training sets.
PERIODS = [1, 5, 10]  # each value is passed to ReturnsData.change_returns_period
TRAIN_PCT = 1  # passed to ReturnsData.train_test_split
CONTEXT_SIZE = 32  # passed as context_size to get_pointwise_tgt_context_sets

DAILY_RETURNS_PATH = "Data/returns_df_611.csv"
EXTRAS_PATH = "Data/historical_stocks.csv"

# The sets produced for every period are pooled into this single list.
tgt_context_sets = []
for horizon in PERIODS:
    print(f"Running for period {horizon}")

    # A fresh ReturnsData is loaded on every pass
    # (presumably because change_returns_period modifies the object — TODO confirm).
    data = ReturnsData(daily_returns_path=DAILY_RETURNS_PATH, extras_path=EXTRAS_PATH)
    data.change_returns_period(horizon)
    data.train_test_split(TRAIN_PCT)

    horizon_sets = get_pointwise_tgt_context_sets(
        data.train_returns_df,
        context_size=CONTEXT_SIZE,
        verbose=True,
        iqr_noise_reduction=True,
    )
    tgt_context_sets.extend(horizon_sets)
    print("=" * 20)

# NOTE: `data` stays bound to the object from the last period and is reused by later cells.
print(f"Total number of target context sets: {len(tgt_context_sets)}")

Running for period 1
No change made because period entered is 1


100%|██████████| 611/611 [00:05<00:00, 108.05it/s]


Running for period 5


100%|██████████| 611/611 [00:01<00:00, 449.64it/s] 


Running for period 10


100%|██████████| 611/611 [00:00<00:00, 1846.59it/s]

Total number of target context sets: 1860195





In [2]:
import pandas as pd
import numpy as np

# Sanity check: which stocks appear most often in the context sets of one target ticker?
ticker = "JPM"
target_idx = data.ticker2idx[ticker]

# Keep the context entry (element 1) of every set whose target (element 0) is this ticker,
# then flatten everything into one 1-D array of context indices.
matching_contexts = []
for pair in tgt_context_sets:
    if pair[0] == target_idx:
        matching_contexts.append(pair[1])
context_indices = np.array(matching_contexts).flatten()

# value_counts() orders by descending frequency, so the first five are the most common.
ranked_indices = pd.Series(context_indices).value_counts().index
top_co_occurring = [data.idx2ticker[idx] for idx in ranked_indices][:5]
print(f"The most commonly co-occurring stocks with {ticker} are: {top_co_occurring}")

The most commonly co-occurring stocks with JPM are: ['C', 'BAC', 'STI', 'WFC', 'PNC']


In [3]:
from models.embedding_models import ClassificationEmbeddings

import random

import numpy as np
import torch

# Fixed seed so a fresh "Restart & Run All" starts from the same RNG state.
SEED = 42
EMBEDDING_DIM = 20

# Seed the Python, NumPy and PyTorch global generators before the model is built.
# NOTE(review): assumes ClassificationEmbeddings and the training helper draw from these
# global generators (later cells call .detach().numpy() on the weights, which points to
# PyTorch) — confirm against the project modules.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# One embedding row per ticker known to `data`.
model = ClassificationEmbeddings(
    n_time_series=len(data.tickers), embedding_dim=EMBEDDING_DIM
)

In [4]:
from utils.classifier_training_helpers import train_embeddings_from_tgt_context_sets

# Number of training epochs requested from the helper.
EPOCHS = 10

# Train the model created above on the pooled target/context sets.
# The helper returns the model together with a `losses` object; `model` is rebound to
# the returned instance.
model, losses = train_embeddings_from_tgt_context_sets(
    model=model,
    tgt_context_sets=tgt_context_sets,
    n_time_series=len(data.tickers),
    epochs=EPOCHS,
    batch_size=64,
    device="cpu",
    early_stopping=False,
    verbose=True,
)

Training embeddings...


 10%|█         | 1/10 [00:47<07:06, 47.35s/it]

Epoch 0: Loss = 0.10349648851070103


 20%|██        | 2/10 [01:34<06:17, 47.14s/it]

Epoch 1: Loss = 0.10100494772452158


 30%|███       | 3/10 [02:22<05:34, 47.81s/it]

Epoch 2: Loss = 0.09943320442528768


 40%|████      | 4/10 [03:10<04:45, 47.65s/it]

Epoch 3: Loss = 0.09860378110160643


 50%|█████     | 5/10 [03:57<03:56, 47.35s/it]

Epoch 4: Loss = 0.098134168269875


 60%|██████    | 6/10 [04:43<03:07, 46.87s/it]

Epoch 5: Loss = 0.09783325901406574


 70%|███████   | 7/10 [05:32<02:22, 47.61s/it]

Epoch 6: Loss = 0.09762726479700054


 80%|████████  | 8/10 [06:21<01:36, 48.02s/it]

Epoch 7: Loss = 0.09748321516561402


 90%|█████████ | 9/10 [07:08<00:47, 47.70s/it]

Epoch 8: Loss = 0.09738020642517808


100%|██████████| 10/10 [07:55<00:00, 47.54s/it]

Epoch 9: Loss = 0.09730453273448397





In [7]:
from utils.sector_classification import get_sector_score

# Score the learned embeddings against the sector labels held by `data`.
# `.cpu()` is a no-op for tensors already on the CPU and keeps this cell working if the
# training device is switched to a GPU (Tensor.numpy() only accepts CPU tensors).
embedding_matrix = model.embeddings.weight.detach().cpu().numpy()
get_sector_score(embedding_matrix, sectors=data.sectors, top_k_accuracy=True)

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



Precision Score: 0.65
Recall Score: 0.62
F1 Score: 0.63
Accuracy Score: 0.62
Accuracy Score Top-3: 0.84


  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
from utils.visualisation_functions import pca_plot_from_embeddings

# Plot a 2-component reduction of the embedding table, annotated with the
# per-ticker metadata (sector, ticker, industry, name) held by `data`.
pca_plot_from_embeddings(
    # `.cpu()` is a no-op on CPU tensors and avoids a failure in Tensor.numpy()
    # if the model weights live on a GPU.
    embedding_matrix=model.embeddings.weight.detach().cpu().numpy(),
    sectors=data.sectors,
    tickers=data.tickers,
    industries=data.industries,
    names=data.names,
    dimensions=2,
    reduced=True,
    method="PCA",
    return_df=False,
    rand_state=None,
)

In [3]:
# Set to True to write the trained model to `save_path` when this cell runs.
SAVE_MODEL = False

# The file name encodes the run configuration so different runs do not overwrite each other.
SAVE_PATH_TEMPLATE = "abs_diff_E{epochs}_C{context_size}_D{embedding_dim}_P{periods}_train{train_pct}.pt"
save_path = SAVE_PATH_TEMPLATE.format(
    epochs=EPOCHS,
    context_size=CONTEXT_SIZE,
    embedding_dim=EMBEDDING_DIM,
    periods="-".join(map(str, PERIODS)),  # e.g. [1, 5, 10] -> "1-5-10"
    train_pct=TRAIN_PCT,
)

# Honour the flag: without this guard SAVE_MODEL and save_path are never used.
if SAVE_MODEL:
    model.save_model(save_path)

# Example usage of saving and loading model
# model.save_model("model_test.pt")
# model = ClassificationEmbeddings.load_model(path="model_test.pt")