### Triplet Contrastive Loss
Reference: FaceNet (Schroff et al., 2015), https://arxiv.org/pdf/1503.03832.pdf

A simple triplet margin loss implementation, used as a baseline.
- Each anchor is paired with a single positive and a single negative.
- The positive is the sample most similar to the anchor.
- The negative is the sample least similar to the anchor.

In [1]:
# Enable autoreload so that edits to imported modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
import sys
# Put the parent directory on the import path
# (presumably where the `utils` package imported below lives; verify).
sys.path.append('../')

In [2]:
from utils.returns_data_class import ReturnsData
# Period handed to change_returns_period below
# (presumably the return horizon in trading days; TODO confirm).
PERIOD = 20
# Build the data object from the returns CSV and the extra stock-information CSV.
data = ReturnsData(
    daily_returns_path="../Data/returns_df_611.csv",
    extras_path="../Data/historical_stocks.csv",
)
data.change_returns_period(PERIOD)
# Transposed returns values; X.shape[0] is used later as the number of time series (num_TS).
X = data.returns_df.values.T

In [3]:
# Show the array dimensions; the first axis is used below as the number of time series.
X.shape

(611, 234)

In [4]:
# Sampling configuration for the (anchor, positive, negative) triplets.
num_TS = X.shape[0]
num_pos_samples = 1
num_neg_samples = 1
period = 10
stride = 3

print(f"Context Size: {num_pos_samples}, Period: {period}, Stride: {stride}")
print(f"Number of Negative Samples: {num_neg_samples}")

from utils.context import get_tgt_context_euclidean_multiprocess

# Arguments shared by the positive and the negative context search.
shared_context_kwargs = dict(
    ts_array=X,
    m=period,
    stride=stride,
    z_normalize=False,
    verbose=False,
)

# Positives use the helper's default selection; negatives pass top_k=False
# (presumably selecting the least similar candidates instead; confirm in utils.context).
positive_tgt_context_sets = get_tgt_context_euclidean_multiprocess(
    k=num_pos_samples,
    **shared_context_kwargs,
)
negative_tgt_context_sets = get_tgt_context_euclidean_multiprocess(
    k=num_neg_samples,
    top_k=False,
    **shared_context_kwargs,
)
print(f"Number (anchor, positive, negative) samples: {len(positive_tgt_context_sets)}")

Context Size: 1, Period: 10, Stride: 3
Number of Negative Samples: 1
Number (anchor, positive, negative) samples: 45825


In [5]:
# Pair the i-th positive set with the i-th negative set and flatten each pair
# into an (anchor, positive, negative) index triplet. The anchor is taken from
# the positive set; this assumes both lists are ordered by the same anchors (TODO confirm).
index_samples = [
    (pos_set[0], pos_set[1][0], neg_set[1][0])
    for pos_set, neg_set in zip(positive_tgt_context_sets, negative_tgt_context_sets)
]

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Parameters
SEED = 42  # Fixes embedding initialisation and batch shuffling so reruns are comparable
num_items = num_TS  # Number of items in the dataset (one embedding row per item)
embedding_size = 16  # Size of each embedding
learning_rate = 0.001
epochs = 100
batch_size = 128

# Seed the global torch RNG before any randomness is consumed: nn.Embedding
# draws its initial weights from it and DataLoader(shuffle=True) uses it to
# order the batches.
torch.manual_seed(SEED)

# Initialize the embedding matrix
embeddings = nn.Embedding(num_embeddings=num_items, embedding_dim=embedding_size)

# Triplet margin loss (Euclidean distance, margin of 1)
triplet_loss = nn.TripletMarginLoss(margin=1.0, p=2)

# Optimizer
optimizer = optim.Adam(embeddings.parameters(), lr=learning_rate)

# index_samples is a list of (anchor, positive, negative) index tuples,
# e.g. [(35, 12, 98), (47, 12, 4), ...]. nn.Embedding requires integer
# indices, so the dtype is stated explicitly instead of being inferred.
triplet_indices = torch.tensor(index_samples, dtype=torch.long)

# DataLoader for batching
dataset = TensorDataset(triplet_indices)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Training loop with batching

def train(embeddings, loss_fn, optimizer, data_loader, epochs=10):
    """Optimise an embedding table on (anchor, positive, negative) index triplets.

    Progress is printed for the first epoch, every 10th epoch after it, and
    the final epoch; the reported value is the mean loss over the batches of
    that epoch.

    Args:
        embeddings: Module called with an index tensor of shape
            (batch_size, 3) whose columns are (anchor, positive, negative);
            its output is indexed as (batch_size, 3, embedding_dim).
        loss_fn: Callable taking the anchor, positive and negative embeddings
            and returning a scalar loss tensor.
        optimizer: Optimizer that updates the parameters of ``embeddings``.
        data_loader: Sized iterable of batches; the first element of each
            batch is the index tensor described above.
        epochs: Number of passes over ``data_loader``.

    Raises:
        ValueError: If ``data_loader`` contains no batches (previously this
            surfaced as a ZeroDivisionError when the first loss was printed).
    """
    # Hoisted out of the loop: the batch count does not change between epochs.
    num_batches = len(data_loader)
    if num_batches == 0:
        raise ValueError("data_loader is empty: there are no triplets to train on")

    for epoch in range(epochs):
        total_loss = 0.0

        for batch in data_loader:
            # Get the embeddings for anchor, positive, and negative
            batch_embeddings = embeddings(batch[0])  # shape: (batch_size, 3, embedding_dim)
            anchor_embedding = batch_embeddings[:, 0, :]
            positive_embedding = batch_embeddings[:, 1, :]
            negative_embedding = batch_embeddings[:, 2, :]

            # Compute the loss
            loss = loss_fn(anchor_embedding, positive_embedding, negative_embedding)
            total_loss += loss.item()

            # Backward pass and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Logical `or` (not bitwise `|`) for combining the two boolean conditions.
        if epoch % 10 == 0 or epoch == epochs - 1:
            print(f'Epoch [{epoch+1}/{epochs}], Loss: {total_loss / num_batches}')

# index_samples holds (anchor, positive, negative) index tuples,
# e.g. [(35, 12, 98), (47, 12, 4), ...]; they reach train() through data_loader.

# Optimise the embedding table in place. train() prints the mean batch loss
# for the first epoch, every 10th epoch after it, and the final epoch.
train(embeddings, triplet_loss, optimizer, data_loader, epochs)


Epoch [1/100], Loss: 0.9392370998361317
Epoch [11/100], Loss: 0.03783299200610364
Epoch [21/100], Loss: 0.01876918989179194
Epoch [31/100], Loss: 0.010635725401528698
Epoch [41/100], Loss: 0.006544097941524471
Epoch [51/100], Loss: 0.004282156716316191
Epoch [61/100], Loss: 0.0028779301948035994
Epoch [71/100], Loss: 0.005030844993578026
Epoch [81/100], Loss: 0.0013490058481693268
Epoch [91/100], Loss: 0.0009142640085688541
Epoch [100/100], Loss: 0.0006729115246896292


### Evaluate the learned embeddings

In [10]:
from utils.sector_classification import get_sector_score

# Score the learned embedding matrix against the sector labels. The weights are
# detached from autograd and converted to a NumPy array before being passed on.
# NOTE(review): top_k_accuracy=True presumably adds the "Top-3" accuracy line to the report; confirm in the helper.
get_sector_score(embeddings.weight.detach().numpy(), sectors=data.sectors, top_k_accuracy=True)

Precision Score: 0.13
Recall Score: 0.12
F1 Score: 0.12
Accuracy Score: 0.12
Accuracy Score Top-3: 0.4


  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
from utils.visualisation_functions import pca_plot_from_embeddings

# Plot the learned embeddings with the PCA method requested below, passing the
# sector, ticker, industry and name metadata held on `data` to the helper.
# NOTE(review): dimensions=2 presumably selects a 2-D projection; the exact
# meaning of `reduced` and `rand_state` is defined in utils.visualisation_functions (confirm there).
pca_plot_from_embeddings(
    embedding_matrix=embeddings.weight.detach().numpy(),
    sectors=data.sectors,
    tickers=data.tickers,
    industries=data.industries,
    names=data.names,
    dimensions=2,
    reduced=True,
    method="PCA",
    return_df=False,
    rand_state=None,
)