### Matrix Factorization of Correlation Matrix

Simple baseline factorizing the correlation matrix.

In [1]:
from utils.returns_data_class import ReturnsData
# Build the dataset wrapper from the two CSV files: one passed as the daily
# returns path, the other as the "extras" path. Later cells read
# `data.returns_df`, `data.tickers`, `data.sectors`, `data.industries` and
# `data.names` from this object.
data = ReturnsData(
    daily_returns_path="Data/returns_df_611.csv",
    extras_path="Data/historical_stocks.csv",
)

In [2]:
import torch
# Pairwise correlation between the columns of the returns frame, converted to
# a torch tensor so it can serve as the regression target during training.
# NOTE(review): presumably one column per ticker, giving an
# (n_tickers, n_tickers) matrix -- confirm against ReturnsData.
correlations = torch.tensor(data.returns_df.corr().to_numpy())

In [3]:
from models.embedding_models import MatrixFactorization
# One embedding per ticker, 20 dimensions each, with the model's `normalize`
# option switched on. Note that the training cell further down constructs the
# model again with the same arguments, so this instance is replaced there.
model = MatrixFactorization(
    n_time_series=len(data.tickers),
    embedding_dim=20,
    normalize=True,
)

In [4]:
import torch
from tqdm import tqdm


# --- Model, optimiser and learning-rate schedule ---------------------------
# A fresh model is built here so that re-running this cell restarts training
# from newly initialised embeddings.
model = MatrixFactorization(
    n_time_series=len(data.tickers),
    embedding_dim=20,
    normalize=True,
)
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
# StepLR multiplies the learning rate by 0.9 after every 10 scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.9)

# --- Loss weighting ---------------------------------------------------------
pairwise_loss_weight = 0.01
regularization_loss_weight = 0.1

# --- Per-epoch history (read by the plotting cell) --------------------------
losses = []          # (total, pairwise, regularization) as Python floats
learning_rates = []  # optimiser LR as read right after scheduler.step()

for epoch in tqdm(range(100)):
    optimizer.zero_grad()

    # Forward pass; the result is handed to calculate_loss together with the
    # target correlations and an L1 loss function.
    pairwise_embedding_similarities = model()
    pairwise_loss = pairwise_loss_weight * model.calculate_loss(
        correlations,
        pairwise_embedding_similarities,
        loss_function=torch.nn.functional.l1_loss,
    )

    # Penalise every embedding row by how far its norm is from 1, summed over
    # all rows.
    row_norms = torch.linalg.norm(model.embeddings.weight, dim=1)
    regularization_loss = regularization_loss_weight * torch.abs(row_norms - 1).sum()

    loss = pairwise_loss + regularization_loss
    loss.backward()
    optimizer.step()
    scheduler.step()

    losses.append((loss.item(), pairwise_loss.item(), regularization_loss.item()))
    # Read after scheduler.step(), i.e. the rate the next epoch will use.
    learning_rates.append(optimizer.param_groups[0]["lr"])


100%|██████████| 100/100 [00:00<00:00, 127.32it/s]


In [5]:
from utils.sector_classification import get_sector_score

# Score the learned embeddings against the known sector labels. The weights
# are detached from autograd and converted to a NumPy array before being
# handed to the scorer; top-k accuracy reporting is requested as well.
get_sector_score(
    model.embeddings.weight.detach().numpy(),
    sectors=data.sectors,
    top_k_accuracy=True,
)

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



Precision Score: 0.73
Recall Score: 0.68
F1 Score: 0.69
Accuracy Score: 0.68
Accuracy Score Top-3: 0.87


In [6]:
import plotly.graph_objects as go

# Split the per-epoch (total, pairwise, regularization) tuples collected in
# `losses` into one sequence per loss component.
total_losses, pairwise_losses, regularization_losses = zip(*losses)

fig = go.Figure()

# One line per loss component on the primary (left) y-axis. The "Total Loss"
# trace has to plot `total_losses`; feeding it the pairwise values would only
# draw the pairwise curve twice and hide the actual total.
for trace_name, values in (
    ("Total Loss", total_losses),
    ("Pairwise Loss", pairwise_losses),
    ("Regularization Loss", regularization_losses),
):
    fig.add_trace(
        go.Scatter(
            x=list(range(len(values))),
            y=values,
            mode="lines",
            name=trace_name,
        )
    )

# The learning rate lives on a different scale than the losses, so it is drawn
# against a secondary y-axis ("y2") on the right-hand side.
fig.add_trace(
    go.Scatter(
        x=list(range(len(learning_rates))),
        y=learning_rates,
        mode="lines",
        name="Learning rate",
        yaxis="y2",
    )
)

# Single layout call: the left axis carries all three loss curves and is
# titled "Loss"; the right axis overlays it and carries the learning rate.
fig.update_layout(
    title="Losses During Training",
    xaxis_title="Epoch",
    yaxis=dict(title="Loss"),
    yaxis2=dict(title="Learning Rate", overlaying="y", side="right"),
    template="plotly_white",
)

# Show the figure
fig.show()


In [7]:
from utils.visualisation_functions import pca_plot_from_embeddings

# Plot the learned embeddings with the project's plotting helper, requesting
# a 2-dimensional "PCA" view. The embedding weights are detached from
# autograd and passed as a NumPy array; the remaining arguments forward the
# per-ticker metadata held by `data`.
pca_plot_from_embeddings(
    # what to plot
    embedding_matrix=model.embeddings.weight.detach().numpy(),
    # per-ticker metadata
    tickers=data.tickers,
    names=data.names,
    sectors=data.sectors,
    industries=data.industries,
    # projection settings
    method="PCA",
    dimensions=2,
    reduced=True,
    rand_state=None,
    # display only; do not hand back the underlying frame
    return_df=False,
)