### Matrix Factorization of Correlation Matrix

A simple baseline that learns an embedding for each time series by factorizing the correlation matrix of the returns.

In [8]:
import sys

# Put the parent directory on the import path (the `utils.*` and `models.*`
# imports in later cells presumably resolve from there -- confirm).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate '../' entry each time.
if '../' not in sys.path:
    sys.path.append('../')

In [19]:
from utils.returns_data_class import ReturnsData

# Input files, given relative to the notebook's working directory.
returns_csv = "../Data/returns_df_611.csv"
extras_csv = "../Data/historical_stocks.csv"

# Build the project's returns container from the two files.
data = ReturnsData(daily_returns_path=returns_csv, extras_path=extras_csv)

# Switch the container to a 22-step returns period
# (presumably ~one trading month of daily data -- TODO confirm).
data.change_returns_period(period=22)

# Transposed returns table as a plain array; presumably one row per
# time series after the transpose -- verify against ReturnsData.
X = data.returns_df.T.values

In [20]:
from models.embedding_models import MatrixFactorization

# Similarity target for the factorization: the correlation matrix that the
# project helper derives from X, requested with scaled=False.
correlations = MatrixFactorization.get_correlation_matrix(scaled=False, X=X)

# Disabled alternatives, kept for reference.
# Building the matrix directly from the returns table instead:
#   correlations = torch.tensor(data.returns_df.corr().values)
# Optional masking of the matrix before training:
#   correlations = MatrixFactorization.apply_IQR_mask(correlations, percentiles=(0.25,0.75))
#   correlations = MatrixFactorization.apply_random_mask(correlations, percentage=0.9)

In [21]:
# Tunable settings for this factorization run, gathered in one place.
mf_hyperparameters = dict(
    embedding_dim=20,
    learning_rate=0.2,
    epochs=300,
    regularization_loss_weight=0.1,
    pairwise_loss_weight=0.01,
)

# Fit one embedding per ticker against the correlation matrix. The call hands
# back the model plus two further values bound to `losses` and
# `learning_rates` (presumably per-epoch histories -- confirm in train_MF_model).
model, losses, learning_rates = MatrixFactorization.train_MF_model(
    n_time_series=len(data.tickers),
    similarity_matrix=correlations,
    verbose=True,
    **mf_hyperparameters,
)

100%|██████████| 300/300 [00:02<00:00, 131.58it/s]


In [22]:

# Visualise the training run from the `losses` and `learning_rates` values
# returned by train_MF_model. verbose=True is presumably what makes the helper
# print the final pairwise loss shown in this cell's output -- confirm.
MatrixFactorization.plot_embedding_training(losses, learning_rates, verbose=True)

Final pairwise_loss: 91.32216292233423


In [23]:
from utils.sector_classification import get_sector_score

# Export the learned embedding table as a NumPy array.
# `.cpu()` leaves a tensor that is already on the CPU unchanged, but keeps this
# cell working if the model's weights live on an accelerator, where calling
# `.numpy()` directly would raise.
sector_eval_embeddings = model.embeddings.weight.detach().cpu().numpy()

# Score the embeddings against the sector labels. top_k_accuracy=True
# presumably adds the "Top-3" accuracy line seen in the output -- confirm.
get_sector_score(sector_eval_embeddings, sectors=data.sectors, top_k_accuracy=True)

Precision Score: 0.61
Recall Score: 0.59
F1 Score: 0.59
Accuracy Score: 0.59
Accuracy Score Top-3: 0.81


In [24]:
from utils.visualisation_functions import pca_plot_from_embeddings

# Plot the learned embeddings with the project helper, passing the per-stock
# metadata (sectors, tickers, industries, names) alongside the matrix.
# `.cpu()` is inserted before `.numpy()` so the export also works when the
# model's weights are not on the CPU; for a CPU tensor it changes nothing.
pca_plot_from_embeddings(
    embedding_matrix=model.embeddings.weight.detach().cpu().numpy(),
    sectors=data.sectors,
    tickers=data.tickers,
    industries=data.industries,
    names=data.names,
    dimensions=2,
    reduced=True,
    method="PCA",
    return_df=False,
    # NOTE(review): no fixed random state is passed; if the helper's
    # reduction is stochastic the plot may differ between runs -- confirm.
    rand_state=None,
)

In [25]:
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from utils.ucr_helpers import evaluate_resampling_UCR

# NOTE(review): despite its name, `embeddings` currently holds the transposed
# raw daily returns, not the learned factorization embeddings. The learned
# ones would be:
#   embeddings = model.embeddings.weight.detach().numpy()
# Confirm which feature set this evaluation is meant to report.
embeddings = data.daily_returns_df.values.T

# 70/30 split by position.
# NOTE(review): the split is not shuffled, so it follows the order of
# `data.sectors`; whether evaluate_resampling_UCR re-draws the split on each
# resample should be verified.
train_size = int(len(data.sectors)*0.7)
X_train, X_test = embeddings[:train_size, :], embeddings[train_size:, :]
y_train, y_test = data.sectors[:train_size], data.sectors[train_size:]

# Classifiers tried here; only the 5-nearest-neighbour model is active:
#   KNeighborsClassifier(n_neighbors=1, metric='euclidean')
#   SVC(kernel="rbf")
#   LogisticRegression()
#   MLPClassifier()
report, _, _ = evaluate_resampling_UCR(
    X_train, X_test, y_train, y_test,
    classifier=KNeighborsClassifier(n_neighbors=5),
    n_resamples=20,
    verbose=True,
    scale=False,
    over_sampling=True,
)

# Display the first of the three returned values as the cell output.
report

100%|██████████| 19/19 [00:02<00:00,  6.42it/s]


{'precision': 0.66923766510648,
 'recall': 0.6035326086956521,
 'f1-score': 0.5982152359205826,
 'accuracy': 0.6035326086956521}