### Matrix Factorization of Correlation Matrix

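A simple baseline that builds stock embeddings by factorizing the correlation matrix of daily returns (SVD by default; PCA, NMF and t-SNE are also available).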

In [1]:
import sys

# Make the parent directory importable so the `utils` and `models` packages
# used by later cells resolve. The membership guard keeps the cell idempotent:
# re-running it no longer appends a duplicate entry on every execution.
if '../' not in sys.path:
    sys.path.append('../')

In [2]:
from utils.returns_data_class import ReturnsData

# Build the shared data container from the two CSV files. Later cells read
# `daily_returns_df`, `sectors`, `tickers`, `industries` and `names` from it.
data = ReturnsData(
    daily_returns_path="../Data/returns_df_611.csv",
    extras_path="../Data/historical_stocks.csv",
)

In [3]:
from models.embedding_models import MatrixFactorization

# Transposed returns array handed to the correlation helper
# (presumably one row per stock after the transpose -- TODO confirm).
returns_matrix = data.daily_returns_df.T.values
correlations = MatrixFactorization.get_correlation_matrix(
    X=returns_matrix,
    scaled=False,
)

# Alternative constructions / maskings kept for reference (disabled):
# correlations = torch.tensor(data.returns_df.corr().values)
# correlations = MatrixFactorization.apply_IQR_mask(correlations, percentiles=(0.25,0.75))
# correlations = MatrixFactorization.apply_random_mask(correlations, percentage=0.9)

In [5]:
import numpy as np
from sklearn.decomposition import PCA, NMF
from sklearn.manifold import TSNE
def get_baseline_embeddings(similarity_matrix, num_dimensions, method='SVD', random_state=0):
    """
    Generate embeddings for each entity from a square symmetric similarity matrix using various methods.

    :param similarity_matrix: A square symmetric matrix where element (i, j) is the similarity between
        entity i and entity j. Anything `np.asarray` can convert is accepted.
    :param num_dimensions: The number of dimensions for the embeddings. Must be >= 1 and, for 'SVD',
        <= min(matrix.shape).
    :param method: The matrix factorization method to use ('SVD', 'PCA', 'NMF', 't-SNE').
    :param random_state: Seed forwarded to the scikit-learn estimators (PCA, NMF, t-SNE) so repeated
        runs give the same embeddings. Ignored by 'SVD'.
    :return: A numpy array of shape (num_entities, num_dimensions) representing the embeddings.
    :raises ValueError: If the input is not 2-D, `num_dimensions` is out of range, or `method` is unknown.
    """
    matrix = np.asarray(similarity_matrix)
    if matrix.ndim != 2:
        raise ValueError(f"similarity_matrix must be 2-D, got {matrix.ndim} dimension(s).")
    if num_dimensions < 1:
        raise ValueError(f"num_dimensions must be >= 1, got {num_dimensions}.")

    if method == 'SVD':
        # Slicing past the available singular vectors would silently return
        # fewer columns than requested, so reject that up front.
        if num_dimensions > min(matrix.shape):
            raise ValueError(
                f"num_dimensions ({num_dimensions}) must be <= min(matrix.shape) ({min(matrix.shape)})."
            )
        # Reduced SVD is enough: only the leading singular vectors are used.
        U, Sigma, _ = np.linalg.svd(matrix, full_matrices=False)
        # Scale each retained left singular vector by its singular value;
        # broadcasting avoids building a dense diagonal matrix.
        embeddings = U[:, :num_dimensions] * Sigma[:num_dimensions]

    elif method == 'PCA':
        pca = PCA(n_components=num_dimensions, random_state=random_state)
        embeddings = pca.fit_transform(matrix)

    elif method == 'NMF':
        model = NMF(n_components=num_dimensions, init='random', random_state=random_state)
        embeddings = model.fit_transform(matrix)

    elif method == 't-SNE':
        # Seeded so the random initialisation is reproducible across runs.
        tsne = TSNE(
            n_components=num_dimensions,
            learning_rate='auto',
            init='random',
            random_state=random_state,
        )
        embeddings = tsne.fit_transform(matrix)

    else:
        raise ValueError("Invalid method. Choose from 'SVD', 'PCA', 'NMF', 't-SNE'.")

    return embeddings

# Number of embedding dimensions kept from the factorization.
EMBEDDING_DIM = 20

# `fill_diagonal_` writes in place, so set the diagonal on a copy; this leaves
# the `correlations` object from the earlier cell untouched.
# NOTE(review): assumes `correlations` is a torch tensor (so `.clone()` exists) -- confirm.
similarity = correlations.clone().fill_diagonal_(1)
embeddings = get_baseline_embeddings(similarity, EMBEDDING_DIM, "SVD")

from utils.sector_classification import get_sector_score

get_sector_score(embeddings, sectors=data.sectors, top_k_accuracy=True)

Precision Score: 0.63
Recall Score: 0.52
F1 Score: 0.52
Accuracy Score: 0.52
Accuracy Score Top-3: 0.82


In [6]:
from utils.visualisation_functions import pca_plot_from_embeddings

# Arguments for the plotting helper: the embeddings plus the per-stock
# metadata from `data`, projected to two dimensions with PCA.
plot_kwargs = dict(
    embedding_matrix=embeddings,
    sectors=data.sectors,
    tickers=data.tickers,
    industries=data.industries,
    names=data.names,
    dimensions=2,
    reduced=True,
    method="PCA",
    return_df=False,
    rand_state=None,
)

pca_plot_from_embeddings(**plot_kwargs)

In [7]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from utils.ucr_helpers import evaluate_resampling_UCR

# Ordered (unshuffled) split: the first 70% of entries are used for training,
# the remainder for testing.
train_size = int(0.7 * len(data.sectors))
X_train, X_test = embeddings[:train_size, :], embeddings[train_size:, :]
y_train, y_test = data.sectors[:train_size], data.sectors[train_size:]

# Classifier under evaluation. Alternatives that can be swapped in:
#   SVC(kernel="rbf"), LogisticRegression(), MLPClassifier()
classifier = KNeighborsClassifier(n_neighbors=1)

# Only the first element of the returned triple (the report) is kept.
report, _, _ = evaluate_resampling_UCR(
    X_train,
    X_test,
    y_train,
    y_test,
    classifier=classifier,
    n_resamples=20,
    verbose=True,
    scale=False,
    over_sampling=True,
)
report

100%|██████████| 19/19 [00:00<00:00, 39.94it/s]


{'precision': 0.6515704621579725,
 'recall': 0.6282608695652174,
 'f1-score': 0.6311999105397661,
 'accuracy': 0.6282608695652174}