### XXXXXX

In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('../')

In [2]:
from utils.returns_data_class import ReturnsData
PERIOD = 1
data = ReturnsData(
    daily_returns_path="../Data/returns_df_611.csv",
    extras_path="../Data/historical_stocks.csv",
)
data.change_returns_period(PERIOD)
X = data.returns_df.values.T

No change made because period entered is 1


In [3]:
num_TS = X.shape[0]
num_pos_samples = 10
period = 10
stride = 3
num_neg_samples = 30

print(f"Context Size: {num_pos_samples}, Period: {period}, Stride: {stride}")
print(f"Number of Negative Samples: {num_neg_samples}")

from utils.context import get_tgt_context_euclidean_multiprocess
positive_tgt_context_sets = get_tgt_context_euclidean_multiprocess(ts_array=X, m=period, k=num_pos_samples, stride=stride, z_normalize=False, st_dev_pruning=False, verbose=True)
negative_tgt_context_sets = get_tgt_context_euclidean_multiprocess(ts_array=X, m=period, k=num_neg_samples, stride=stride, z_normalize=False, st_dev_pruning=False, top_k=False, verbose=True)
print(f"Number (anchor, positive, negative) samples: {len(positive_tgt_context_sets)}")

Context Size: 10, Period: 10, Stride: 3
Number of Negative Samples: 30


100%|██████████| 87/87 [00:07<00:00, 11.59it/s]
100%|██████████| 87/87 [00:07<00:00, 11.44it/s]
100%|██████████| 87/87 [00:07<00:00, 11.61it/s]
100%|██████████| 87/87 [00:07<00:00, 11.68it/s]
100%|██████████| 87/87 [00:07<00:00, 11.41it/s]
100%|██████████| 89/89 [00:07<00:00, 11.69it/s]
100%|██████████| 87/87 [00:08<00:00, 10.84it/s]


nearly returning
Number (anchor, positive, negative) samples: 953771


In [3]:
import pickle
with open('samples_pairwise.pkl', 'rb') as handle:
    positive_tgt_context_sets = pickle.load(handle)

In [4]:
len(positive_tgt_context_sets)

1860195

In [5]:
import pandas as pd
import numpy as np
ticker = "JPM"
i = data.ticker2idx[ticker]
temp = np.array([xi[1] for xi in positive_tgt_context_sets if xi[0] == i]).flatten()
print(
    f"The most commonly co-occurring stocks with {ticker} are: {[data.idx2ticker[xi] for xi in pd.Series(temp).value_counts().index][:5]}"
)
temp = pd.Series(temp).value_counts()
temp.index = temp.index.map(data.idx2ticker)
temp = temp/temp.sum()
temp.iloc[:5]

The most commonly co-occurring stocks with JPM are: ['C', 'BAC', 'STI', 'WFC', 'PNC']


C      0.004388
BAC    0.004376
STI    0.003696
WFC    0.003613
PNC    0.003565
Name: count, dtype: float64

Sample positive and negative pairs based on Chi-Squared p-value. If cooccurrence is more than random with high confidence then sample more.



In [6]:
from tqdm import tqdm

def get_cooccurrence_counts(tgt_context_sets, data:ReturnsData):
    distributions = {}
    for ticker in tqdm(data.tickers):
        i = data.ticker2idx[ticker]
        all_samples = np.array([xi[1] for xi in tgt_context_sets if xi[0] == i]).flatten()
        sample_count = pd.Series(all_samples).value_counts()
        sample_count.index = sample_count.index.map(data.idx2ticker)
        sample_count = sample_count.to_dict()
        # Add zero cooccurrences
        zero_cooccurrences = set(data.tickers)-set(sample_count.keys())-set([ticker])
        sample_count.update(dict.fromkeys(list(zero_cooccurrences), 0))

        distributions[ticker] = sample_count
    return distributions
positive_sample_distributions = get_cooccurrence_counts(positive_tgt_context_sets, data)

100%|██████████| 611/611 [16:53<00:00,  1.66s/it]


In [8]:
import pandas as pd
from scipy.stats import norm
from typing import Literal

def test_ticker_cooccurrence_significance(
    t1: str,
    t2: str,
    distributions: dict,
    test_direction: Literal["positive_samples", "negative_samples"] = "positive_samples",
    alpha: float | None = None,
    verbose: bool = False,
):
    # Convert to DataFrame
    df = pd.DataFrame(distributions).fillna(0)

    # Calculate total counts
    total_counts = df.sum().sum()

    # Observed count for stock ticker 1 cooccurring with stock ticker 2
    observed_count = df.loc[t2, t1]

    # Expected count under equal frequency assumption
    # (1/num_TS)*int((X.shape[1]-period)/stride) * num_pos_samples
    expected_count = df[t1].sum() / len(df)

    # Perform a test
    test_statistic = (observed_count - expected_count) / np.sqrt(
        expected_count * (1 - expected_count / total_counts)
    )
    # p_value = norm.sf(abs(z_score))  # two-tailed test
    if test_direction=="positive_samples":
        p_value = norm.sf(test_statistic)  # one-tailed test
    elif test_direction=="negative_samples":
        p_value = norm.cdf(test_statistic)

    if verbose:
        print(f"Observed Count: {observed_count}")
        print(f"Expected Count: {expected_count}")
        print(f"Test Statistic: {test_statistic}")
        print(f"P-value: {p_value}")
    if alpha is None:
        return p_value
    else:
        return p_value < alpha

test_ticker_cooccurrence_significance("JPM", "C", positive_sample_distributions, verbose=True, test_direction="positive_samples")

Observed Count: 368.0
Expected Count: 137.27004909983634
Test Statistic: 19.69321307609936
P-value: 1.2327194788625982e-86


1.2327194788625982e-86

In [9]:
samples = []

for ticker in tqdm(data.tickers):
    for t, c in positive_sample_distributions[ticker].items():
        p_value_positive = test_ticker_cooccurrence_significance(
            ticker,
            t,
            positive_sample_distributions,
            test_direction="positive_samples",
        )

        samples.append((ticker,t,c,p_value_positive))


100%|██████████| 611/611 [5:34:11<00:00, 32.82s/it]  


In [10]:
import pickle
with open('samples_pairwisemmm.pkl', 'wb') as f:
    pickle.dump(samples, f)

### Load samples

In [3]:
import pickle
with open('samples_pairwise.pkl', 'rb') as handle:
    samples = pickle.load(handle)

In [4]:
import pandas as pd
samples_df = pd.DataFrame(samples, columns=["query_ticker", "sample_ticker", "count", "p_value"])
samples_df["query_ticker_idx"] = samples_df["query_ticker"].map(data.ticker2idx)
samples_df["sample_ticker_idx"] = samples_df["sample_ticker"].map(data.ticker2idx)

In [5]:
from typing import Literal
def get_distribution(ticker, samples_df, sample_type:Literal["positive_samples", "negative_samples"]="positive_samples"):
    filtered_df = samples_df[samples_df["query_ticker"]==ticker].copy()
    if sample_type=="positive_samples":
        filtered_df = filtered_df[filtered_df["p_value"]<0.1].copy()
    elif sample_type=="negative_samples":
        filtered_df = filtered_df[filtered_df["p_value"]>0.5].copy()
    else:
        raise ValueError("Invalid sample_type")
    filtered_df["pos_prob"] = filtered_df["count"]/filtered_df["count"].sum()
    # -- Square to make high values more prominent
    filtered_df["pos_prob_sq"] = filtered_df["pos_prob"]**2
    filtered_df["pos_prob_sq"] = filtered_df["pos_prob_sq"]/filtered_df["pos_prob_sq"].sum()
    filtered_distribution = filtered_df[["query_ticker", "sample_ticker", "pos_prob_sq"]].values
    #-- Don't square and retain original dist
    # filtered_distribution = filtered_df[["query_ticker", "sample_ticker", "pos_prob"]].values
    return filtered_distribution

In [6]:
def sample_tgt_context_sets(filtered_distribution, n_samples, sample_size):
    # Unique first elements
    unique_first_elements = np.unique(filtered_distribution[:, 0])
    if len(unique_first_elements)>1:
        raise ValueError("Should only be a single distribution")
    first_elem = unique_first_elements[0]

    sampled_pairs = []

    # Filter distribution for the current first element
    current_distribution = filtered_distribution[filtered_distribution[:, 0] == first_elem]

    # Check if there are enough elements to sample
    if len(current_distribution) < sample_size:
        raise ValueError(f"Not enough elements to sample {sample_size} times for '{first_elem}'")

    # Extract second elements and probabilities
    second_elements = current_distribution[:, 1]
    probabilities = current_distribution[:, 2].astype(float)

    # Sample multiple times
    # samples_for_first_elem = []
    for _ in range(n_samples):
        # Sample without replacement
        sampled_indices = np.random.choice(len(probabilities), size=sample_size, replace=False, p=probabilities)
        sampled_second_elements = second_elements[sampled_indices]
        # samples_for_first_elem.append(list(sampled_second_elements))

        # Append to result
        sampled_pairs.append((first_elem, list(sampled_second_elements)))

    return sampled_pairs


In [7]:
from tqdm import tqdm
import numpy as np
num_neg_samples = 5
num_pos_samples = 32
num_sets = 1000
dist_shapes = []
positive_tgt_context_sets = []
negative_tgt_context_sets = []
for ticker in tqdm(data.tickers):
    temp_ticker_distribution_pos = get_distribution(ticker, samples_df, sample_type="positive_samples")
    temp_ticker_distribution_neg = get_distribution(ticker, samples_df, sample_type="negative_samples")
    dist_shapes.append((temp_ticker_distribution_pos.shape[0], temp_ticker_distribution_neg.shape[0]))
    temp_ticker_positive_sets = sample_tgt_context_sets(temp_ticker_distribution_pos, n_samples=num_sets, sample_size=num_pos_samples)
    temp_ticker_negative_sets = sample_tgt_context_sets(temp_ticker_distribution_neg, n_samples=num_sets, sample_size=num_neg_samples)
    positive_tgt_context_sets += temp_ticker_positive_sets
    negative_tgt_context_sets += temp_ticker_negative_sets

100%|██████████| 611/611 [00:45<00:00, 13.44it/s]


In [16]:
import plotly.express as px
px.histogram(np.array(dist_shapes)[:,1])

In [17]:
import plotly.express as px
fig = px.scatter(np.cumsum(temp_ticker_distribution_pos[:,2][::-1]))
fig.update_layout(template="plotly_dark")

In [66]:
import plotly.express as px
fig = px.scatter(temp)
fig.update_layout(template="plotly_dark")

In [8]:
index_samples = []
for pos, neg in zip(positive_tgt_context_sets,negative_tgt_context_sets):
    # index_samples.append((pos[0], pos[1], neg[1]))
    index_samples.append((data.ticker2idx[pos[0]], [data.ticker2idx[xi] for xi in pos[1]], [data.ticker2idx[xi] for xi in neg[1]]))

### Experimental Contrastive Loss

In [112]:
import torch





# def experimental_loss(
#     anchor_embeddings: torch.Tensor,
#     positive_embeddings: torch.Tensor,
#     negative_embeddings: torch.Tensor,
# ) -> torch.Tensor:
#     """

#     Args:
#         anchor_embeddings (torch.Tensor): shape (batch_size, embedding_dim)
#         positive_embeddings (torch.Tensor): (batch_size, num_pos_samples, embedding_dim)
#         negative_embeddings (torch.Tensor): (batch_size, num_neg_samples, embedding_dim)

#     Returns:
#         torch.Tensor: _description_
#     """
#     positive_scores = torch.einsum(
#             "bpd,bd->bp", [positive_embeddings, anchor_embeddings]
#         )
#     negative_scores = torch.einsum(
#             "bnd,bd->bn", [negative_embeddings, anchor_embeddings]
#         )
#     # positive_loss = - torch.sum(torch.nn.functional.logsigmoid(positive_scores), dim=1)
#     # negative_loss = - torch.sum(torch.log(1-torch.sigmoid(negative_scores)), dim=1)
#     criterion = torch.nn.BCEWithLogitsLoss()
#     positive_loss = criterion(positive_scores, torch.ones_like(positive_scores))
#     negative_loss = criterion(negative_scores, torch.zeros_like(negative_scores))
#     loss = torch.sum(positive_loss + negative_loss)

#     return loss

def experimental_loss(
    anchor_embeddings: torch.Tensor,
    positive_embeddings: torch.Tensor,
    negative_embeddings: torch.Tensor,
) -> torch.Tensor:
    """

    Args:
        anchor_embeddings (torch.Tensor): shape (batch_size, embedding_dim)
        positive_embeddings (torch.Tensor): (batch_size, num_pos_samples, embedding_dim)
        negative_embeddings (torch.Tensor): (batch_size, num_neg_samples, embedding_dim)

    Returns:
        torch.Tensor: _description_
    """
    positive_scores = torch.einsum(
            "bd,bd->b", [torch.mean(positive_embeddings, dim=1), anchor_embeddings]
        )
    negative_scores = torch.einsum(
            "bd,bd->b", [torch.mean(negative_embeddings, dim=1), anchor_embeddings]
        )
    # positive_loss = - torch.sum(torch.nn.functional.logsigmoid(positive_scores))
    # negative_loss = - torch.sum(torch.log(1-torch.sigmoid(negative_scores)+0.00001))
    criterion = torch.nn.BCEWithLogitsLoss()
    positive_loss = criterion(positive_scores, torch.ones_like(positive_scores))
    negative_loss = criterion(negative_scores, torch.zeros_like(negative_scores))

    loss = positive_loss + negative_loss

    return loss

def experimental_loss(
    anchor_embeddings: torch.Tensor,
    positive_embeddings: torch.Tensor,
    negative_embeddings: torch.Tensor,
    positive_weight: float=1,
) -> torch.Tensor:
    """

    Args:
        anchor_embeddings (torch.Tensor): shape (batch_size, embedding_dim)
        positive_embeddings (torch.Tensor): (batch_size, num_pos_samples, embedding_dim)
        negative_embeddings (torch.Tensor): (batch_size, num_neg_samples, embedding_dim)

    Returns:
        torch.Tensor: _description_
    """
    ### POSITIVE LOSS - Sigmoid over each positive
    positive_scores = torch.einsum(
            "bpd,bd->bp", [positive_embeddings, anchor_embeddings]
        )
    criterion = torch.nn.BCEWithLogitsLoss()
    positive_loss = criterion(positive_scores, torch.ones_like(positive_scores))

    ### NEGATIVE LOSS - Softmax with aggregate positive representation
    criterion = torch.nn.NLLLoss()
    aggregate_positive_embeddings = torch.mean(positive_embeddings, dim=1) # (batch_size, embedding_dim)
    aggregate_positive_scores = torch.einsum(
            "bd,bd->b", [aggregate_positive_embeddings, anchor_embeddings]
        )
    negative_scores = torch.einsum(
            "bnd,bd->bn", [negative_embeddings, anchor_embeddings]
        )
    #-- Make first column the aggregate positive scores
    #- concatenated.shape = (batch_size, 1+num_neg_samples)
    concatenated = torch.concat((aggregate_positive_scores.unsqueeze(1), negative_scores), dim=1)
    #-- Create target for NLLLoss
    # zeros indicate the first element of concatenated is the target class, which corresponds to the positive sample
    target = torch.zeros(concatenated.shape[0], dtype=torch.long)

    negative_loss = criterion(torch.nn.functional.log_softmax(concatenated, dim=1), target)
    loss = positive_weight * positive_loss + negative_loss

    return loss


def experimental_loss(
    anchor_embeddings: torch.Tensor,
    positive_embeddings: torch.Tensor,
    negative_embeddings: torch.Tensor,
) -> torch.Tensor:
    """

    Args:
        anchor_embeddings (torch.Tensor): shape (batch_size, embedding_dim)
        positive_embeddings (torch.Tensor): (batch_size, num_pos_samples, embedding_dim)
        negative_embeddings (torch.Tensor): (batch_size, num_neg_samples, embedding_dim)

    Returns:
        torch.Tensor: _description_
    """
    ### POSITIVE LOSS - done with aggregation
    positive_scores = torch.einsum(
            "bd,bd->b", [torch.mean(positive_embeddings, dim=1), anchor_embeddings]
        )

    criterion = torch.nn.BCEWithLogitsLoss()
    positive_loss = criterion(positive_scores, torch.ones_like(positive_scores))

    ### NEGATIVE LOSS - Sigmoid over each negative
    negative_scores = torch.einsum(
            "bnd,bd->bn", [negative_embeddings, anchor_embeddings]
        )
    criterion = torch.nn.BCEWithLogitsLoss()
    negative_loss = criterion(negative_scores, torch.zeros_like(negative_scores))
    loss = positive_loss + negative_loss

    return loss


### Training
- No regularization performs better
    - Why no regularization better here but not matrix factorization?
- Loss 1 better than loss 2

In [43]:
from models.contrastive import ContrastiveMultiPN
from utils.contrastive_helpers import IndividualSigmoidLoss, AggregateSigmoidLoss, IndPos_AggSoftmax, AggPos_IndNeg

In [44]:
model = ContrastiveMultiPN(n_time_series=611, embedding_dim=16, criterion=IndividualSigmoidLoss())
model.train(index_samples, batch_size=64, learning_rate=0.0001, epochs=3)

Epoch [1/3], Loss: 28070.0929
Epoch [2/3], Loss: 19902.6900
Epoch [3/3], Loss: 14552.0511


In [45]:
model.plot_training()

In [46]:
from utils.sector_classification import get_sector_score

from sklearn.neighbors import KNeighborsClassifier
get_sector_score(model.embeddings.weight.detach().numpy(), sectors=data.sectors, top_k_accuracy=True,
                #  classifier=KNeighborsClassifier(n_neighbors=1)
                 )


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



Precision Score: 0.26
Recall Score: 0.27
F1 Score: 0.26
Accuracy Score: 0.27
Accuracy Score Top-3: 0.56



Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [49]:
import pickle
with open('tgt_context_pairwise.pkl', 'rb') as handle:
    samples = pickle.load(handle)

### Evaluation

In [47]:
from utils.visualisation_functions import pca_plot_from_embeddings

pca_plot_from_embeddings(
    embedding_matrix=model.embeddings.weight.detach().numpy(),
    sectors=data.sectors,
    tickers=data.tickers,
    industries=data.industries,
    names=data.names,
    dimensions=2,
    reduced=True,
    method="PCA",
    return_df=False,
    rand_state=None,
)

In [48]:
data.sectors[data.ticker2idx["ADI"]]
data.industries[data.ticker2idx["ADI"]]

'SEMICONDUCTORS'

In [46]:
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
numpy_embeddings = model.embeddings.weight.detach().numpy()
train_size = int(len(data.sectors)*0.7)
X_train = numpy_embeddings[:train_size, :]
X_test = numpy_embeddings[train_size:, :]
y_train = data.sectors[:train_size]
y_test = data.sectors[train_size:]

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from utils.ucr_helpers import evaluate_resampling_UCR
report, _, _ = evaluate_resampling_UCR(
    X_train,
    X_test,
    y_train,
    y_test,
    # classifier=SVC(kernel="rbf"),
    # classifier=LogisticRegression(),
    classifier=KNeighborsClassifier(n_neighbors=1),
    # classifier=MLPClassifier(),
    n_resamples=20,
    verbose=True,
    scale=False,
    over_sampling=True
)
report

100%|██████████| 19/19 [00:00<00:00, 34.49it/s]


{'precision': 0.5626492769847343,
 'recall': 0.5364130434782608,
 'f1-score': 0.5407978200785897,
 'accuracy': 0.5364130434782608}