### Contrastive learning

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
import sys
# Put the parent directory on the import path; presumably this is what lets the
# `utils` and `models` packages imported in later cells resolve.
sys.path.append('../')
import pickle

### Load returns data

In [2]:
from utils.returns_data_class import ReturnsData

# Returns period handed to the loader; the recorded output shows that a period
# of 1 leaves the data unchanged.
PERIOD = 1

# Input files: the daily returns table and the extras (historical stocks) file.
returns_sources = dict(
    daily_returns_path="../Data/returns_df_611.csv",
    extras_path="../Data/historical_stocks.csv",
)
data = ReturnsData(**returns_sources)
data.change_returns_period(PERIOD)

# Transpose the returns values; the next cell reads X.shape[0] as the number of
# time series.
returns_matrix = data.returns_df.values
X = returns_matrix.T

No change made because period entered is 1


### Generate raw positive and negative samples

In [3]:
# Mining hyper-parameters: context size (k), window length (m) and stride.
num_TS = X.shape[0]
num_pos_samples, window_size, stride = 32, 5, 2

print(f"Context Size: {num_pos_samples}, Period: {window_size}, Stride: {stride}")

from utils.context import get_tgt_context_euclidean_multiprocess

# Keyword arguments for the mining helper, gathered in one place. Only positive
# context sets are mined here; negatives are sampled later from the pairwise
# p-value table.
mining_kwargs = dict(
    ts_array=X,
    m=window_size,
    k=num_pos_samples,
    stride=stride,
    z_normalize=False,
    st_dev_pruning=False,
    verbose=True,
)
positive_tgt_context_sets = get_tgt_context_euclidean_multiprocess(**mining_kwargs)
print(f"Number (anchor, positive, negative) samples: {len(positive_tgt_context_sets)}")

Context Size: 32, Period: 5, Stride: 2


100%|██████████| 87/87 [00:07<00:00, 11.42it/s]
100%|██████████| 87/87 [00:08<00:00, 10.77it/s]
100%|██████████| 87/87 [00:08<00:00, 10.69it/s]
100%|██████████| 87/87 [00:08<00:00, 10.52it/s]
100%|██████████| 87/87 [00:08<00:00, 10.33it/s]
100%|██████████| 87/87 [00:08<00:00, 10.27it/s]
100%|██████████| 89/89 [00:08<00:00, 10.44it/s]


nearly returning
Number (anchor, positive, negative) samples: 1432184


In [4]:
import pandas as pd
import numpy as np

# Sanity check on one well-known ticker: which stocks show up most often in its
# mined context sets?
ticker = "JPM"
i = data.ticker2idx[ticker]

# Flatten every context list whose anchor is the chosen ticker into one array of ids.
temp = np.array(
    [entry[1] for entry in positive_tgt_context_sets if entry[0] == i]
).flatten()
print(
    f"The most commonly co-occurring stocks with {ticker} are: {[data.idx2ticker[xi] for xi in pd.Series(temp).value_counts().index][:5]}"
)

# Turn the raw occurrence counts into ticker-labelled proportions and show the top five.
occurrence_counts = pd.Series(temp).value_counts()
occurrence_counts.index = occurrence_counts.index.map(data.idx2ticker)
temp = occurrence_counts / occurrence_counts.sum()
temp.head(5)

The most commonly co-occurring stocks with JPM are: ['C', 'BAC', 'WFC', 'USB', 'PNC']


C      0.011305
BAC    0.010346
WFC    0.009946
USB    0.009559
PNC    0.008972
Name: count, dtype: float64

### Sample pairs from proportions test statistic

Sample positive and negative pairs based on the Chi-Squared p-value. If a pair's co-occurrence is higher than random chance with high confidence, sample that pair more often.

#### Define sampling functions



In [5]:
from utils.contrastive_helpers import get_cooccurrence_counts, get_pairwise_p_values
import pandas as pd

# Co-occurrence counts from the mined sets, then the pairwise p-value table
# derived from those counts.
positive_sample_distributions = get_cooccurrence_counts(positive_tgt_context_sets, data)
pairwise_p_value_df = get_pairwise_p_values(
    positive_sample_distributions,
    data,
    return_dataframe=True,
)

# Number of mined sets per anchor index, ordered by index.
anchor_ids = pd.Series([entry[0] for entry in positive_tgt_context_sets])
counts = anchor_ids.value_counts().sort_index()

100%|██████████| 611/611 [01:06<00:00,  9.25it/s]
100%|██████████| 611/611 [00:14<00:00, 42.11it/s]


In [7]:
# Drop the reference to the raw mined context sets; `counts` and the p-value
# table have already been derived from them.
# NOTE(review): the later "Combine positive and negative samples together" cells
# still read this name, so it must be rebuilt or reloaded before they can run.
del positive_tgt_context_sets

In [6]:
from tqdm import tqdm
from utils.contrastive_helpers import get_sampling_distribution, sample_tgt_context_sets_from_distribution
import pandas as pd

# Thresholds passed to get_sampling_distribution for the positive and negative
# sampling distributions respectively.
positive_threshold = 0.1
negative_threshold = 0.8

# Size of each sampled negative / positive set.
num_neg_samples = 300
num_pos_samples = 32

# For every query ticker, count the distinct sample tickers whose (1 - p_value)
# is below the negative threshold. The smallest of those counts is the largest
# negative set size that every ticker can supply.
eligible_negative_rows = pairwise_p_value_df[
    (1 - pairwise_p_value_df["p_value"]) < negative_threshold
]
min_remaining = (
    eligible_negative_rows.groupby("query_ticker")["sample_ticker"].nunique().min()
)

print(f"Num negative samples: {num_neg_samples}\nMax number of negative samples possible at this threshold: {min_remaining}.")
if num_neg_samples > min_remaining:
    raise ValueError(f"Need to decrease num_neg samples to {min_remaining} or increase threshold")

# Number of (positive set, negative set) pairs drawn per ticker.
num_sets = 5000
dist_shapes = []
index_samples = []
for ticker in tqdm(data.tickers):
    # Build this ticker's sampling distributions; positives first, then negatives.
    pos_distribution = get_sampling_distribution(
        ticker,
        pairwise_p_value_df,
        sample_type="positive_samples",
        positive_threshold=positive_threshold,
    )
    neg_distribution = get_sampling_distribution(
        ticker,
        pairwise_p_value_df,
        sample_type="negative_samples",
        negative_threshold=negative_threshold,
    )
    dist_shapes.append((pos_distribution.shape[0], neg_distribution.shape[0]))

    # Draw the sets in the same order as the distributions were built.
    pos_sets = sample_tgt_context_sets_from_distribution(
        pos_distribution, n_samples=num_sets, sample_size=num_pos_samples
    )
    neg_sets = sample_tgt_context_sets_from_distribution(
        neg_distribution, n_samples=num_sets, sample_size=num_neg_samples
    )

    # Pair the k-th positive set with the k-th negative set and store everything
    # as integer indices: (anchor, positives, negatives).
    for p, n in zip(pos_sets, neg_sets):
        index_samples.append(
            (
                data.ticker2idx[ticker],
                [data.ticker2idx[xi] for xi in p[1]],
                [data.ticker2idx[xi] for xi in n[1]],
            )
        )

Num negative samples: 300
Max number of negative samples possible at this threshold: 344.


100%|██████████| 611/611 [08:16<00:00,  1.23it/s]


In [10]:
# Preview the first rows of the pairwise p-value table.
pairwise_p_value_df.head()

Unnamed: 0,query_ticker,sample_ticker,count,p_value,query_ticker_idx,sample_ticker_idx
0,AAPL,MSFT,324,3.899354e-74,0,354
1,AAPL,CSCO,310,1.811164e-64,0,141
2,AAPL,INTC,294,2.873598e-54,0,276
3,AAPL,ORCL,266,1.3673129999999998e-38,0,404
4,AAPL,TXN,263,4.484142e-37,0,538


In [10]:
from pathlib import Path

import plotly.graph_objects as go

# Histogram of the p-values of every pair whose query ticker is AAPL.
aapl_rows = pairwise_p_value_df[pairwise_p_value_df["query_ticker"] == "AAPL"]
fig = go.Figure()
fig.add_trace(
    go.Histogram(
        x=aapl_rows["p_value"].values,
        # histnorm="probability density"
        nbinsx=20,
    )
)
fig.update_layout(
    template="presentation",
    height=500,
    width=800,
    margin=dict(t=20),
    xaxis=dict(title="p-value"),
    yaxis=dict(title="Ticker Count"),
)
fig.show()

# Resolve the Downloads folder from the current user's home directory instead of
# hard-coding one machine's absolute path, and make sure it exists before writing.
figure_path = Path.home() / "Downloads" / "p_value_distribution.pdf"
figure_path.parent.mkdir(parents=True, exist_ok=True)
fig.write_image(str(figure_path))

In [29]:
import numbers

# Combine positive and negative samples together into (anchor, positives, negatives).
# NOTE(review): this cell reads positive_tgt_context_sets and
# negative_tgt_context_sets, which must exist in the kernel before it runs.
index_samples = []

# Decide whether the sets hold ticker strings (need mapping) or indices already.
# numbers.Real is used rather than (int, float) because NumPy integer scalars
# such as np.int64 are not subclasses of int but are registered as numbers.
# An empty input needs no mapping and simply yields an empty result.
needs_mapping = bool(positive_tgt_context_sets) and not isinstance(
    positive_tgt_context_sets[0][0], numbers.Real
)

for pos, neg in tqdm(zip(positive_tgt_context_sets,negative_tgt_context_sets), total=len(positive_tgt_context_sets)):
    # The k-th positive and k-th negative set must share the same anchor.
    if pos[0]!=neg[0]:
        raise ValueError("Sets are not aligned")
    if needs_mapping:
        index_samples.append((data.ticker2idx[pos[0]], [data.ticker2idx[xi] for xi in pos[1]], [data.ticker2idx[xi] for xi in neg[1]]))
    else:
        index_samples.append((pos[0], pos[1], neg[1]))

# The separate lists are no longer needed once merged.
del positive_tgt_context_sets, negative_tgt_context_sets

100%|██████████| 3055000/3055000 [02:30<00:00, 20234.82it/s]


In [33]:
from models.contrastive import ContrastiveMultiPN
from utils.contrastive_helpers import IndividualSigmoidLoss, AggregateSigmoidLoss, IndPos_AggSoftmax, AggPos_IndNeg, JointPos_MarginalNeg
# Weight settings noted for the alternative criteria:
# JointPos_MarginalNeg - pos weight = 7
# AggregateSigmoidLoss - pos weight = 100, neg weight=10
criterion = AggPos_IndNeg(positive_weight=10, negative_weight=1)
# Size the embedding table from the loaded data rather than a hard-coded 611, so
# it stays in step with the tickers that index_samples was built from.
model = ContrastiveMultiPN(n_time_series=len(data.tickers), embedding_dim=16, criterion=criterion)
model.normalize_embeddings()

In [34]:
# Optimisation settings for this run, collected in one place.
train_settings = dict(
    batch_size=128,
    learning_rate=0.001,
    epochs=10,
    patience=3,
    early_stopping=True,
    print_every=2,
    regularization_weight=0.0001,
    update_loss_weights=False,
)
model.train(index_samples, **train_settings)

=== Epoch [1/10] ===
Contrastive Loss: 1.0487  |  Total Loss: 1.3236,
Positive Loss: 0.3709  |  Negative Loss: 0.6779,
Positive Weight: 10.0000  |  Negative Weight: 1.0000,
Epoch Time: 100.52 sec |  Remaining: 904.64 sec,

=== Epoch [3/10] ===
Contrastive Loss: 0.7811  |  Total Loss: 1.1156,
Positive Loss: 0.2114  |  Negative Loss: 0.5697,
Positive Weight: 10.0000  |  Negative Weight: 1.0000,
Epoch Time: 95.52 sec |  Remaining: 677.87 sec,

=== Epoch [5/10] ===
Contrastive Loss: 0.7808  |  Total Loss: 1.1156,
Positive Loss: 0.2114  |  Negative Loss: 0.5694,
Positive Weight: 10.0000  |  Negative Weight: 1.0000,
Epoch Time: 93.89 sec |  Remaining: 478.60 sec,

=== Epoch [7/10] ===
Contrastive Loss: 0.7806  |  Total Loss: 1.1154,
Positive Loss: 0.2113  |  Negative Loss: 0.5693,
Positive Weight: 10.0000  |  Negative Weight: 1.0000,
Epoch Time: 94.89 sec |  Remaining: 286.22 sec,

=== Epoch [9/10] ===
Contrastive Loss: 0.7806  |  Total Loss: 1.1154,
Positive Loss: 0.2114  |  Negative Loss: 

In [36]:
# Plot the training history recorded by the model.
# NOTE(review): skip=1 presumably omits the first recorded point — confirm
# against ContrastiveMultiPN.plot_training.
model.plot_training(skip=1)

In [41]:
from utils.sector_classification import get_sector_score

from sklearn.neighbors import KNeighborsClassifier

# Score how well the learned embeddings predict each stock's sector, using the
# helper's default classifier. A 1-nearest-neighbour alternative can be passed
# with classifier=KNeighborsClassifier(n_neighbors=1).
embedding_matrix = model.embeddings.weight.detach().numpy()
get_sector_score(
    embedding_matrix,
    sectors=data.sectors,
    top_k_accuracy=True,
    scale=True,
    smote=True,
)

Precision Score: 0.68
Recall Score: 0.66
F1 Score: 0.66
Accuracy Score: 0.66
Accuracy Score Top-3: 0.86


In [20]:
# Persist the learned embeddings to a CSV file.
# NOTE(review): the file name says IndPos_AggSoftmax, but the criterion built in
# the model cell above is AggPos_IndNeg — confirm the name matches the loss that
# was actually trained.
model.save_embeddings_to_csv(fname="IndPos_AggSoftmax_embeddings.csv")

In [31]:
from utils.visualisation_functions import pca_plot_from_embeddings

# Two-dimensional PCA view of the embeddings, labelled with stock metadata.
embedding_matrix = model.embeddings.weight.detach().numpy()
projection_options = dict(
    dimensions=2,
    reduced=True,
    method="PCA",
    return_df=False,
    rand_state=None,
)
pca_plot_from_embeddings(
    embedding_matrix=embedding_matrix,
    sectors=data.sectors,
    tickers=data.tickers,
    industries=data.industries,
    names=data.names,
    **projection_options,
)

Temporary section (applies to the next few cells)
- Try to retain the positive alignment and only sample from the negative distribution. The same number of negatives must be sampled, in the same order, though.

In [None]:
import pickle
import pandas as pd

# Reload previously mined positive sets from disk.
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
with open('tgt_context_pairwise.pkl', 'rb') as handle:
    loaded_sets = pickle.load(handle)

# Order the sets by anchor, then count how many sets each anchor has.
positive_tgt_context_sets = sorted(loaded_sets, key=lambda entry: entry[0])
anchor_ids = pd.Series([entry[0] for entry in positive_tgt_context_sets])
counts = anchor_ids.value_counts().sort_index()

from utils.contrastive_helpers import get_cooccurrence_counts, get_pairwise_p_values

# Rebuild the co-occurrence counts and the pairwise p-value table from the reloaded sets.
positive_sample_distributions = get_cooccurrence_counts(positive_tgt_context_sets, data)
pairwise_p_value_df = get_pairwise_p_values(
    positive_sample_distributions,
    data,
    return_dataframe=True,
)

In [4]:
# Load a cached pairwise p-value table from disk.
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
with open("p_value_df_p1_daily_p10_s3_n10.pkl", 'rb') as handle:
    pairwise_p_value_df = pickle.load(handle)

In [9]:
from utils.contrastive_helpers import get_sampling_distribution, sample_tgt_context_sets_from_distribution
from tqdm import tqdm

# Size of each sampled negative set.
num_neg_samples = 64
negative_tgt_context_sets = []

# Walk the anchors in index order and draw as many negative sets for each anchor
# as `counts` records for it, so the result can be paired one-to-one with the
# positive sets.
for idx, count in tqdm(counts.items(), total=len(data.tickers)):
    ticker = data.idx2ticker[idx]
    temp_ticker_distribution_neg = get_sampling_distribution(
        ticker,
        pairwise_p_value_df,
        sample_type="negative_samples",
        negative_threshold=0.5,
    )
    temp_ticker_negative_sets = sample_tgt_context_sets_from_distribution(
        temp_ticker_distribution_neg,
        n_samples=count,
        sample_size=num_neg_samples,
    )
    negative_tgt_context_sets.extend(temp_ticker_negative_sets)

611it [01:47,  5.70it/s]


In [15]:
# Replace ticker symbols with integer indices in every negative set: both the
# anchor (element 0) and each member of its negative list (element 1).
mapped_negative_sets = []
for xi in tqdm(negative_tgt_context_sets):
    mapped_negative_sets.append(
        (data.ticker2idx[xi[0]], [data.ticker2idx[x] for x in xi[1]])
    )
negative_tgt_context_sets = mapped_negative_sets


  0%|          | 0/1860195 [00:00<?, ?it/s]

100%|██████████| 1860195/1860195 [00:24<00:00, 74606.46it/s] 


In [18]:
# Combine positive and negative samples together
index_samples = []
# Handle whether the sets are ticker strings or already indices
if isinstance(positive_tgt_context_sets[0][0],(int, float)):
    needs_mapping=False
else:
    needs_mapping=True
for pos, neg in zip(positive_tgt_context_sets,negative_tgt_context_sets):
    if pos[0]!=neg[0]:
        raise ValueError("Sets are not aligned")
    if not needs_mapping:
        index_samples.append((int(pos[0]), int(pos[1]), int(neg[1])))
    elif needs_mapping:
        index_samples.append((data.ticker2idx[pos[0]], [data.ticker2idx[xi] for xi in pos[1]], [data.ticker2idx[xi] for xi in neg[1]]))

 16%|█▋        | 302517/1860195 [02:35<13:19, 1949.43it/s]  


In [19]:
# Cache the combined index samples on disk so they can be reloaded without
# regenerating them.
with open('pns_pairwise_pos_idx_samples.pkl', 'wb') as handle:
    pickle.dump(index_samples, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [7]:
# Reload the cached index samples.
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
with open("pns_pairwise_pos_idx_samples.pkl", 'rb') as handle:
    index_samples = pickle.load(handle)

In [8]:
from tqdm import tqdm

# Cast every anchor, positive and negative index to a built-in int.
int_samples = []
for xi in tqdm(index_samples):
    int_samples.append(
        (int(xi[0]), list(map(int, xi[1])), list(map(int, xi[2])))
    )
index_samples = int_samples

100%|██████████| 1860195/1860195 [01:23<00:00, 22363.61it/s] 


### Training
- No regularization performs better
    - Why is no regularization better here but not for matrix factorization?
- Loss 1 performs better than loss 2

In [170]:
from models.contrastive import ContrastiveMultiPN
from utils.contrastive_helpers import IndividualSigmoidLoss, AggregateSigmoidLoss, IndPos_AggSoftmax, AggPos_IndNeg, JointPos_MarginalNeg
# Weight settings noted for the criteria:
# JointPos_MarginalNeg - pos weight = 7
# AggregateSigmoidLoss - pos weight = 20
criterion = JointPos_MarginalNeg(positive_weight=10, negative_weight=1)
# Size the embedding table from the loaded data rather than a hard-coded 611, so
# it stays in step with the tickers that index_samples was built from.
model = ContrastiveMultiPN(n_time_series=len(data.tickers), embedding_dim=16, criterion=criterion)
model.normalize_embeddings()

In [168]:
# Set the criterion's positive weight to zero.
# NOTE(review): presumably an ablation that leaves only the negative term
# contributing — confirm how the criterion uses positive_weight.
model.criterion.positive_weight = 0

- Add some sort of margin?
- Adjust positive and negative weight dynamically to ensure a consistent pos/neg ratio?

In [None]:
# Add some sort of margin?

In [171]:
# Optimisation settings for this short run, collected in one place.
train_settings = dict(
    batch_size=64,
    learning_rate=0.001,
    epochs=3,
    patience=3,
    early_stopping=True,
    print_every=1,
    regularization_weight=0.0001,
)
model.train(index_samples, **train_settings)

=== Epoch [1/3] ===
Contrastive Loss: 2.8414  |  Total Loss: 3.0368,
Positive Loss: 0.9698  |  Negative Loss: 1.8716,
Epoch Time: 92.39 sec |  Remaining: 184.78 sec,



KeyboardInterrupt: 

In [161]:
# Plot the training history recorded by the model.
# NOTE(review): skip=0 presumably keeps every recorded point — confirm against
# ContrastiveMultiPN.plot_training.
model.plot_training(skip=0)

In [154]:
from utils.sector_classification import get_sector_score

from sklearn.neighbors import KNeighborsClassifier

# Score how well the learned embeddings predict each stock's sector, using the
# helper's default classifier. A 1-nearest-neighbour alternative can be passed
# with classifier=KNeighborsClassifier(n_neighbors=1).
sector_score_options = dict(top_k_accuracy=True, scale=True, smote=True)
get_sector_score(
    model.embeddings.weight.detach().numpy(),
    sectors=data.sectors,
    **sector_score_options,
)

Precision Score: 0.62
Recall Score: 0.6
F1 Score: 0.6
Accuracy Score: 0.6
Accuracy Score Top-3: 0.82


In [108]:
import numpy as np
import plotly.express as px

# Distribution of the per-row L2 norms of the embedding matrix.
e = model.embeddings.weight.detach().numpy()
embedding_norms = np.linalg.norm(e, axis=1)
px.histogram(embedding_norms)

In [19]:
# Persist the learned embeddings to a CSV file.
# NOTE(review): the file name says IndPos_AggSoftmax, but the criterion built in
# the training section above is JointPos_MarginalNeg — confirm the name matches
# the loss that was actually trained.
model.save_embeddings_to_csv(fname="IndPos_AggSoftmax_embeddings.csv")

In [12]:
from models.embedding_models import ClassificationEmbeddings

# Embedding width for the classification-style model.
EMBEDDING_DIM = 20

# One embedding row per ticker. Note that this rebinds `model`, replacing the
# contrastive model created earlier in the notebook.
n_series = len(data.tickers)
model = ClassificationEmbeddings(n_time_series=n_series, embedding_dim=EMBEDDING_DIM)

In [14]:
from utils.classifier_training_helpers import train_embeddings_from_tgt_context_sets

# Single source of truth for the number of training epochs passed below.
EPOCHS = 5

# Train the classification embeddings on the index samples; the helper returns
# the trained model together with the recorded losses.
model, losses = train_embeddings_from_tgt_context_sets(
    n_time_series=len(data.tickers),
    tgt_context_sets=index_samples,
    model=model,
    epochs=EPOCHS,
    batch_size=64,
    early_stopping=False,
    device="cpu",
    # embedding_dim=EMBEDDING_DIM,
    verbose=True,
)

Training embeddings...


 20%|██        | 1/5 [00:26<01:46, 26.73s/it]

Epoch 0: Loss = 0.1037282788136238


 40%|████      | 2/5 [00:54<01:21, 27.07s/it]

Epoch 1: Loss = 0.10124456155388321


 60%|██████    | 3/5 [01:21<00:54, 27.47s/it]

Epoch 2: Loss = 0.09953595505930893


 80%|████████  | 4/5 [01:49<00:27, 27.33s/it]

Epoch 3: Loss = 0.09861801796170289


100%|██████████| 5/5 [02:15<00:00, 27.14s/it]

Epoch 4: Loss = 0.09813162753960566





### Evaluation

In [109]:
from utils.visualisation_functions import pca_plot_from_embeddings

pca_plot_from_embeddings(
    embedding_matrix=model.embeddings.weight.detach().numpy(),
    sectors=data.sectors,
    tickers=data.tickers,
    industries=data.industries,
    names=data.names,
    dimensions=2,
    reduced=True,
    method="PCA",
    return_df=False,
    rand_state=None,
)

In [None]:
# Look up ADI once and show both its sector and its industry. A notebook cell
# only renders its final expression, so the two values are returned as a tuple.
adi_idx = data.ticker2idx["ADI"]
data.sectors[adi_idx], data.industries[adi_idx]

In [None]:
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from utils.ucr_helpers import evaluate_resampling_UCR

numpy_embeddings = model.embeddings.weight.detach().numpy()

# Positional 70/30 split: the first 70% of rows (in data order, no shuffling)
# are used for training and the remainder for testing.
train_size = int(0.7 * len(data.sectors))
X_train, X_test = numpy_embeddings[:train_size, :], numpy_embeddings[train_size:, :]
y_train, y_test = data.sectors[:train_size], data.sectors[train_size:]

# Evaluate a 1-nearest-neighbour classifier. SVC(kernel="rbf"),
# LogisticRegression() and MLPClassifier() are the alternatives that can be
# swapped in through the `classifier` argument.
resampling_options = dict(
    n_resamples=20,
    verbose=True,
    scale=False,
    over_sampling=True,
)
report, _, _ = evaluate_resampling_UCR(
    X_train,
    X_test,
    y_train,
    y_test,
    classifier=KNeighborsClassifier(n_neighbors=1),
    **resampling_options,
)
report