### XXXXXX

In [4]:
# Enable IPython's autoreload extension (mode 2: reload all modules before executing code),
# so edits to the project modules imported below take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2
import sys
# Add the parent directory to the import path; presumably this is where the
# `utils` and `models` packages imported by later cells live.
sys.path.append('../')
import pickle

### Load returns data

In [5]:
from utils.returns_data_class import ReturnsData

# Returns period handed to change_returns_period(); with 1 the helper reports
# that no change is made.
PERIOD = 1

data = ReturnsData(extras_path="../Data/historical_stocks.csv", daily_returns_path="../Data/returns_df_611.csv")
data.change_returns_period(PERIOD)

# Transposed returns matrix; the first axis is later treated as the series axis
# (num_TS = X.shape[0]).
X = data.returns_df.values.T

No change made because period entered is 1


### Generate raw positive and negative samples

In [6]:
num_TS = X.shape[0]
num_pos_samples = 10
period = 10
stride = 3
num_neg_samples = 30

print(f"Context Size: {num_pos_samples}, Period: {period}, Stride: {stride}")
print(f"Number of Negative Samples: {num_neg_samples}")

from utils.context import get_tgt_context_euclidean_multiprocess

# Arguments common to the positive and the negative pass.
shared_sampling_kwargs = dict(
    ts_array=X,
    m=period,
    stride=stride,
    z_normalize=False,
    st_dev_pruning=False,
    verbose=True,
)

# Positive pass: k = num_pos_samples with the helper's default top_k setting.
positive_tgt_context_sets = get_tgt_context_euclidean_multiprocess(
    k=num_pos_samples,
    **shared_sampling_kwargs,
)
# Negative pass: k = num_neg_samples with top_k switched off.
negative_tgt_context_sets = get_tgt_context_euclidean_multiprocess(
    k=num_neg_samples,
    top_k=False,
    **shared_sampling_kwargs,
)
print(f"Number (anchor, positive, negative) samples: {len(positive_tgt_context_sets)}")

Context Size: 10, Period: 10, Stride: 3
Number of Negative Samples: 30


100%|██████████| 87/87 [00:07<00:00, 12.10it/s]
100%|██████████| 87/87 [00:07<00:00, 12.05it/s]
100%|██████████| 87/87 [00:07<00:00, 11.97it/s]
100%|██████████| 87/87 [00:07<00:00, 12.02it/s]
100%|██████████| 89/89 [00:07<00:00, 12.20it/s]
100%|██████████| 87/87 [00:07<00:00, 11.23it/s]
100%|██████████| 87/87 [00:07<00:00, 11.14it/s]


nearly returning
Number (anchor, positive, negative) samples: 953771


#### Alternatively, load saved samples

In [None]:
import pickle

# Restore previously saved positive target/context sets instead of regenerating them.
# NOTE(review): pickle.load can execute arbitrary code; only load files you produced yourself.
with open('samples_pairwise.pkl', 'rb') as samples_file:
    positive_tgt_context_sets = pickle.load(samples_file)

In [4]:
import pandas as pd
import numpy as np

ticker = "JPM"
i = data.ticker2idx[ticker]

# Context entries of every positive set anchored on `ticker`, flattened to 1-D.
context_indices = np.array([xi[1] for xi in positive_tgt_context_sets if xi[0] == i]).flatten()

# Count once (value_counts sorts by descending frequency) and reuse the result for
# both the printed ranking and the displayed shares, instead of counting twice.
cooccurrence_counts = pd.Series(context_indices).value_counts()
# Relabel the index from integer ids to ticker symbols.
cooccurrence_counts.index = cooccurrence_counts.index.map(data.idx2ticker)

print(
    f"The most commonly co-occurring stocks with {ticker} are: {list(cooccurrence_counts.index[:5])}"
)

# Normalise counts to shares of all co-occurrences and display the five largest.
cooccurrence_share = cooccurrence_counts / cooccurrence_counts.sum()
cooccurrence_share.iloc[:5]

The most commonly co-occurring stocks with JPM are: ['C', 'BAC', 'WFC', 'USB', 'PNC']


C      0.030750
BAC    0.026393
WFC    0.024920
USB    0.023959
PNC    0.020884
Name: count, dtype: float64

### Sample pairs from proportions test statistic

Sample positive and negative pairs based on the chi-squared p-value of their co-occurrence. If a pair co-occurs more often than would be expected at random, with high confidence, then it is sampled more often.

#### Define sampling functions



In [None]:
from utils.contrastive_helpers import (
    get_cooccurrence_counts,
    get_pairwise_p_values,
)

# Step 1: co-occurrence counts derived from the positive target/context sets.
positive_sample_distributions = get_cooccurrence_counts(positive_tgt_context_sets, data)

# Step 2: pairwise p-values computed from those counts, requested as a DataFrame.
pairwise_p_value_df = get_pairwise_p_values(
    positive_sample_distributions,
    data,
    return_dataframe=True,
)

In [46]:
# Disabled on purpose: uncomment to cache the pairwise p-value table in the pickle file
# that the following cell reads back.
# with open('p_value_df_p1_daily_p10_s3_n10.pkl', 'wb') as handle:
#     pickle.dump(pairwise_p_value_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [35]:
# Reload the cached pairwise p-value table.
# NOTE(review): pickle.load can execute arbitrary code; only load files you produced yourself.
with open('p_value_df_p1_daily_p10_s3_n10.pkl', 'rb') as p_value_file:
    pairwise_p_value_df = pickle.load(p_value_file)

In [38]:
from tqdm import tqdm
from utils.contrastive_helpers import get_sampling_distribution, sample_tgt_context_sets_from_distribution

num_neg_samples = 64
num_pos_samples = 32
num_sets = 5000

dist_shapes = []
positive_tgt_context_sets = []
negative_tgt_context_sets = []

for ticker in tqdm(data.tickers):
    # Per-ticker sampling distributions built from the pairwise p-value table.
    pos_distribution = get_sampling_distribution(ticker, pairwise_p_value_df, sample_type="positive_samples")
    neg_distribution = get_sampling_distribution(ticker, pairwise_p_value_df, sample_type="negative_samples")

    # Record the size of each distribution for later inspection.
    dist_shapes.append((pos_distribution.shape[0], neg_distribution.shape[0]))

    # Draw `num_sets` sets per ticker: positives first, then negatives.
    positive_tgt_context_sets.extend(
        sample_tgt_context_sets_from_distribution(pos_distribution, n_samples=num_sets, sample_size=num_pos_samples)
    )
    negative_tgt_context_sets.extend(
        sample_tgt_context_sets_from_distribution(neg_distribution, n_samples=num_sets, sample_size=num_neg_samples)
    )

 61%|██████    | 371/611 [03:55<02:32,  1.58it/s]


KeyboardInterrupt: 

Temporary approach for the next few cells:
- Try to retain the existing positive sets (and their alignment) and only sample from the negative distribution. The negatives must be sampled in the same number and in the same order as the positives, though.

In [3]:
import pickle
import pandas as pd

# NOTE(review): pickle.load can execute arbitrary code; only load files you produced yourself.
with open('tgt_context_pairwise.pkl', 'rb') as sets_file:
    positive_tgt_context_sets = pickle.load(sets_file)

# Order the sets by their first element (the anchor) ...
positive_tgt_context_sets = sorted(positive_tgt_context_sets, key=lambda entry: entry[0])
# ... and count how many sets each anchor has, ordered by anchor as well.
counts = pd.Series([entry[0] for entry in positive_tgt_context_sets]).value_counts().sort_index()

from utils.contrastive_helpers import get_cooccurrence_counts, get_pairwise_p_values

positive_sample_distributions = get_cooccurrence_counts(positive_tgt_context_sets, data)
pairwise_p_value_df = get_pairwise_p_values(
    positive_sample_distributions,
    data,
    return_dataframe=True,
)

100%|██████████| 611/611 [05:17<00:00,  1.93it/s]
  0%|          | 1/611 [00:40<6:55:19, 40.85s/it]


KeyboardInterrupt: 

In [4]:
# Reload the cached pairwise p-value table rather than recomputing it.
# NOTE(review): pickle.load can execute arbitrary code; only load files you produced yourself.
with open("p_value_df_p1_daily_p10_s3_n10.pkl", 'rb') as p_value_file:
    pairwise_p_value_df = pickle.load(p_value_file)

In [9]:
from utils.contrastive_helpers import get_sampling_distribution, sample_tgt_context_sets_from_distribution
from tqdm import tqdm

negative_tgt_context_sets = []

num_neg_samples = 64

# Draw, for every anchor, exactly as many negative sets as it has positive sets.
# `counts` is ordered by anchor index, so the negatives come out in the same
# anchor order as the sorted positive sets.
# The progress total is len(counts) because the loop iterates `counts`;
# len(data.tickers) would be wrong whenever some ticker has no positive sets.
for idx, count in tqdm(counts.items(), total=len(counts)):
    ticker = data.idx2ticker[idx]
    negative_distribution = get_sampling_distribution(
        ticker,
        pairwise_p_value_df,
        sample_type="negative_samples",
        negative_threshold=0.5,
    )
    negative_tgt_context_sets += sample_tgt_context_sets_from_distribution(
        negative_distribution,
        n_samples=count,
        sample_size=num_neg_samples,
    )

611it [01:47,  5.70it/s]


In [15]:
# Map the ticker symbols in the negative sets to integer indices via data.ticker2idx.
# The guard makes the cell safe to re-run: once the sets hold indices (or when the
# list is empty) the conversion is skipped, instead of looking up integers in a
# mapping that is keyed by ticker symbol.
if negative_tgt_context_sets and isinstance(negative_tgt_context_sets[0][0], str):
    negative_tgt_context_sets = [
        (data.ticker2idx[xi[0]], [data.ticker2idx[x] for x in xi[1]])
        for xi in tqdm(negative_tgt_context_sets)
    ]


  0%|          | 0/1860195 [00:00<?, ?it/s]

100%|██████████| 1860195/1860195 [00:24<00:00, 74606.46it/s] 


In [18]:
# Combine positive and negative samples together
index_samples = []
# Handle whether the sets are ticker strings or already indices
if isinstance(positive_tgt_context_sets[0][0],(int, float)):
    needs_mapping=False
else:
    needs_mapping=True
for pos, neg in zip(positive_tgt_context_sets,negative_tgt_context_sets):
    if pos[0]!=neg[0]:
        raise ValueError("Sets are not aligned")
    if not needs_mapping:
        index_samples.append((int(pos[0]), int(pos[1]), int(neg[1])))
    elif needs_mapping:
        index_samples.append((data.ticker2idx[pos[0]], [data.ticker2idx[xi] for xi in pos[1]], [data.ticker2idx[xi] for xi in neg[1]]))

 16%|█▋        | 302517/1860195 [02:35<13:19, 1949.43it/s]  


In [19]:
# Persist the combined index samples so later sessions can skip the sampling steps.
with open('pns_pairwise_pos_idx_samples.pkl', 'wb') as samples_file:
    pickle.dump(index_samples, samples_file, protocol=pickle.HIGHEST_PROTOCOL)

In [6]:
# Reload the saved index samples.
# NOTE(review): pickle.load can execute arbitrary code; only load files you produced yourself.
with open("pns_pairwise_pos_idx_samples.pkl", 'rb') as samples_file:
    index_samples = pickle.load(samples_file)

In [7]:
from tqdm import tqdm

# Rebuild every sample with built-in ints: the anchor at position 0, and each
# element of the two lists at positions 1 and 2.
converted_samples = []
for sample in tqdm(index_samples):
    converted_samples.append(
        (
            int(sample[0]),
            [int(value) for value in sample[1]],
            [int(value) for value in sample[2]],
        )
    )
index_samples = converted_samples

100%|██████████| 1860195/1860195 [01:09<00:00, 26608.07it/s] 


### Training
- No regularization performs better
    - Why is no regularization better here but not for matrix factorization?
- Loss 1 performs better than loss 2

In [21]:
from models.contrastive import ContrastiveMultiPN
from utils.contrastive_helpers import IndividualSigmoidLoss, AggregateSigmoidLoss, IndPos_AggSoftmax, AggPos_IndNeg, JointPos_MarginalNeg

# Size the embedding table from the loaded data instead of hard-coding 611, so the
# model stays consistent with the dataset (and with the other model cell, which
# already uses len(data.tickers)).
model = ContrastiveMultiPN(
    n_time_series=len(data.tickers),
    embedding_dim=16,
    criterion=JointPos_MarginalNeg(positive_weight=7),
)

In [31]:
# Train on the (anchor, positives, negatives) index samples.
model.train(
    index_samples,
    batch_size=64,
    learning_rate=0.0001,
    epochs=10,
    patience=3,
    early_stopping=True,
    print_every=2,
    regularization_weight=0.0001,
)

=== Epoch [1/10] ===
Contrastive Loss: 2.2792  |  Total Loss: 2.5536,
Positive Loss: 0.7831  |  Negative Loss: 1.4961,
Epoch Time: 80.94 sec |  Remaining: 728.50 sec,

=== Epoch [3/10] ===
Contrastive Loss: 2.2725  |  Total Loss: 2.5521,
Positive Loss: 0.7818  |  Negative Loss: 1.4906,
Epoch Time: 81.97 sec |  Remaining: 575.52 sec,

=== Epoch [5/10] ===
Contrastive Loss: 2.2676  |  Total Loss: 2.5511,
Positive Loss: 0.7812  |  Negative Loss: 1.4863,
Epoch Time: 82.18 sec |  Remaining: 410.63 sec,

=== Epoch [7/10] ===
Contrastive Loss: 2.2644  |  Total Loss: 2.5506,
Positive Loss: 0.7807  |  Negative Loss: 1.4837,
Epoch Time: 81.56 sec |  Remaining: 246.25 sec,

=== Epoch [9/10] ===
Contrastive Loss: 2.2622  |  Total Loss: 2.5502,
Positive Loss: 0.7807  |  Negative Loss: 1.4815,
Epoch Time: 82.39 sec |  Remaining: 82.04 sec,

=== Epoch [10/10] ===
Contrastive Loss: 2.2613  |  Total Loss: 2.5501,
Positive Loss: 0.7805  |  Negative Loss: 1.4809,
Epoch Time: 84.80 sec |  Remaining: 0.00 

In [32]:
# Presumably plots the loss history recorded by model.train() -- the implementation is not shown here.
model.plot_training()

In [38]:
from utils.sector_classification import get_sector_score

from sklearn.neighbors import KNeighborsClassifier

# Score the learned embeddings against the sector labels.
get_sector_score(
    model.embeddings.weight.detach().numpy(),
    sectors=data.sectors,
    top_k_accuracy=True,
    scale=True,
    smote=True,
    # classifier=KNeighborsClassifier(n_neighbors=1)
)

Precision Score: 0.62
Recall Score: 0.59
F1 Score: 0.6
Accuracy Score: 0.59
Accuracy Score Top-3: 0.83


In [21]:
# Sanity check: size of the positive list (position 1) in the first sample.
len(index_samples[0][1])

32

In [40]:
# Write the model's embeddings to CSV under the given file name (destination directory
# is decided by the method, which is not shown here).
model.save_embeddings_to_csv(fname="JointPos_MarginalNeg_mean_embeddings.csv")

In [12]:
from models.embedding_models import ClassificationEmbeddings

EMBEDDING_DIM = 20

# One embedding per ticker in the loaded data; note this rebinds `model`.
model = ClassificationEmbeddings(embedding_dim=EMBEDDING_DIM, n_time_series=len(data.tickers))

In [14]:
from utils.classifier_training_helpers import train_embeddings_from_tgt_context_sets

# Number of training epochs. The constant was previously defined as 10 but never used,
# while the call hard-coded 5; it now holds the value actually run and is passed below.
EPOCHS = 5

model, losses = train_embeddings_from_tgt_context_sets(
    n_time_series=len(data.tickers),
    tgt_context_sets=index_samples,
    model=model,
    epochs=EPOCHS,
    batch_size=64,
    early_stopping=False,
    device="cpu",
    # embedding_dim=EMBEDDING_DIM,
    verbose=True,
)

Training embeddings...


 20%|██        | 1/5 [00:26<01:46, 26.73s/it]

Epoch 0: Loss = 0.1037282788136238


 40%|████      | 2/5 [00:54<01:21, 27.07s/it]

Epoch 1: Loss = 0.10124456155388321


 60%|██████    | 3/5 [01:21<00:54, 27.47s/it]

Epoch 2: Loss = 0.09953595505930893


 80%|████████  | 4/5 [01:49<00:27, 27.33s/it]

Epoch 3: Loss = 0.09861801796170289


100%|██████████| 5/5 [02:15<00:00, 27.14s/it]

Epoch 4: Loss = 0.09813162753960566





### Evaluation

In [39]:
from utils.visualisation_functions import pca_plot_from_embeddings

# 2-D PCA view of the learned embeddings, passing sector / industry / name metadata.
pca_plot_from_embeddings(
    # what to plot
    embedding_matrix=model.embeddings.weight.detach().numpy(),
    # metadata handed to the plotting helper
    tickers=data.tickers,
    names=data.names,
    sectors=data.sectors,
    industries=data.industries,
    # projection settings
    method="PCA",
    dimensions=2,
    reduced=True,
    rand_state=None,
    return_df=False,
)

In [None]:
# Show both the sector and the industry of ADI. A notebook cell only displays its
# last expression, so the two lookups are returned together as one tuple instead
# of the first one being silently discarded.
adi_idx = data.ticker2idx["ADI"]
data.sectors[adi_idx], data.industries[adi_idx]

In [None]:
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from utils.ucr_helpers import evaluate_resampling_UCR

numpy_embeddings = model.embeddings.weight.detach().numpy()

# Positional 70/30 split: the first 70% of rows are used for training and the
# remainder for testing (no shuffling is applied here).
train_size = int(len(data.sectors)*0.7)
X_train, X_test = numpy_embeddings[:train_size, :], numpy_embeddings[train_size:, :]
y_train, y_test = data.sectors[:train_size], data.sectors[train_size:]

# Alternative classifiers are kept as comments for quick swapping.
report, _, _ = evaluate_resampling_UCR(
    X_train,
    X_test,
    y_train,
    y_test,
    # classifier=SVC(kernel="rbf"),
    # classifier=LogisticRegression(),
    classifier=KNeighborsClassifier(n_neighbors=1),
    # classifier=MLPClassifier(),
    n_resamples=20,
    verbose=True,
    scale=False,
    over_sampling=True
)
report