In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../')

import pandas as pd
def _parse_learned_w(cell):
    """Return the integer written between the last "(" and the next ")" of `cell`."""
    after_paren = cell.split("(")[-1]
    return int(after_paren.split(")")[0])


meta_df = pd.read_csv("../Data/UCR_Summary.csv")
# The raw header still carries its trailing space at this point; column names
# are only stripped two statements further down.
meta_df["learned_w"] = meta_df["DTW (learned_w) "].apply(_parse_learned_w)
meta_df.index = meta_df["Name"].str.lower()
meta_df.columns = [column.strip() for column in meta_df.columns]

# Tab-separated, UTF-16 encoded results table; the first column becomes the index.
results_df = pd.read_csv(
    "../Data/UCR_results.csv",
    delimiter="\t",
    encoding="utf-16",
    index_col=0,
)

### Load the data

In [3]:
from utils.ucr_helpers import UCR_Data
# Pick the UCR dataset to analyse; swap which line is active to switch datasets.
# NOTE(review): the classification report saved further down lists 8 classes and
# 2345 test samples, which does not match the "Coffee" summary (2 classes, 28
# test samples) — some saved outputs appear to come from a different dataset.
# Re-run all cells after changing the selection.
# data = UCR_Data("Mallat")
data = UCR_Data("Coffee")
# data = UCR_Data("ArrowHead")

In [4]:
# Show the dataset summary text (class count, train/test sizes, series length).
print(data.summary)

Number of classes: 2
Number of training samples: 28
Number of test samples: 28
Length of time series: 286


In [5]:
# Build the dataset figure with the UCR_Data helper and render it.
fig = data.plot_fig()
fig.show()

In [432]:
import numpy as np

# Sanity check on the raw series: pick a random series and measure what fraction
# of its nearest neighbours (L1 distance) carry the same label.
N_NEIGHBOURS = 9

i = np.random.randint(low=0, high=data.X.shape[0])

# Broadcasting subtracts the query row from every row; no tiled copy is needed.
l1_distances = np.abs(data.X - data.X[i]).sum(axis=1)

# Remove the query by index instead of assuming it sorts first: an exact
# duplicate of the query also has distance 0 and may be ranked ahead of it,
# in which case slicing off position 0 would drop the wrong row.
ranked = np.argsort(l1_distances)
neighbours = ranked[ranked != i][:N_NEIGHBOURS]

print(data.y[i])
neighbour_labels = np.asarray([data.y[j] for j in neighbours])
np.mean(neighbour_labels == data.y[i])

1


0.5555555555555556

### Compute Similarity Matrix

In [7]:
from models.embedding_models import MatrixFactorization
# Build the pairwise similarity matrix that the embedding model is trained on.
# Exactly one variant should be active; the others (correlation, Euclidean on
# first differences, DTW with the learned warping window) are kept as
# commented-out alternatives.
# similarity_matrix = MatrixFactorization.get_correlation_matrix(data.X, scaled="normal", truncate=True)
similarity_matrix = MatrixFactorization.get_euclidean_matrix(data.X, scaled="normal", verbose=True, return_similarity=True, truncate=True)
# similarity_matrix = MatrixFactorization.get_euclidean_matrix(np.diff(data.X, axis=1), scaled="normal", verbose=True, return_similarity=True)
# similarity_matrix = MatrixFactorization.get_dtw_matrix(
#     data.X,
#     max_warping_window=meta_df[meta_df.Name == data.name]["learned_w"].iloc[0],
#     return_similarity=True,
#     scaled="normal"
# )

In [10]:
import plotly.express as px
import numpy as np
# Histogram of the strictly-upper-triangle entries (k=1 skips the diagonal),
# so each unordered pair of series contributes exactly one value.
upper_rows, upper_cols = np.triu_indices_from(similarity_matrix, k=1)
pairwise_similarities = similarity_matrix[upper_rows, upper_cols]
fig = px.histogram(pairwise_similarities, nbins=30)
fig.update_layout(template="plotly_white", width=500, height=300, showlegend=False)

- TODO: add the distribution of similarity values for the various transformations here.

### Learn embeddings

In [12]:
# Fit the matrix-factorisation embeddings against the similarity matrix, with
# one row per series in data.X and 32 embedding dimensions. The call returns the
# model together with `losses` and `learning_rates`, which the next cell plots.
model, losses, learning_rates = MatrixFactorization.train_MF_model(
    n_time_series=data.X.shape[0],
    similarity_matrix=similarity_matrix,
    early_stopping=False,
    embedding_dim=32,
    regularization_loss_weight=0.1,
    noise_mask=False,
)

100%|██████████| 300/300 [00:00<00:00, 2046.64it/s]


In [13]:
# Plot the losses and learning rates recorded while training the embeddings.
MatrixFactorization.plot_embedding_training(losses, learning_rates, verbose=True)

Final pairwise_loss: 40.91165493136276


### Test Performance

In [15]:
from utils.ucr_helpers import get_eval_df
# Evaluate the learned embeddings; the weights are detached from autograd and
# converted to a NumPy array before being handed to the helper.
get_eval_df(data, model.embeddings.weight.detach().numpy(), df=None)


The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.



Unnamed: 0_level_0,Unnamed: 1_level_0,precision,recall,f1-score,accuracy
dataset,method,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Coffee,1NN Raw,1.0,1.0,1.0,1.0
Coffee,C22 1NN,0.967,0.964,0.964,0.964
Coffee,C22 SVC,0.967,0.964,0.964,0.964
Coffee,Prop 1NN,1.0,1.0,1.0,1.0
Coffee,Prop MLP,0.937,0.929,0.928,0.929
Coffee,Prop SVC,0.937,0.929,0.928,0.929


## Visualization

In [16]:
# Visualise the learned embeddings using the 'pca' reduction, passing data.y alongside them.
MatrixFactorization.plot_with_dimensionality_reduction(model.embeddings.weight.detach().numpy(), data.y, method='pca')

TODO: write a function that plots series by index, and use it to check whether misclassified points are identical to their nearest neighbours. There also appear to be small subclusters within the group 5 and group 3 clusters.

## Experimental
### Pytorch Classifier

In [14]:
import torch.nn as nn
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score
import numpy as np
from tqdm import tqdm

# Balance the training classes by random over-sampling before fitting the classifier.
USE_OVERSAMPLING = True
# Fixed seed so the over-sampled training set is the same on every run.
OVERSAMPLING_SEED = 0

# Detached embeddings act as fixed input features for the classifier.
X = model.embeddings.weight.detach()
label_encoder = LabelEncoder()
y = torch.tensor(label_encoder.fit_transform(data.y), dtype=torch.long)
n_train = data.X_train.shape[0]

# Split the data into training and test sets: the first n_train rows are
# treated as training samples and the remainder as test samples.
X_train, X_test = X[:n_train], X[n_train:]
y_train, y_test = y[:n_train], y[n_train:]

if USE_OVERSAMPLING:
    # The resampler works on NumPy arrays, so convert, resample, convert back.
    ros = RandomOverSampler(random_state=OVERSAMPLING_SEED)
    X_resampled, y_resampled = ros.fit_resample(np.array(X_train), np.array(y_train))
    X_train = torch.tensor(X_resampled)
    y_train = torch.tensor(y_resampled)

# Convert to TensorDatasets
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

# Create DataLoaders
# Full-batch training: the batch size equals the (resampled) training-set size,
# so each epoch performs a single optimisation step.
batch_size = len(train_dataset)
shuffle = True   # Shuffle the training data

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

class EntityClassifier(nn.Module):
    """Small fully connected classifier over embedding vectors.

    Architecture: Linear(embedding_size, 32) -> ReLU -> Dropout(0.2)
    -> Linear(32, 32) -> Dropout(0.2) -> ReLU -> Linear(32, num_classes).
    The output is raw, un-normalised class scores (no softmax is applied).

    Args:
        embedding_size: Number of input features per sample.
        num_classes: Number of output classes.
    """

    def __init__(self, embedding_size, num_classes):
        super().__init__()
        hidden_units = 32
        self.fc1 = nn.Linear(embedding_size, hidden_units)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        # Registered as a submodule but not applied in forward().
        self.batchnorm = nn.BatchNorm1d(hidden_units)
        self.fc3 = nn.Linear(hidden_units, num_classes)

    def forward(self, x):
        """Map a batch of embeddings to per-class scores."""
        hidden = self.dropout(self.relu(self.fc1(x)))
        # Batch normalisation between fc2 and dropout is intentionally disabled.
        hidden = self.relu(self.dropout(self.fc2(hidden)))
        return self.fc3(hidden)

# Run a classifier over a loader and collect its predictions
def evaluate_model_pytorch(model, test_loader):
    """Predict a class for every sample yielded by `test_loader`.

    The model is switched to evaluation mode (and left in it) and gradients
    are disabled while predicting.

    Args:
        model: Module mapping a batch of inputs to per-class scores.
        test_loader: Iterable yielding `(inputs, labels)` batches.

    Returns:
        A `(all_predictions, all_labels)` pair of lists with one NumPy scalar
        per sample, in the order the loader produced them.
    """
    model.eval()
    all_predictions, all_labels = [], []

    with torch.no_grad():
        for batch_inputs, batch_labels in test_loader:
            # Index of the highest score along the class dimension.
            batch_predictions = torch.max(model(batch_inputs), 1).indices
            all_predictions += list(batch_predictions.cpu().numpy())
            all_labels += list(batch_labels.cpu().numpy())

    return all_predictions, all_labels

LEARNING_RATE = 0.01
EPOCHS = 500

clf = EntityClassifier(embedding_size=model.embedding_dim, num_classes=data.n_classes)
optimizer = torch.optim.Adam(clf.parameters(), lr=LEARNING_RATE)
# The scheduler is stepped with the summed epoch loss (mode="min") and scales
# the learning rate by `factor` when that loss plateaus.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.1, patience=10, threshold=0.01
)
criterion = torch.nn.CrossEntropyLoss()

# One (epoch_loss, learning_rate, train_accuracy, test_accuracy) tuple per epoch.
training = []
for epoch in tqdm(range(EPOCHS)):
    clf.train()
    epoch_loss = 0
    # No batch index is needed; iterating directly also avoids rebinding the
    # notebook-level `i` that an earlier cell uses for the query series.
    for embeddings, labels in train_loader:
        outputs = clf(embeddings)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    scheduler.step(epoch_loss)

    # evaluate_model_pytorch returns (predictions, labels); accuracy_score
    # takes (y_true, y_pred), so pass them in that order.
    y_preds, y_true = evaluate_model_pytorch(clf, train_loader)
    train_accuracy = accuracy_score(y_true, y_preds)
    y_preds, y_true = evaluate_model_pytorch(clf, test_loader)
    test_accuracy = accuracy_score(y_true, y_preds)

    # The learning rate is read after the scheduler step, i.e. the value the
    # next epoch will use.
    training.append((epoch_loss, optimizer.param_groups[0]["lr"], train_accuracy, test_accuracy))





100%|██████████| 500/500 [00:04<00:00, 102.75it/s]


In [15]:
from sklearn.metrics import classification_report, accuracy_score
import numpy as np



# Collect the classifier's predictions and the true labels for the test loader
predictions, true_labels = evaluate_model_pytorch(clf, test_loader)

# Build the per-class report, naming classes by the encoder's original labels,
# then print it
report = classification_report(
    true_labels,
    predictions,
    target_names=label_encoder.classes_,
)
print(report)


              precision    recall  f1-score   support

           1       0.89      0.81      0.85       294
           2       1.00      1.00      1.00       292
           3       0.91      1.00      0.95       294
           4       0.82      0.89      0.85       289
           5       1.00      0.95      0.97       298
           6       1.00      0.95      0.98       294
           7       1.00      1.00      1.00       291
           8       0.99      0.99      0.99       293

    accuracy                           0.95      2345
   macro avg       0.95      0.95      0.95      2345
weighted avg       0.95      0.95      0.95      2345



In [16]:
# Only the test accuracies (4th field of each per-epoch tuple) are needed here.
# Reading just that field avoids rebinding `learning_rates`, which the
# embedding-training cell also defines and its plotting cell reads.
test_accuracies = [epoch_record[3] for epoch_record in training]
# Mean over the last 10 epochs, then the best single epoch.
# NOTE(review): the maximum is selected on test data, so it is an optimistic figure.
print(np.mean(test_accuracies[-10:]))
print(np.max(test_accuracies))

0.9488272921108741
0.9641791044776119


In [17]:
import plotly.graph_objects as go

# Unpack the per-epoch classifier history. Classifier-specific names are used
# so the `losses` / `learning_rates` returned by the embedding training are not
# overwritten.
clf_losses, clf_learning_rates, train_accuracies, test_accuracies = zip(*training)
epoch_axis = list(range(len(training)))

# Create a figure
fig = go.Figure()

# Left axis: summed training loss per epoch
fig.add_trace(go.Scatter(x=epoch_axis, y=clf_losses, mode='lines', name='Total Loss'))

# Right axis (y2): learning rate and train/test accuracy
fig.add_trace(go.Scatter(x=epoch_axis, y=clf_learning_rates, mode='lines', name='Learning rate', yaxis='y2'))
fig.add_trace(go.Scatter(x=epoch_axis, y=train_accuracies, mode='lines', name='Train Acc', yaxis='y2'))
fig.add_trace(go.Scatter(x=epoch_axis, y=test_accuracies, mode='lines', name='Test Acc', yaxis='y2'))

# Single layout update so each axis title is set exactly once; the right axis
# label names everything plotted on it.
fig.update_layout(
    title='Losses During Training',
    xaxis_title='Epoch',
    yaxis=dict(title='Loss'),
    yaxis2=dict(title='Learning Rate / Accuracy', overlaying='y', side='right'),
    template='plotly_dark',
)

# Show the figure
fig.show()

### Grid search to find best params

In [30]:
import itertools
import torch
from tqdm import tqdm
from models.embedding_models import MatrixFactorization
from typing import List, Tuple


# Define the hyperparameter grid
# Every combination of the listed values is tried (itertools.product over
# .values()). Key order matters: train_and_evaluate unpacks each combination
# positionally as (embedding_dim, learning_rate, epochs), so enabling either
# commented-out entry also requires extending that unpacking.
hyperparameter_grid = {
    'embedding_dim': [30,50,100],
    'learning_rate': [0.05],
    'epochs': [300],
    # 'regularization_loss_weight': [0.05, 0.1, 0.15],
    # 'pairwise_loss_weight': [0.001, 0.01, 0.1]
}



# Train one matrix-factorisation model for a hyperparameter combination and score it
def train_and_evaluate(hyperparams, data):
    """Fit a MatrixFactorization model and return its evaluation score.

    Args:
        hyperparams: Sequence unpacked positionally as
            `(embedding_dim, learning_rate, epochs)`.
        data: Dataset object providing `X`, `X_train` and `y`.

    Returns:
        Whatever `evaluate_model` returns for the train/test split of the
        learned embeddings.

    NOTE(review): `calculate_losses`, `correlations` and `evaluate_model` are
    read from the notebook namespace but are not defined or imported in this
    cell — confirm they exist on a fresh kernel.
    """
    embedding_dim, learning_rate, epochs = hyperparams

    mf_model = MatrixFactorization(
        n_time_series=data.X.shape[0],
        embedding_dim=embedding_dim,
        normalize=True,
    )
    adam = torch.optim.Adam(mf_model.parameters(), lr=learning_rate)
    plateau_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        adam, mode="min", factor=0.8, patience=10, threshold=0.1
    )

    for _ in range(epochs):
        adam.zero_grad()
        total_loss, pairwise_loss, _regularization_loss = calculate_losses(mf_model, correlations)
        total_loss.backward()
        adam.step()
        # The scheduler tracks the pairwise loss, not the total loss.
        plateau_scheduler.step(pairwise_loss)

    # The first n_train embedding rows are treated as the training samples.
    learned = mf_model.embeddings.weight.detach().numpy()
    n_train = data.X_train.shape[0]

    return evaluate_model(
        learned[:n_train, :],
        learned[n_train:, :],
        data.y[:n_train],
        data.y[n_train:],
        over_sampling=True,
    )

# Enumerate every hyperparameter combination up front so the progress bar
# knows how many runs to expect.
param_combinations = list(itertools.product(*hyperparameter_grid.values()))
total_combinations = len(param_combinations)

# Grid search: score each combination and keep (params, score) pairs
results = []
for params in tqdm(param_combinations, total=total_combinations):
    performance = train_and_evaluate(params, data)
    print("="*20)
    print(params)
    print(performance)
    results.append((params, performance))

# Hyperparameters of the best-scoring run (the earliest one wins ties)
best_params, _best_score = max(results, key=lambda pair: pair[1])



 33%|███▎      | 1/3 [00:00<00:01,  1.71it/s]

(30, 0.05, 300)
0.7


 67%|██████▋   | 2/3 [00:01<00:00,  1.73it/s]

(50, 0.05, 300)
0.8


100%|██████████| 3/3 [00:02<00:00,  1.47it/s]

(100, 0.05, 300)
0.8333333333333334



