In [1]:
import os
import torch
import torchvision
import math
import string
import collections
import json

import pandas as pd
import tensorflow as tf
import numpy as np
import torch.nn as nn
import dask.dataframe as dd

from torch import autocast
from datasets import Dataset
from tokenizers import Tokenizer
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW, Adam
from torch.nn import CosineSimilarity
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report
from sentence_transformers import SentenceTransformer, util, losses, InputExample, SentenceTransformerTrainer
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, get_scheduler, AutoModelForSequenceClassification, PreTrainedModel, PretrainedConfig
from huggingface_hub import login, hf_hub_download


2025-02-07 09:00:29.858695: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1738940429.926553  247445 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1738940430.355562  247445 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-07 09:00:31.074742: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Bi-Encoder

## Open Source Amazon Dataset (ESCI)

In [2]:
examples_path = os.path.join('..', 'data', 'shopping_queries_dataset_examples.parquet')
products_path = os.path.join('..', 'data', 'shopping_queries_dataset_products.parquet')
sources_path = os.path.join('..', 'data', 'shopping_queries_dataset_sources.csv')

examples = pd.read_parquet(examples_path)
products = pd.read_parquet(products_path)
sources = pd.read_csv(sources_path)

In [3]:
examples_products = dd.merge(
    examples,
    products,
    how='left',
    left_on=['product_locale','product_id'],
    right_on=['product_locale', 'product_id']
)

examples_products = examples_products[examples_products['product_locale'] == 'us']

examples_products_small = examples_products[examples_products['small_version'] == 1]
examples_products_large = examples_products[examples_products['large_version'] == 1]


# Remove puncuation 
puncts = string.punctuation
def process_text(text_series, puncts):
    return text_series.apply(lambda text: ''.join(ch for ch in str(text) if ch not in puncts))
examples_products_large['query'] = examples_products_large['query'].map_partitions(process_text, puncts=puncts)

# encoding the esci labels 
label_mapping = {'E': 0, 
                 'S': 1, 
                 'C': 2, 
                 'I': 3}

examples_products_small['encoded_labels'] = examples_products_small['esci_label'].map(label_mapping).astype(int)
examples_products_large['encoded_labels'] = examples_products_large['esci_label'].map(label_mapping).astype(int)

biencoder_train_easy_examples = examples_products_small[examples_products_large['split'] == 'train']
biencoder_test_easy_examples = examples_products_small[examples_products_large['split'] == 'test']

biencoder_train_hard_examples = examples_products_large[examples_products_large['split'] == 'train']
biencoder_test_hard_examples = examples_products_large[examples_products_large['split'] == 'test']

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=('esci_label', 'float64'))

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=('esci_label', 'float64'))



In [4]:
# Select required columns and convert to Pandas for processing

biencoder_train_easy_examples = biencoder_train_easy_examples[['query', 'product_title', 'encoded_labels']].compute()
biencoder_test_easy_examples = biencoder_test_easy_examples[['query', 'product_title', 'encoded_labels']].compute()
biencoder_train_hard_examples = biencoder_train_hard_examples[['query', 'product_title', 'encoded_labels']].compute()
biencoder_test_hard_examples = biencoder_test_hard_examples[['query', 'product_title', 'encoded_labels']].compute()

biencoder_df = examples_products_large[['query', 'product_title', 'encoded_labels']].compute()


  return self.func(*new_argspec)
  return self.func(*new_argspec)


## Custom Sentence Encoder and Dataset

In [5]:
class BiEncoderConfig(PretrainedConfig):
    model_type = "bi-encoder"  

    def __init__(self, encoder_name, num_classes=4, **kwargs):
        super().__init__(**kwargs)
        self.encoder_name = encoder_name
        self.num_classes = num_classes

In [6]:
class BiEncoderWithClassifier(PreTrainedModel):
    config_class = BiEncoderConfig

    def __init__(self, config):
        super().__init__(config)
        self.encoder = AutoModel.from_pretrained(config.encoder_name)
        self.classifier = nn.Linear(self.encoder.config.hidden_size, config.num_classes)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        outputs = self.encoder(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs[1]  # Assuming the encoder returns (sequence_output, pooled_output)
        logits = self.classifier(pooled_output)
        return logits

    def save_pretrained(self, save_directory, **kwargs):
        super().save_pretrained(save_directory, **kwargs)
        self.encoder.save_pretrained(save_directory)
        torch.save(self.classifier.state_dict(), os.path.join(save_directory, "classifier.pt"))

    @classmethod
    def from_pretrained(cls, pretrained_model_name, *model_args, **kwargs):
        config = kwargs.pop("config", None)
        if config is None:
            config = BiEncoderConfig.from_pretrained(config.encoder_name, *model_args, **kwargs)
        model = cls(config)
        model.encoder = AutoModel.from_pretrained(pretrained_model_name, *model_args, **kwargs)
        
        # Download classifier weights from Hugging Face Hub
        classifier_file = hf_hub_download(repo_id=pretrained_model_name, filename="classifier.pt")

        # Load classifier weights
        model.classifier.load_state_dict(torch.load(classifier_file, map_location=torch.device("cpu")))

        return model

In [7]:
class BiEncoderDataset(torch.utils.data.Dataset):
    def __init__(self, samples):
        self.samples = samples

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples.iloc[idx]
        return {
            "query": sample["query"],
            "product": sample["product_title"],
            "label": sample["encoded_labels"]
        }

## Untrained Query-Product Similarities

In [34]:
# Initialize model and tokenizer
model_name = "sentence-transformers/all-distilroberta-v1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = SentenceTransformer(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: RobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [None]:
batch_size = 1024  # Adjust based on available memory

# DataLoader for batching
query_loader = DataLoader(biencoder_df['query'].tolist(), batch_size=batch_size)
title_loader = DataLoader(biencoder_df['product_title'].tolist(), batch_size=batch_size)

In [None]:
# Encode queries and product titles
query_embeddings = []
for batch in tqdm(query_loader, desc="Encoding Queries"):
    batch_embeddings = model.encode(batch, convert_to_tensor=True)
    query_embeddings.append(batch_embeddings)
query_embeddings = torch.cat(query_embeddings, dim=0)

In [None]:
title_embeddings = []
for batch in tqdm(title_loader, desc="Encoding Product Titles"):
    batch_embeddings = model.encode(batch, convert_to_tensor=True)
    title_embeddings.append(batch_embeddings)
title_embeddings = torch.cat(title_embeddings, dim=0)

In [None]:
# Define a function to compute cosine similarity in batches
def compute_similarity_in_batches(query_embeddings, title_embeddings, batch_size=1024):
    scores = []
    for i in tqdm(range(0, len(query_embeddings), batch_size), desc="Computing Cosine Similarity"):
        # Take a batch of query embeddings
        query_batch = query_embeddings[i:i + batch_size]
        # Compute cosine similarity with all title embeddings
        batch_scores = util.cos_sim(query_batch, title_embeddings)
        # Append the diagonal scores (pairwise similarity for corresponding queries and titles)
        scores.append(batch_scores.diagonal().cpu().numpy())
    return torch.cat([torch.tensor(s) for s in scores], dim=0)

In [None]:
# Compute cosine similarity scores in batches
cosine_scores = compute_similarity_in_batches(query_embeddings, title_embeddings, batch_size=512)

In [None]:
# Add predicted scores to the dataframe
biencoder_df['predicted_scores'] = cosine_scores

In [None]:
# Save the results to a CSV file
output_file = "predicted_scores_biencoder.csv"
biencoder_df.to_csv(output_file, index=False)

## Model training

In [8]:
# Transpose the batch to convert from ([queries], [products]) to [(queries, products)]
def collate_fn(batch):
    queries = [example["query"] for example in batch]  # Extract queries
    products = [example["product"] for example in batch]  # Extract products
    labels = [example["label"] for example in batch]  # Extract labels
    return {"queries": queries, "products": products, "labels": labels}



In [9]:
# Tokenize and encode with AutoTokenizer
def tokenize_and_encode(texts, tokenizer, max_length=512):
    return tokenizer(
        texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt"
    )


### Sampling (For testing)

In [None]:
'''
#Use samples for testing training loop

total_rows = biencoder_train.shape[0].compute()

sample_fraction = 10000 / total_rows

biencoder_train_sample = biencoder_train.sample(frac=sample_fraction, random_state=2006)

biencoder_train_sample = biencoder_train_sample.compute()
'''

In [None]:
'''
total_rows2 = biencoder_test.shape[0].compute()

sample_fraction2 = 10000 / total_rows2

biencoder_test_sample = biencoder_test.sample(frac=sample_fraction2, random_state=2006)

biencoder_test_sample = biencoder_test_sample.compute()
'''

In [None]:
'''
# sampling loaders
train_df, dev_df = train_test_split(biencoder_train_sample, test_size=0.1, random_state=2006)
train_df = train_df.reset_index(drop=True)
dev_df = dev_df.reset_index(drop=True)

#print(train_df.head())
#print(dev_df.head())

train_df = BiEncoderDataset(train_df)
dev_df = BiEncoderDataset(dev_df)
'''

### Easy Examples dataset

In [47]:
# Create split for training
easy_train_df, easy_dev_df = train_test_split(biencoder_train_easy_examples, test_size=0.1, random_state=2006)
easy_train_df = easy_train_df.reset_index(drop=True)
easy_dev_df = easy_dev_df.reset_index(drop=True)

# Convert to Custom Dataset
easy_train_df = BiEncoderDataset(easy_train_df)
easy_dev_df = BiEncoderDataset(easy_dev_df)

In [48]:
batch_size = 64

# Create DataLoader
easy_train_loader = DataLoader(
    easy_train_df, 
    batch_size=batch_size, 
    shuffle=True,
    collate_fn=collate_fn,
    pin_memory=True,
    persistent_workers=False
)

easy_dev_loader = DataLoader(
    easy_dev_df,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn,
    pin_memory=True,
    persistent_workers=False
)

In [49]:
# Hyperparameters
epochs = 3
learning_rate = 5e-5

In [50]:
# Initialize model, tokenizer, and optimizer
encoder_name = "sentence-transformers/all-distilroberta-v1"
config = BiEncoderConfig(encoder_name=encoder_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BiEncoderWithClassifier(config).to(device)
tokenizer = AutoTokenizer.from_pretrained(config.encoder_name)

In [51]:
# Define loss function for bi-encoder
optimizer = AdamW(model.parameters(), lr=learning_rate)
num_training_steps = len(easy_train_loader) * epochs

# Add warmup steps 10% of training steps
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=int(0.1 * num_training_steps), num_training_steps=num_training_steps)

In [52]:
# Loss function
cos_sim = CosineSimilarity(dim=1)
loss_fn = torch.nn.CrossEntropyLoss()

In [None]:
# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    with tqdm(total=len(easy_train_loader), desc=f"Epoch {epoch + 1}/{epochs}", unit="batch") as progress_bar:
        for batch in easy_train_loader:
            queries = batch["queries"]
            products = batch["products"]
            labels = torch.tensor(batch["labels"], dtype=torch.long).to(device)

            # Tokenize inputs
            query_inputs = tokenizer(queries, padding=True, truncation=True, return_tensors="pt").to(device)
            product_inputs = tokenizer(products, padding=True, truncation=True, return_tensors="pt").to(device)

            # Forward pass
            query_logits = model(query_inputs["input_ids"], attention_mask=query_inputs["attention_mask"])
            product_logits = model(product_inputs["input_ids"], attention_mask=product_inputs["attention_mask"])
            logits = (query_logits + product_logits) / 2

            # Compute loss
            loss = loss_fn(logits, labels)
            total_loss += loss.item()

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            progress_bar.update(1)
            progress_bar.set_postfix(loss=loss.item())

    avg_loss = total_loss / len(easy_train_loader)
    print(f"Epoch {epoch + 1} - Average Loss: {avg_loss:.4f}")

    # Validation loop
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        with tqdm(total=len(easy_dev_loader), desc="Evaluating", unit="batch") as progress_bar:
            for batch in easy_dev_loader:
                queries = batch["queries"]
                products = batch["products"]
                labels = torch.tensor(batch["labels"], dtype=torch.long).to(device)

                # Tokenize inputs
                query_inputs = tokenizer(queries, padding=True, truncation=True, return_tensors="pt").to(device)
                product_inputs = tokenizer(products, padding=True, truncation=True, return_tensors="pt").to(device)

                # Forward pass
                query_logits = model(query_inputs["input_ids"], attention_mask=query_inputs["attention_mask"])
                product_logits = model(product_inputs["input_ids"], attention_mask=product_inputs["attention_mask"])
                logits = (query_logits + product_logits) / 2

                # Collect predictions and labels
                predictions = torch.argmax(logits, dim=1)
                all_preds.extend(predictions.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

                progress_bar.update(1)

    # Compute metrics
    f1 = f1_score(all_labels, np.round(all_preds), average="micro")
    accuracy = accuracy_score(all_labels, np.round(all_preds))

    print(f"Validation F1: {f1:.4f}, Accuracy: {accuracy:.4f}")


Epoch 1/3: 100%|██████████| 5902/5902 [15:21<00:00,  6.41batch/s, loss=0.916]


Epoch 1 - Average Loss: 1.0522


Evaluating: 100%|██████████| 656/656 [00:35<00:00, 18.54batch/s]


Validation F1: 0.5704, Accuracy: 0.5704


Epoch 2/3:   7%|▋         | 417/5902 [01:04<14:50,  6.16batch/s, loss=1.04] 

## Save trained bi-encoder on easy examples

In [18]:
model.save_pretrained("all-distilroberta-biencoder-esci-v1")
tokenizer.save_pretrained("all-distilroberta-biencoder-esci-v1")

# Define the repository name
repo_name = "twburns/all-distilroberta-biencoder-esci-v1"

# Push to HuggingFace hub
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

classifier.pt:   0%|          | 0.00/13.8k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/twburns/all-distilroberta-biencoder-esci-v1/commit/7048e2190240de4274febbef2f30f41d0700460d', commit_message='Upload tokenizer', commit_description='', oid='7048e2190240de4274febbef2f30f41d0700460d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/twburns/all-distilroberta-biencoder-esci-v1', endpoint='https://huggingface.co', repo_type='model', repo_id='twburns/all-distilroberta-biencoder-esci-v1'), pr_revision=None, pr_num=None)

In [12]:
# Define the repository name
repo_name = "twburns/all-distilroberta-biencoder-esci-v1"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the configuration
config = BiEncoderConfig(encoder_name=repo_name)

# Load the model
model = BiEncoderWithClassifier.from_pretrained(repo_name, config=config).to(device)
#print(model.config._name_or_path)
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(repo_name)

  model.classifier.load_state_dict(torch.load(classifier_file, map_location=torch.device("cpu")))


tokenizer_config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

## Multi-class Classification Using Trained Model (easy examples)

In [13]:
biencoder_test_easy_examples = BiEncoderDataset(biencoder_test_easy_examples)

# Prepare the data (query-product pairs)
query_product_pairs = list(zip(
    [sample["query"] for sample in biencoder_test_easy_examples],  # Extract queries
    [sample["product"] for sample in biencoder_test_easy_examples]  # Extract products
))
actual_labels = [sample["label"] for sample in biencoder_test_easy_examples]  # Extract labels


In [14]:
# Set model to evaluation mode
model.eval()

batch_size = 8
dataloader_easy = torch.utils.data.DataLoader(
    biencoder_test_easy_examples,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn
)

In [33]:
# predictions
predicted_classes = []
actual_labels_list = [] 
probabilities_list = []

with torch.no_grad():
    for batch in tqdm(dataloader_easy, desc="Predicting"):
        batch_queries = batch['queries']
        batch_products = batch['products']
        actual_labels = batch['labels']
        
        # Flatten actual_labels to 1D list
        actual_labels_list.extend(actual_labels)

        # Tokenize queries and products separately
        query_inputs = tokenizer(
            list(batch_queries),
            truncation=True,
            padding=True,
            max_length=256,
            return_tensors="pt"
        ).to(device)

        product_inputs = tokenizer(
            list(batch_products),
            truncation=True,
            padding=True,
            max_length=256,
            return_tensors="pt"
        ).to(device)

        with autocast(device_type="cuda" if torch.cuda.is_available() else "cpu"):
            # Pass queries and products through the model to get embeddings
            query_embeddings = model(**query_inputs)
            product_embeddings = model(**product_inputs)
            
            # Map similarity scores to 4-class logits
            classifier = torch.nn.Linear(1, 4).to(device) 
            
            # Compute cosine similarity and generate probabilities
            similarity_scores = torch.nn.functional.cosine_similarity(query_embeddings, product_embeddings)
            similarity_scores = similarity_scores.unsqueeze(1)  # Shape: (batch_size, 1)
            logits = classifier(similarity_scores)  # (batch_size, 4)
            probabilities = torch.nn.functional.softmax(logits, dim=1)  # Apply softmax across 4 classes

            # Store all 4 probability values
            probabilities_list.extend(probabilities.cpu().tolist())  # Ensure full probabilities are saved

            # Get predictions from the max class
            predictions = torch.argmax(probabilities, dim=1).tolist()
            predicted_classes.extend(predictions)


        torch.cuda.empty_cache()

# Map predictions back to labels
label_mapping = {0: "E", 1: "S", 2: "C", 3: "I"}

# Print classification report
print("Classification Report:")
print(classification_report(
    actual_labels_list,  # True labels
    predicted_classes,  # Predicted labels
    target_names=[label_mapping[i] for i in range(len(label_mapping))]
))

# Compute F1 score
micro_f1 = f1_score(actual_labels_list, predicted_classes, average="micro")
print(f"\nMicro F1 Score: {micro_f1:.4f}")


Predicting: 100%|██████████| 22713/22713 [05:44<00:00, 65.94it/s]


Classification Report:
              precision    recall  f1-score   support

           E       0.44      0.25      0.32     79708
           S       0.35      0.25      0.29     63563
           C       0.04      0.24      0.07      8099
           I       0.18      0.26      0.21     30331

    accuracy                           0.25    181701
   macro avg       0.25      0.25      0.22    181701
weighted avg       0.35      0.25      0.28    181701


Micro F1 Score: 0.2530


In [34]:
probabilities_array = np.array(probabilities_list)  # Convert to NumPy array
print(f"Shape of probabilities_array: {probabilities_array.shape}")  # Should be (181701, 4)
print(f"Top probability_array results: {probabilities_array[:10]}")

Shape of probabilities_array: (181701, 4)
Top probability_array results: [[0.08459408 0.13777988 0.14796066 0.62966537]
 [0.08721893 0.14184707 0.1432391  0.6276949 ]
 [0.08557259 0.13923752 0.14627635 0.62891352]
 [0.11800675 0.18809147 0.09985806 0.59404367]
 [0.12020866 0.1912856  0.09739549 0.59111023]
 [0.09962123 0.16063881 0.12349726 0.61624271]
 [0.08911464 0.14471799 0.1399918  0.62617552]
 [0.11898265 0.18953125 0.09864008 0.59284604]
 [0.33859253 0.21297489 0.37957612 0.06885646]
 [0.3407546  0.20491958 0.38386971 0.07045616]]


In [35]:
be_output_df = pd.DataFrame({
    "query": [pair[0] for pair in query_product_pairs],
    "product_title": [pair[1] for pair in query_product_pairs],
    "actual_label": actual_labels_list,
    "predicted_label": predicted_classes
})

# Ensure probabilities_list is correctly structured
probabilities_array = np.array(probabilities_list)  # Convert to NumPy array for easier reshaping

# Check shape before creating DataFrame
if probabilities_array.shape[1] != 4:
    raise ValueError(f"Expected probabilities array to have shape (_,4), but got {probabilities_array.shape}")

prob_df = pd.DataFrame(probabilities_array, columns=["E_confidence", "S_confidence", "C_confidence", "I_confidence"])

# Concatenate the probability columns
be_output_df = pd.concat([be_output_df, prob_df], axis=1)

be_output_file = "distilbert-ce-esci-test-easy.csv"
be_output_df.to_csv(be_output_file, index=False)
print(f"Predictions saved to {be_output_file}")

Predictions saved to distilbert-ce-esci-test-easy.csv


In [36]:
import matplotlib.pyplot as plt

true_labels = actual_labels  # From the dataset
predicted_labels = predicted_classes  # From the predictions

# Create the confusion matrix
fig, ax = plt.subplots(figsize=(10, 8))
cm = confusion_matrix(true_labels, predicted_labels)

# Define label mapping
label_mapping = {0: "E", 1: "S", 2: "C", 3: "I"}
class_names = [label_mapping[i] for i in range(len(label_mapping))]

# Plot the confusion matrix
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix for Cross-Encoder Model')
plt.show()

NameError: name 'plt' is not defined

## Difficult examples Training


In [None]:
# Create split for training
hard_train_df, hard_dev_df = train_test_split(biencoder_train_hard_examples, test_size=0.1, random_state=2006)
hard_train_df = hard_train_df.reset_index(drop=True)
hard_dev_df = hard_dev_df.reset_index(drop=True)

# Convert to Custom Dataset
hard_train_df = BiEncoderDataset(hard_train_df)
hard_dev_df = BiEncoderDataset(hard_dev_df)

In [None]:
# Create DataLoader
hard_train_loader = DataLoader(
    hard_train_df, 
    batch_size=batch_size, 
    shuffle=True,
    collate_fn=collate_fn,
    pin_memory=True,
    persistent_workers=False
)

hard_dev_loader = DataLoader(
    hard_dev_df,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn,
    pin_memory=True,
    persistent_workers=False
)

In [None]:
# Hyperparameters
epochs = 3
learning_rate = 5e-5


In [None]:
# Initialize model (trained on easy examples), tokenizer, and optimizer
encoder_name = "twburns/all-distilroberta-biencoder-esci-v1"
config = BiEncoderConfig(encoder_name=encoder_name)
model = BiEncoderWithClassifier(config).to(device)
tokenizer = AutoTokenizer.from_pretrained(config.encoder_name)

In [None]:
# Define loss function for bi-encoder
optimizer = AdamW(model.parameters(), lr=learning_rate)
num_training_steps = len(hard_train_loader) * epochs

# Add warmup steps 10% of training steps
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=int(0.1 * num_training_steps), num_training_steps=num_training_steps)

In [None]:
# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    with tqdm(total=len(hard_train_loader), desc=f"Epoch {epoch + 1}/{epochs}", unit="batch") as progress_bar:
        for batch in hard_train_loader:
            queries = batch["queries"]
            products = batch["products"]
            labels = torch.tensor(batch["labels"], dtype=torch.long).to(device)

            # Tokenize inputs
            query_inputs = tokenizer(queries, padding=True, truncation=True, return_tensors="pt").to(device)
            product_inputs = tokenizer(products, padding=True, truncation=True, return_tensors="pt").to(device)

            # Forward pass
            query_logits = model(query_inputs["input_ids"], attention_mask=query_inputs["attention_mask"])
            product_logits = model(product_inputs["input_ids"], attention_mask=product_inputs["attention_mask"])
            logits = (query_logits + product_logits) / 2

            # Compute loss
            loss = loss_fn(logits, labels)
            total_loss += loss.item()

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            progress_bar.update(1)
            progress_bar.set_postfix(loss=loss.item())

    avg_loss = total_loss / len(hard_train_loader)
    print(f"Epoch {epoch + 1} - Average Loss: {avg_loss:.4f}")

    # Validation loop
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        with tqdm(total=len(hard_dev_loader), desc="Evaluating", unit="batch") as progress_bar:
            for batch in hard_dev_loader:
                queries = batch["queries"]
                products = batch["products"]
                labels = torch.tensor(batch["labels"], dtype=torch.long).to(device)

                # Tokenize inputs
                query_inputs = tokenizer(queries, padding=True, truncation=True, return_tensors="pt").to(device)
                product_inputs = tokenizer(products, padding=True, truncation=True, return_tensors="pt").to(device)

                # Forward pass
                query_logits = model(query_inputs["input_ids"], attention_mask=query_inputs["attention_mask"])
                product_logits = model(product_inputs["input_ids"], attention_mask=product_inputs["attention_mask"])
                logits = (query_logits + product_logits) / 2

                # Collect predictions and labels
                predictions = torch.argmax(logits, dim=1)
                all_preds.extend(predictions.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

                progress_bar.update(1)

    # Compute metrics
    f1 = f1_score(all_labels, np.round(all_preds), average="micro")
    accuracy = accuracy_score(all_labels, np.round(all_preds))

    print(f"Validation F1: {f1:.4f}, Accuracy: {accuracy:.4f}")

## Save trained bi-encoder on hard examples

In [None]:
model.save_pretrained("all-distilroberta-biencoder-esci-v2")
tokenizer.save_pretrained("all-distilroberta-biencoder-esci-v2")

# Define the repository name
repo_name = "twburns/all-distilroberta-biencoder-esci-v2"

# Push to HuggingFace hub
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

In [None]:
# Load the configuration
config = BiEncoderConfig.from_pretrained(repo_name)

# Load the model
model = BiEncoderWithClassifier.from_pretrained(repo_name, config=config).to(device)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(repo_name)

## Multi-class Classification Using Trained Model (easy examples)

In [None]:
biencoder_test_hard_examples = BiEncoderDataset(biencoder_test_hard_examples)

# Prepare the data (query-product pairs)
query_product_pairs = list(zip(
    [sample["query"] for sample in biencoder_test_hard_examples],  # Extract queries
    [sample["product"] for sample in biencoder_test_hard_examples]  # Extract products
))
actual_labels = [sample["label"] for sample in biencoder_test_hard_examples]  # Extract labels

In [None]:
# Set model to evaluation mode
model.eval()

batch_size = 8
dataloader_hard = torch.utils.data.DataLoader(
    biencoder_test_hard_examples,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn
)

In [None]:
# predictions
predicted_classes = []
actual_labels_list = [] 

with torch.no_grad():
    for batch in tqdm(dataloader_hard, desc="Predicting"):
        batch_queries = batch['queries']
        batch_products = batch['products']
        actual_labels = batch['labels']
        
        # Flatten actual_labels to 1D list
        actual_labels_list.extend(actual_labels)

        # Tokenize queries and products separately
        query_inputs = tokenizer(
            list(batch_queries),
            truncation=True,
            padding=True,
            max_length=256,
            return_tensors="pt"
        ).to(device)

        product_inputs = tokenizer(
            list(batch_products),
            truncation=True,
            padding=True,
            max_length=256,
            return_tensors="pt"
        ).to(device)

        with autocast(device_type="cuda" if torch.cuda.is_available() else "cpu"):
            # Pass queries and products through the model to get embeddings
            query_embeddings = model(**query_inputs)
            product_embeddings = model(**product_inputs)

            # Compute cosine similarity between query and product embeddings
            similarity_scores = torch.nn.functional.cosine_similarity(query_embeddings, product_embeddings)

            # Classify similarity score into one of the classes (using a simple threshold for example)
            predictions = torch.argmax(similarity_scores.unsqueeze(-1), dim=1).tolist()  # Simulating multi-class
            predicted_classes.extend(predictions)

        torch.cuda.empty_cache()

# Map predictions back to labels
label_mapping = {0: "E", 1: "S", 2: "C", 3: "I"}

# Print classification report
print("Classification Report:")
print(classification_report(
    actual_labels_list,  # True labels
    predicted_classes,  # Predicted labels
    target_names=[label_mapping[i] for i in range(len(label_mapping))]
))

# Compute F1 score
micro_f1 = f1_score(actual_labels_list, predicted_classes, average="micro")
print(f"\nMicro F1 Score: {micro_f1:.4f}")


In [None]:
be_output_df = pd.DataFrame({
    "query": [pair[0] for pair in query_product_pairs],
    "product_title": [pair[1] for pair in query_product_pairs],
    "actual_label": actual_labels_list,
    "predicted_label": predicted_classes
})

be_output_file = "distilbert-ce-esci-test-hard.csv"
be_output_df.to_csv(be_output_file, index=False)
print(f"Predictions saved to {be_output_file}")

In [None]:
true_labels = actual_labels  # From the dataset
predicted_labels = predicted_classes  # From the predictions

# Create the confusion matrix
fig, ax = plt.subplots(figsize=(10, 8))
cm = confusion_matrix(true_labels, predicted_labels)

# Define label mapping
label_mapping = {0: "E", 1: "S", 2: "C", 3: "I"}
class_names = [label_mapping[i] for i in range(len(label_mapping))]

# Plot the confusion matrix
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix for Cross-Encoder Model')
plt.show()