In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import AdamW
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModelForSequenceClassification, DistilBertModel, DistilBertTokenizer
from sentence_transformers import SentenceTransformer
from sentence_transformers.cross_encoder import CrossEncoder
import os
import pandas as pd
import pyarrow
import dask.dataframe as dd
import numpy as np
from collections import Counter
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

  torch.utils._pytree._register_pytree_node(
Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
# Each dataset file lives under ../data relative to the notebook's working
# directory; build its path and open it as a dask collection right away.

# Query/product judgement examples (parquet).
examples_path = os.path.join('..', 'data', 'shopping_queries_dataset_examples.parquet')
examples = dd.read_parquet(examples_path)

# Product catalogue (parquet).
products_path = os.path.join('..', 'data', 'shopping_queries_dataset_products.parquet')
products = dd.read_parquet(products_path)

# Query sources (CSV).
sources_path = os.path.join('..', 'data', 'shopping_queries_dataset_sources.csv')
sources = dd.read_csv(sources_path)

In [3]:
# Left-join the product attributes onto every example, matching on both the
# locale and the product id so ids are never matched across locales.
examples_products = examples.merge(
    products,
    how='left',
    left_on=['product_locale', 'product_id'],
    right_on=['product_locale', 'product_id'],
)

# Keep only the 'us' locale rows.
examples_products = examples_products[examples_products['product_locale'] == 'us']

# Integer class id for each ESCI label letter
# (presumably Exact / Substitute / Complement / Irrelevant - TODO confirm).
label_mapping = {'E': 0, 'S': 1, 'C': 2, 'I': 3}

# Rows flagged large_version == 1 form the task_2 working set; attach the
# integer-encoded label as a new column.
task_2 = examples_products[examples_products['large_version'] == 1]
task_2 = task_2.assign(
    encoded_labels=task_2['esci_label'].map(label_mapping).astype(int)
)

# Split according to the dataset's own 'split' column.
task_2_train = task_2[task_2['split'] == 'train']
task_2_test = task_2[task_2['split'] == 'test']

In [4]:
class ESCIDatasetForCrossEncoder(Dataset):
    """Map-style dataset yielding (query, product title) pairs with a label.

    The wrapped frame must provide the columns ``query``, ``product_title``
    and ``encoded_labels``.
    """

    def __init__(self, dataframe):
        """Store ``dataframe`` with a fresh 0..n-1 index.

        Resetting the index makes positional lookups line up with the
        integer indices a DataLoader sampler produces.
        """
        self.dataframe = dataframe.reset_index(drop=True)

    def __len__(self):
        """Return the number of rows (samples) in the wrapped frame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the sample at position ``idx``.

        Returns:
            dict with ``"texts"`` -> ``[query, product_title]`` and
            ``"label"`` -> the row's ``encoded_labels`` value.
        """
        record = self.dataframe.iloc[idx]
        text_pair = [record["query"], record["product_title"]]
        return {"texts": text_pair, "label": record["encoded_labels"]}

In [5]:
# Draw roughly 10,000 training rows. dask samples by fraction rather than by
# an absolute count, so the target count is converted to a fraction first;
# the resulting row count is therefore approximate.
total_rows = task_2_train.shape[0].compute()

# Clamp to 1.0: if the split holds fewer than 10,000 rows the raw ratio would
# exceed 1, which is not a valid fraction for sampling without replacement.
sample_fraction = min(1.0, 10000 / total_rows)

# Fixed random_state keeps the drawn sample the same across runs.
task_2_train_sample = task_2_train.sample(frac=sample_fraction, random_state=42)

# Materialise the lazy dask sample as an in-memory pandas DataFrame.
task_2_train_sample = task_2_train_sample.compute()

In [6]:
# Draw roughly 10,000 test rows. dask samples by fraction rather than by an
# absolute count, so the target count is converted to a fraction first; the
# resulting row count is therefore approximate.
total_rows2 = task_2_test.shape[0].compute()

# Clamp to 1.0: if the split holds fewer than 10,000 rows the raw ratio would
# exceed 1, which is not a valid fraction for sampling without replacement.
sample_fraction2 = min(1.0, 10000 / total_rows2)

# Fixed random_state keeps the drawn sample the same across runs.
task_2_test_sample = task_2_test.sample(frac=sample_fraction2, random_state=42)

# Materialise the lazy dask sample as an in-memory pandas DataFrame.
task_2_test_sample = task_2_test_sample.compute()

In [7]:
# Wrap both sampled pandas frames in the torch Dataset adapter.
train_dataset, test_dataset = (
    ESCIDatasetForCrossEncoder(frame)
    for frame in (task_2_train_sample, task_2_test_sample)
)

In [8]:
def custom_collate_fn(batch):
    """Merge a list of samples into a single batch dictionary.

    Args:
        batch: list of dicts, each holding a ``"texts"`` entry and a
            ``"label"`` entry.

    Returns:
        dict with ``"texts"`` -> list of every sample's ``"texts"`` value and
        ``"label"`` -> list of every sample's ``"label"`` value, both in
        batch order. Nothing is converted to tensors here.
    """
    collated = {"texts": [], "label": []}
    for sample in batch:
        collated["texts"].append(sample["texts"])
        collated["label"].append(sample["label"])
    return collated

# NOTE(review): the progress bars recorded in this notebook show 313 steps per
# epoch, which matches a batch size of 32 on ~10,000 rows rather than 1024.
# Confirm which value is intended before re-running.
batch_size = 1024

# Both loaders must share custom_collate_fn so every batch has the same
# {"texts": [[query, title], ...], "label": [...]} structure. Without it the
# DataLoader falls back to its default collation, which regroups the text
# pairs and converts the labels, so test batches would not match train batches.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=custom_collate_fn,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,  # keep evaluation order stable
    collate_fn=custom_collate_fn,
)

In [10]:
# Prefer the GPU when one is visible to torch, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Cross-encoder with a 4-way classification head (one output per entry in
# label_mapping).
model = CrossEncoder('sentence-transformers/all-distilroberta-v1', num_labels=4)

# Move the wrapped network onto the chosen device, then build the optimizer
# over that same network's parameters.
model.model.to(device)
optimizer = AdamW(model.model.parameters(), lr=5e-5)

# def custom_collate_fn(batch):
#     texts = [item["texts"] for item in batch]
#     labels = [item["label"] for item in batch]
#     return {"texts": texts, "label": labels}

# train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=custom_collate_fn)

def train_model(model, dataloader, epochs=3):
    """Fine-tune ``model.model`` with a manual training loop.

    Besides its arguments, this function reads two module-level names that
    must already be defined: ``optimizer`` (steps the parameters of
    ``model.model``) and ``device`` (where inputs and labels are moved).

    Args:
        model: object exposing ``.tokenizer`` (called on the batch texts) and
            ``.model`` (called with the tokenized inputs plus ``labels``; its
            output must provide ``.loss``).
        dataloader: sized iterable of batches shaped like the output of
            ``custom_collate_fn``: ``{"texts": [...], "label": [...]}``.
        epochs: number of full passes over ``dataloader``.

    Raises:
        ValueError: if ``dataloader`` contains no batches.
    """
    # Fail early with a clear message instead of dividing by zero when the
    # per-epoch average loss is reported.
    num_batches = len(dataloader)
    if num_batches == 0:
        raise ValueError("dataloader is empty; nothing to train on")

    model.model.train()

    for epoch in range(epochs):
        total_loss = 0.0

        # using tqdm for ipynb progress bars
        for batch in tqdm(dataloader, desc=f"Epoch {epoch + 1}"):
            # batch sentences as list of lists
            sentences = batch["texts"]
            # Class-index targets must be int64 for the classification loss;
            # forcing the dtype avoids depending on the platform's default
            # integer width for the encoded labels.
            labels = torch.tensor(batch["label"], dtype=torch.long).to(device)

            inputs = model.tokenizer(
                sentences,
                padding=True,
                truncation=True,
                return_tensors="pt",
                max_length=128  # Adjust based on your data
            ).to(device)

            # Passing labels makes the network compute the loss itself.
            outputs = model.model(**inputs, labels=labels)
            loss = outputs.loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch + 1} completed. Average Loss: {total_loss / num_batches:.4f}")

# Run three fine-tuning epochs over the sampled training batches.
train_model(model=model, dataloader=train_dataloader, epochs=3)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-distilroberta-v1 and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1:   0%|          | 0/313 [00:00<?, ?it/s]

Epoch 1 completed. Average Loss: 0.7919


Epoch 2:   0%|          | 0/313 [00:00<?, ?it/s]

Epoch 2 completed. Average Loss: 0.6424


Epoch 3:   0%|          | 0/313 [00:00<?, ?it/s]

Epoch 3 completed. Average Loss: 0.4811
