In [1]:
import torch
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CESoftmaxAccuracyEvaluator
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sentence_transformers.readers import InputExample
from tqdm import tqdm
from transformers import AdamW, get_scheduler, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import f1_score, accuracy_score
import sys
import os
import pandas as pd
import dask.dataframe as dd
from huggingface_hub import login
sys.path.append(os.path.abspath('..'))
from config import TOKEN_1

In [2]:
examples_path = os.path.join('..', 'data', 'shopping_queries_dataset_examples.parquet')
products_path = os.path.join('..', 'data', 'shopping_queries_dataset_products.parquet')
sources_path = os.path.join('..', 'data', 'shopping_queries_dataset_sources.csv')

examples = dd.read_parquet(examples_path)
products = dd.read_parquet(products_path)
sources = dd.read_csv(sources_path)

In [3]:
examples_products = dd.merge(
    examples,
    products,
    how='left',
    left_on=['product_locale','product_id'],
    right_on=['product_locale', 'product_id']
)

examples_products = examples_products[examples_products['product_locale'] == 'us']

task_2 = examples_products[examples_products['large_version'] == 1]

label_mapping = {'E': 0, 
                 'S': 1, 
                 'C': 2, 
                 'I': 3}

task_2['encoded_labels'] = task_2['esci_label'].map(label_mapping).astype(int)

task_2_train = task_2[task_2['split'] == 'train']
task_2_test = task_2[task_2['split'] == 'test']

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=('esci_label', 'float64'))



In [4]:
task_2_train = task_2_train.compute()
task_2_test = task_2_test.compute()

In [5]:
# total_rows = task_2_train.shape[0].compute()

# sample_fraction = 10000 / total_rows

# task_2_train_sample = task_2_train.sample(frac=sample_fraction, random_state=21)

# task_2_train_sample = task_2_train_sample.compute()

In [6]:
# total_rows2 = task_2_test.shape[0].compute()

# sample_fraction2 = 10000 / total_rows2

# task_2_test_sample = task_2_test.sample(frac=sample_fraction2, random_state=21)

# task_2_test_sample = task_2_test_sample.compute()

In [7]:
def collate_fn(batch):
    # Extract texts and labels from InputExample objects
    texts = [(example.texts[0], example.texts[1]) for example in batch]
    labels = [example.label for example in batch]
    return {"texts": texts, "labels": labels}

In [8]:
def prepare_data(dataset):
    samples = []
    for _, row in dataset.iterrows():
        query = row["query"]
        product = row["product_title"]
        label = int(row["encoded_labels"])
        samples.append(InputExample(texts=[query, product], label=label))
    return samples

train_samples, dev_samples = train_test_split(task_2_train, test_size=0.1, random_state=21)
train_samples = prepare_data(train_samples)
dev_samples = prepare_data(dev_samples)

batch_size = 32
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=batch_size, collate_fn=collate_fn)
dev_dataloader = DataLoader(dev_samples, shuffle=False, batch_size=batch_size, collate_fn=collate_fn)

In [9]:
model_name = "sentence-transformers/all-distilroberta-v1"
model = CrossEncoder(model_name, num_labels=4)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-distilroberta-v1 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
           

In [10]:
print(device)

cuda


In [11]:
print(torch.cuda.get_device_name(0))

Tesla V100-PCIE-32GB


In [12]:
optimizer = AdamW(model.model.parameters(), lr=5e-5)
num_training_steps = len(train_dataloader) * 3  # 3 epochs
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=int(0.1 * num_training_steps), num_training_steps=num_training_steps)

# Loss Function
loss_fn = torch.nn.CrossEntropyLoss()



In [13]:
epochs = 3

for epoch in range(epochs):
    model.model.train()
    total_loss = 0

    with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch + 1}/{epochs}", unit="batch") as progress_bar:
        for batch in train_dataloader:
            sentences = batch["texts"]
            labels = torch.tensor(batch["labels"], dtype=torch.long).to(device)

            inputs = model.tokenizer(
                [pair[0] for pair in sentences],  # Query
                [pair[1] for pair in sentences],  # Product Title
                truncation=True,
                padding=True,
                max_length=512,
                return_tensors="pt",
            ).to(device)

            # forward pass
            outputs = model.model(**inputs)
            logits = outputs.logits

            # compute loss
            loss = loss_fn(logits, labels)
            total_loss += loss.item()

            # backward pass + optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            # update progress bar
            progress_bar.update(1)
            progress_bar.set_postfix(loss=loss.item())

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1} - Average Loss: {avg_loss:.4f}")

    # validation
    model.model.eval()
    all_preds, all_labels = [], []

    # validation Loop
    with torch.no_grad():
        with tqdm(total=len(dev_dataloader), desc="Evaluating", unit="batch") as progress_bar:
            for batch in dev_dataloader:
                sentences = batch["texts"]
                labels = torch.tensor(batch["labels"], dtype=torch.long).to(device)

                # tokenize sentences
                inputs = model.tokenizer(
                    [pair[0] for pair in sentences],
                    [pair[1] for pair in sentences],
                    truncation=True,
                    padding=True,
                    max_length=128,
                    return_tensors="pt",
                ).to(device)

                # forward pass
                outputs = model.model(**inputs)
                logits = outputs.logits
                predictions = torch.argmax(logits, dim=1)

                # collect predictions and labels
                all_preds.extend(predictions.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

                progress_bar.update(1)

    # compute metrics
    f1 = f1_score(all_labels, all_preds, average="micro")
    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Epoch {epoch + 1} - Validation F1: {f1:.4f}, Accuracy: {accuracy:.4f}")

Epoch 1/3: 100%|██████████| 39180/39180 [45:40<00:00, 14.30batch/s, loss=0.606]


Epoch 1 - Average Loss: 0.6130


Evaluating: 100%|██████████| 4354/4354 [01:33<00:00, 46.38batch/s]


Epoch 1 - Validation F1: 0.8023, Accuracy: 0.8023


Epoch 2/3: 100%|██████████| 39180/39180 [45:31<00:00, 14.35batch/s, loss=0.324] 


Epoch 2 - Average Loss: 0.4713


Evaluating: 100%|██████████| 4354/4354 [01:32<00:00, 46.95batch/s]


Epoch 2 - Validation F1: 0.8371, Accuracy: 0.8371


Epoch 3/3: 100%|██████████| 39180/39180 [45:30<00:00, 14.35batch/s, loss=0.328] 


Epoch 3 - Average Loss: 0.3827


Evaluating: 100%|██████████| 4354/4354 [01:32<00:00, 46.94batch/s]


Epoch 3 - Validation F1: 0.8465, Accuracy: 0.8465


In [14]:
output_dir = "trained_crossencoder_model"
model.save(output_dir)

In [15]:
login(token=TOKEN_1)

In [16]:
tokenizer = AutoTokenizer.from_pretrained(output_dir)
trained_model = AutoModelForSequenceClassification.from_pretrained(output_dir)

In [17]:
repo_name = "sllawlis/distilroberta-ce-esci"
trained_model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/sllawlis/distilroberta-ce-esci/commit/991b73f1bba0d7f3e1bc9d503e57e63f2fbce6aa', commit_message='Upload tokenizer', commit_description='', oid='991b73f1bba0d7f3e1bc9d503e57e63f2fbce6aa', pr_url=None, repo_url=RepoUrl('https://huggingface.co/sllawlis/distilroberta-ce-esci', endpoint='https://huggingface.co', repo_type='model', repo_id='sllawlis/distilroberta-ce-esci'), pr_revision=None, pr_num=None)