In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m60.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m45.1 MB/s[0m eta [36m0:00:0

In [None]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np


In [None]:
# Load the raw training data and the raw product catalogue from Drive.
# The 'bert-base-uncased' tokenizer is not created in this cell: the cell that
# loads the preprocessed CSVs already instantiates the identical tokenizer
# before anything uses it, so loading it here as well was redundant work.
train_data_original = pd.read_csv('/content/drive/MyDrive/Programming/Search Ranking/train-v0.3.csv')

product_catalogue_original = pd.read_csv('/content/drive/MyDrive/Programming/Search Ranking/product_catalogue-v0.3.csv')


In [None]:
# Read the preprocessed training set and product catalogue from Drive and
# instantiate the uncased BERT tokenizer used to encode query/title pairs.
preprocessed_train_csv = '/content/drive/MyDrive/Programming/Search Ranking/Data/preprocessed_train_data.csv'
preprocessed_catalogue_csv = '/content/drive/MyDrive/Programming/Search Ranking/Data/preprocessed_product_catalogue.csv'

train_data_original_preprocessed = pd.read_csv(preprocessed_train_csv)
product_catalogue_original_preprocessed = pd.read_csv(preprocessed_catalogue_csv)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [None]:
# Discard every row whose "query" value is missing.
train_data_original_preprocessed = train_data_original_preprocessed.dropna(subset=['query'])

# Keep only the rows whose query_locale is 'us'.
is_us_locale = train_data_original_preprocessed['query_locale'] == 'us'
train_data_english = train_data_original_preprocessed.loc[is_us_locale]

# Work with the first 3000 of those rows.
train_data = train_data_english.iloc[:3000]

# Distinct product ids referenced by that working subset.
unique_product_ids = train_data['product_id'].unique()

# Restrict the catalogue to products that occur in the working subset.
is_referenced_product = product_catalogue_original_preprocessed['product_id'].isin(unique_product_ids)
product_catalogue = product_catalogue_original_preprocessed.loc[is_referenced_product]


In [None]:
# Split the data into train and test sets
# NOTE(review): this splits the raw `train_data_original` frame rather than the
# filtered `train_data` subset, and neither `train_data_major` nor `test_data`
# is used by the cells visible here — confirm this is the intended frame.
train_data_major, test_data = train_test_split(train_data_original, test_size=0.2, random_state=42)


# Merge the train_data and product_catalogue on product_id (left join, so every
# training row is kept). A product_title column left behind by a previous run
# of this cell is dropped first: without that, re-running the cell would yield
# product_title_x / product_title_y and later lookups of 'product_title' fail.
train_data = (
    train_data
    .drop(columns=['product_title'], errors='ignore')
    .merge(product_catalogue[['product_id', 'product_title']], on='product_id', how='left')
)


In [None]:
class RelevanceDataset(Dataset):
    """Query / product-title pairs tokenized for sequence classification.

    Indexing returns a dict with ``input_ids`` and ``attention_mask`` (the
    tokenizer output with its leading batch axis removed) and ``label`` (a
    scalar ``torch.long`` tensor holding the integer-encoded ``esci_label``).

    Args:
        data: DataFrame providing ``query``, ``product_title`` and
            ``esci_label`` columns.
        tokenizer: Callable invoked as ``tokenizer(query, title, **kwargs)``
            that returns a mapping containing ``input_ids`` and
            ``attention_mask`` tensors with a leading batch axis of size 1.
        label_encoder: Optional already-fitted encoder exposing
            ``transform``. Pass the training dataset's ``label_encoder`` when
            building validation/test datasets so every split shares one
            label-to-id mapping. When omitted, a fresh ``LabelEncoder`` is
            fitted on this dataset's own labels (the previous behaviour).
        max_length: Length every pair is padded/truncated to (default 128).
    """

    def __init__(self, data, tokenizer, label_encoder=None, max_length=128):
        # Missing or non-string text would make the tokenizer raise, so NaN
        # becomes an empty string and everything else is coerced to str.
        self.queries = data['query'].fillna('').astype(str).tolist()
        self.products = data['product_title'].fillna('').astype(str).tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

        raw_labels = data['esci_label'].tolist()
        if label_encoder is None:
            # Fit a new encoder on this dataset's labels only.
            self.label_encoder = LabelEncoder()
            self.labels = self.label_encoder.fit_transform(raw_labels)
        else:
            # Reuse the caller's mapping so ids agree across datasets.
            self.label_encoder = label_encoder
            self.labels = self.label_encoder.transform(raw_labels)

    def __len__(self):
        """Return the number of query/product pairs."""
        return len(self.queries)

    def __getitem__(self, idx):
        """Tokenize pair ``idx`` and return its tensors together with its label."""
        # Calling the tokenizer directly replaces the deprecated encode_plus.
        encoded_inputs = self.tokenizer(
            self.queries[idx],
            self.products[idx],
            add_special_tokens=True,
            padding='max_length',
            max_length=self.max_length,
            truncation=True,
            return_tensors='pt',
        )

        return {
            # squeeze(0) removes only the batch axis; a bare squeeze() would
            # also collapse the sequence axis whenever it has size 1.
            'input_ids': encoded_inputs['input_ids'].squeeze(0),
            'attention_mask': encoded_inputs['attention_mask'].squeeze(0),
            'label': torch.tensor(int(self.labels[idx]), dtype=torch.long),
        }


In [None]:

# Seed torch so the shuffle order and the newly initialised classifier head
# are the same on every run of the notebook.
torch.manual_seed(42)

# Split the data into train and validation sets
train_df, val_df = train_test_split(train_data, test_size=0.2, random_state=42)

# Create the datasets and data loaders
train_dataset = RelevanceDataset(train_df, tokenizer)
val_dataset = RelevanceDataset(val_df, tokenizer)

# Each dataset fits its own label encoder. If a class were missing from one
# split, the same label string would map to different ids in train and
# validation and the reported metrics would be silently wrong, so fail loudly.
train_classes = list(train_dataset.label_encoder.classes_)
val_classes = list(val_dataset.label_encoder.classes_)
if train_classes != val_classes:
    raise ValueError(
        f'Label classes differ between splits: train={train_classes}, val={val_classes}'
    )

# The classification head has 4 outputs; a label id outside that range would
# break the loss computation, so check before training starts.
num_labels = 4
if len(train_classes) > num_labels:
    raise ValueError(
        f'Found {len(train_classes)} label classes but the model has {num_labels} outputs'
    )

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# Load the pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# Set up the optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [None]:
# Fine-tune the model with gradient accumulation, validating after each epoch.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
accumulation_steps = 4  # Accumulate gradients over 4 batches
epochs = 5
for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    accumulated_steps = 0  # Mini-batches processed so far in this epoch
    num_train_batches = len(train_loader)

    # Clear gradients once per accumulation window, never per batch: zeroing
    # before every backward pass would discard the gradients being accumulated
    # and leave only the last batch of each window contributing to the update.
    optimizer.zero_grad()

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        train_loss += loss.item()  # Track the unscaled loss for reporting

        # Scale so the summed gradients of a full window average the batches.
        loss = loss / accumulation_steps
        loss.backward()

        accumulated_steps += 1

        # Step after every full window, and once more at the end of the epoch
        # so trailing batches that do not fill a window are still applied.
        window_full = accumulated_steps % accumulation_steps == 0
        last_batch = accumulated_steps == num_train_batches
        if window_full or last_batch:
            optimizer.step()
            optimizer.zero_grad()

    # max(..., 1) avoids a ZeroDivisionError if the loader is empty.
    avg_train_loss = train_loss / max(num_train_batches, 1)

    # Validation loop
    model.eval()
    val_loss = 0.0
    total_preds = []
    total_labels = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

            # Predicted class is the index of the largest logit per example.
            preds = torch.argmax(outputs.logits, dim=1)
            total_preds.extend(preds.cpu().tolist())
            total_labels.extend(labels.cpu().tolist())

    avg_val_loss = val_loss / max(len(val_loader), 1)

    correct = sum(pred == label for pred, label in zip(total_preds, total_labels))
    total = len(total_preds)

    # Report 0.0 instead of dividing by zero when nothing was validated.
    accuracy = correct / total if total else 0.0

    # Print progress and evaluation metrics
    print(f'Epoch {epoch+1}/{epochs} - Train Loss: {avg_train_loss:.4f} - Val Loss: {avg_val_loss:.4f} - Accuracy: {accuracy:.4f}')

# Save the fine-tuned model
model.save_pretrained('bert_relevance_model v2.0')




Epoch 1/5 - Train Loss: 1.1855 - Val Loss: 1.1673 - Accuracy: 0.4083
Epoch 2/5 - Train Loss: 1.1685 - Val Loss: 1.1183 - Accuracy: 0.4181
Epoch 3/5 - Train Loss: 1.1324 - Val Loss: 1.1103 - Accuracy: 0.5159
Epoch 4/5 - Train Loss: 1.0883 - Val Loss: 1.0292 - Accuracy: 0.5183
Epoch 5/5 - Train Loss: 1.0347 - Val Loss: 1.0493 - Accuracy: 0.5379


In [None]:
# Save the fine-tuned model under the Models folder on Drive.
drive_model_dir = '/content/drive/MyDrive/Programming/Search Ranking/Models/bert_search_relevance_modelv2.0'
model.save_pretrained(drive_model_dir)
