In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m47.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m57.8 MB/s[0m eta [36m0:00:0

In [2]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm


In [3]:
# Read the two preprocessed CSVs from Drive and fetch the pretrained BERT
# tokenizer. Statement order is unchanged: train data, tokenizer, catalogue.
train_csv_path = '/content/drive/MyDrive/Programming/Search Ranking/Data/preprocessed_train_data.csv'
catalogue_csv_path = '/content/drive/MyDrive/Programming/Search Ranking/Data/preprocessed_product_catalogue.csv'

train_data_original_preprocessed = pd.read_csv(train_csv_path)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

product_catalogue_original_preprocessed = pd.read_csv(catalogue_csv_path)


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [4]:
# Keep only the rows that actually carry a query string
has_query = train_data_original_preprocessed['query'].notna()
train_data_original_preprocessed = train_data_original_preprocessed[has_query]

# Restrict to rows whose query_locale is 'us'
is_us_locale = train_data_original_preprocessed['query_locale'] == 'us'
train_data_english = train_data_original_preprocessed[is_us_locale]

# Small working sample: the first 3000 'us' rows
train_data = train_data_english.iloc[:3000]

# Distinct product ids referenced by that sample
unique_product_ids = train_data['product_id'].unique()

# Catalogue rows for exactly those products
in_sample = product_catalogue_original_preprocessed['product_id'].isin(unique_product_ids)
product_catalogue = product_catalogue_original_preprocessed[in_sample]

# Persist both reduced frames next to the full-size files
train_data.to_csv(
    '/content/drive/MyDrive/Programming/Search Ranking/Data/small_preprocessed_train_data.csv',
    index=False,
)
product_catalogue.to_csv(
    '/content/drive/MyDrive/Programming/Search Ranking/Data/small_preprocessed_product_catalogue1.csv',
    index=False,
)


In [5]:
# Hold out 0.2% of the 'us'-locale rows as the test split; the fixed seed keeps
# the split reproducible across runs.
# NOTE(review): the split is drawn from the same frame the 3000-row sample was
# taken from - confirm the saved model was not trained on these rows.
train_data_major, test_data = train_test_split(
    train_data_english,
    test_size=0.002,
    random_state=42,
)

In [6]:
# Preview the first rows of the held-out test split
test_data.head()


Unnamed: 0,query_id,query,query_locale,product_id,esci_label,product_title
155687,12257,vinyl record,us,B01GGJV5VQ,irrelevant,tom petty heartbreaker greatest hit
459933,12060,unique outdoor lighting,us,B07QCWW1YX,exact,outdoor solar garden decorative light 105 led ...
353089,16449,would choose reasonablypriced 50 short trenchc...,us,B073QH7RMR,irrelevant,alice wonderland nice something made sense sil...
335915,1904,bodycon jacket woman,us,B01L3PCP7Q,irrelevant,chouyatou woman fashion studded perfectly shap...
113941,5702,hotspot wifi device,us,B084BQK3HP,exact,alcatel link zone 4g lte global mw41nf2aofus1 ...


In [7]:
# (rows, columns) of the held-out test split
test_data.shape

(681, 6)

In [8]:
class RelevanceDataset(Dataset):
    """Torch dataset of (query, product title) pairs with integer-encoded labels.

    Each item is tokenized on access as a sentence pair, padded/truncated to
    ``max_length`` tokens.

    Args:
        data: Frame with ``query``, ``product_title`` and ``esci_label`` columns.
        tokenizer: Object exposing ``encode_plus`` (e.g. the BertTokenizer used
            elsewhere in this notebook).
        label_encoder: Optional already-fitted encoder exposing ``transform``
            (and ``inverse_transform`` for decoding predictions). When given,
            it is reused as-is so evaluation data gets the same label -> index
            mapping as training data. When omitted, a fresh ``LabelEncoder`` is
            fit on ``data`` itself, which is the previous behaviour.
        max_length: Token length every example is padded/truncated to.

    Attributes:
        label_encoder: The encoder that produced ``labels``.
        labels: Encoded labels, aligned with ``queries`` / ``products``.
    """

    def __init__(self, data, tokenizer, label_encoder=None, max_length=128):
        self.queries = data['query'].tolist()
        self.products = data['product_title'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

        raw_labels = data['esci_label'].tolist()
        if label_encoder is None:
            # Fit on this frame only: the mapping then depends on which labels
            # happen to be present in ``data``.
            self.label_encoder = LabelEncoder()
            self.labels = self.label_encoder.fit_transform(raw_labels)
        else:
            # Reuse the caller's mapping instead of re-deriving it from this split.
            self.label_encoder = label_encoder
            self.labels = self.label_encoder.transform(raw_labels)

    def __len__(self):
        return len(self.queries)

    def __getitem__(self, idx):
        encoded_inputs = self.tokenizer.encode_plus(
            self.queries[idx],
            self.products[idx],
            add_special_tokens=True,
            padding='max_length',
            max_length=self.max_length,
            truncation=True,
            return_tensors='pt'
        )

        # return_tensors='pt' yields a leading batch axis of size 1; drop only
        # that axis so the sequence axis survives even when it has length 1.
        return {
            'input_ids': encoded_inputs['input_ids'].squeeze(0),
            'attention_mask': encoded_inputs['attention_mask'].squeeze(0),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }


In [9]:
from torch.utils.data import TensorDataset
from tqdm import tqdm

# Define the device (GPU or CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Work on an explicit copy: test_data is a split of train_data_english, and
# assigning columns on such a slice can raise SettingWithCopyWarning.
test_data = test_data.copy()
test_data['query'] = test_data['query'].fillna("").astype(str)

# Tokenization happens per example inside RelevanceDataset (query + product
# title as a pair), so no separate batch encoding of the queries is needed.
# NOTE(review): the dataset fits its label encoder on this split's labels; the
# decoded names only match the model's class indices if training produced the
# same label -> index mapping - confirm.
test_dataset = RelevanceDataset(test_data, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=32)

# Load the trained model and put it on the same device as the input batches;
# without this, a CUDA runtime would feed GPU tensors to a CPU model.
model = BertForSequenceClassification.from_pretrained('/content/drive/MyDrive/Programming/Search Ranking/bert_search_relevance_model_main')
model = model.to(device)

# Set the model to evaluation mode
model.eval()

# Generate predictions (one argmax class index per row)
test_preds = []
progress_bar = tqdm(test_loader, desc="Predicting")

with torch.no_grad():
    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)
        test_preds.extend(preds.cpu().numpy())

# Decode the predicted class indices back to label names
predicted_labels = test_dataset.label_encoder.inverse_transform(test_preds)

# Add the predicted labels to the test data
test_data['predicted_label'] = predicted_labels

# Print the predicted labels and actual labels
print("\nPredicted Labels:")
print(test_data['predicted_label'])
print("\nActual Labels:")
print(test_data['esci_label'])

# Calculate the accuracy by comparing the predicted labels with the actual labels
correct = (test_data['predicted_label'] == test_data['esci_label']).sum()
total = len(test_data)
accuracy = correct / total
print(f"Accuracy: {accuracy:.4f}")


Predicting: 100%|██████████| 22/22 [05:17<00:00, 14.44s/it]


Predicted Labels:
155687    irrelevant
459933    irrelevant
353089    irrelevant
335915    irrelevant
113941         exact
             ...    
615659    irrelevant
241689         exact
602812    irrelevant
385767    substitute
280179    irrelevant
Name: predicted_label, Length: 681, dtype: object

Actual Labels:
155687    irrelevant
459933         exact
353089    irrelevant
335915    irrelevant
113941         exact
             ...    
615659         exact
241689         exact
602812    irrelevant
385767         exact
280179         exact
Name: esci_label, Length: 681, dtype: object
Accuracy: 0.2364





In [10]:
def calculate_ndcg(actual_labels, predicted_labels, k=None):
    """Return dcg(actual, predicted) divided by idcg(actual), or 0.0 if idcg is 0.

    The inputs are flat, position-aligned sequences. Relevance is decided by
    ``dcg``: a position counts as relevant when the predicted label equals the
    actual label there, so the score is a position-discounted match rate over
    the given row order rather than a per-query ranking metric.

    Args:
        actual_labels: Ground-truth labels.
        predicted_labels: Predicted labels, aligned with ``actual_labels``.
        k: If not None, only the first ``k`` entries of both sequences are used.

    Returns:
        The normalized score, or 0.0 when the ideal DCG is zero (e.g. no rows).
    """
    if k is None:
        top_actual, top_predicted = actual_labels, predicted_labels
    else:
        top_actual, top_predicted = actual_labels[:k], predicted_labels[:k]

    achieved = dcg(top_actual, top_predicted)
    ideal = idcg(top_actual)

    return achieved / ideal if ideal != 0 else 0.0


def dcg(labels, predicted_labels):
    """Discounted cumulative gain with binary, match-based relevance.

    Position ``i`` (0-based) contributes ``1 / log2(i + 2)`` when
    ``labels[i] == predicted_labels[i]`` and 0 otherwise.

    Args:
        labels: Ground-truth labels; defines how many positions are scored.
        predicted_labels: Indexable predictions; must cover every index of
            ``labels`` (a shorter sequence raises IndexError).

    Returns:
        The summed discounted gain (0.0 for empty ``labels``).
    """
    def _gain(position, label):
        relevance = 1.0 if label == predicted_labels[position] else 0.0
        return (2 ** relevance - 1) / np.log2(position + 2)

    return sum((_gain(position, label) for position, label in enumerate(labels)), 0.0)


def idcg(labels):
    """Ideal DCG: the score of a prediction that matches at every position.

    With binary match-based relevance every position of a perfect prediction
    contributes ``1 / log2(i + 2)``, so the result depends only on how many
    labels there are. The labels are therefore not sorted or compared, which
    also keeps this working when they are not mutually orderable (e.g. a
    missing value next to strings).

    Args:
        labels: Ground-truth labels; only their count matters.

    Returns:
        Sum of ``1 / log2(i + 2)`` over all positions (0.0 for no labels).
    """
    idcg_score = 0.0
    for position, _ in enumerate(labels):
        idcg_score += 1.0 / np.log2(position + 2)
    return idcg_score


In [11]:
# Score the predictions against the gold labels, in test_data row order
actual_labels = test_data['esci_label'].to_list()
predicted_labels = test_data['predicted_label'].to_list()

ndcg_score = calculate_ndcg(
    actual_labels=actual_labels,
    predicted_labels=predicted_labels,
)
print(f"nDCG Score: {ndcg_score:.4f}")


nDCG Score: 0.2558
