## Setup: Mount Google Drive and Install Dependencies

In [None]:
from google.colab import drive
drive.mount('/content/drive')
!pip3 install pandas
!pip3 install transformers
!pip3 install pyarrow
!pip3 install fastparquet
!pip3 install sentence_transformers
!pip3 install torch
!pip3 install faiss-cpu
!pip3 install openai
!pip3 install nltk
!pip3 install lingua-language-detector

Mounted at /content/drive
Collecting fastparquet
  Downloading fastparquet-2024.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting cramjam>=2.3 (from fastparquet)
  Downloading cramjam-2.8.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m43.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: cramjam, fastparquet
Successfully installed cramjam-2.8.3 fastparquet-2024.2.0
Collecting sentence_transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-

## Get Data Into df_task_1 and Split Into Train/Test

In [None]:
import pandas as pd
import numpy as np

# Read the examples table and the products table from the mounted Drive folder.
df_examples = pd.read_parquet(
    '/content/drive/My Drive/Grainger_Interview/shopping_queries_dataset_examples.parquet'
)
df_products = pd.read_parquet(
    '/content/drive/My Drive/Grainger_Interview/shopping_queries_dataset_products.parquet'
)

# Left-join product columns onto each example row. The key columns carry the
# same names on both sides, so a single `on=` list expresses the join.
df_examples_products = df_examples.merge(
    df_products,
    how='left',
    on=['product_locale', 'product_id'],
)

# Task 1 = rows flagged small_version == 1; then separate by the `split` column.
is_task_1 = df_examples_products["small_version"] == 1
df_task_1 = df_examples_products[is_task_1]

is_train_row = df_task_1["split"] == "train"
is_test_row = df_task_1["split"] == "test"
df_task_1_train = df_task_1[is_train_row]
df_task_1_test = df_task_1[is_test_row]



# Extract one row per (product_id, product_locale).
# `.copy()` detaches the result from df_task_1 so the column assignments made
# here and in later cells write to a real frame instead of a slice (this is
# what triggers pandas' SettingWithCopyWarning otherwise).
unique_products = df_task_1.drop_duplicates(subset=['product_id', 'product_locale']).copy()

# Combine the relevant text fields into a single string per product.
# Missing fields are treated as empty and skipped; calling str() on them would
# inject the literal words "None"/"nan" into the text that is later embedded.
product_text_columns = ['product_title', 'product_description', 'product_bullet_point']
unique_products['combined_text'] = (
    unique_products[product_text_columns]
    .fillna('')
    .astype(str)
    .apply(lambda parts: ' '.join(part for part in parts if part), axis=1)
)

# Extract the distinct (query, query_id) pairs of each split.
# `.copy()` makes these independent frames: later cells add columns to them,
# which on a slice of df_task_1_train / df_task_1_test raises pandas'
# SettingWithCopyWarning.
unique_queries_train = df_task_1_train[['query', 'query_id']].drop_duplicates().copy()
unique_queries_test = df_task_1_test[['query', 'query_id']].drop_duplicates().copy()


# Number of distinct train queries, then distinct test queries.
print(len(unique_queries_train))
print(len(unique_queries_test))

## Preprocessing + Tokenization (Currently Unused)

In [None]:
# import re
# import string
# import pandas as pd
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize
# from nltk.stem import WordNetLemmatizer
# import nltk
# from tqdm import tqdm
# from lingua import Language, LanguageDetectorBuilder
# from sentence_transformers import SentenceTransformer, InputExample, losses
# from torch.utils.data import DataLoader
# import numpy as np
# import faiss

# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

# # Preprocessing function
# def preprocess_text(text):
#     text = text.lower()
#     text = re.sub(f'[{string.punctuation}]', '', text)
#     tokens = word_tokenize(text)
#     stop_words = set(stopwords.words('english'))
#     tokens = [word for word in tokens if word not in stop_words]
#     lemmatizer = WordNetLemmatizer()
#     tokens = [lemmatizer.lemmatize(word) for word in tokens]
#     return ' '.join(tokens)

# languages = [Language.ENGLISH, Language.SPANISH, Language.JAPANESE]
# detector = LanguageDetectorBuilder.from_languages(*languages).build()


# def detect_language(text):
#     language = detector.detect_language_of(text)
#     if language is not None:
#         if language.name == "ENGLISH":
#             return "us"
#         elif language.name == "SPANISH":
#             return "es"
#         elif language.name == "JAPANESE":
#             return "jp"
#     return "us"


# Language detection and tokenization are skipped for now: each "cleaned"
# column is simply the raw text column under a new name.
for frame, raw_column, cleaned_column in (
    (unique_products, 'combined_text', 'cleaned_combined_text'),
    (unique_queries_train, 'query', 'cleaned_query_text'),
    (unique_queries_test, 'query', 'cleaned_query_text'),
):
    frame[cleaned_column] = frame[raw_column]

# Every test query gets the same constant language tag.
# NOTE(review): the commented-out detect_language() above returns "us"/"es"/"jp",
# not "en" — confirm which convention downstream code expects.
unique_queries_test['language'] = "en"


## Train Embedding Model Using Pairs From Train Set

In [None]:
from sentence_transformers import SentenceTransformer, InputExample, losses, models
from torch.utils.data import DataLoader
import torch
import random

# Seed the RNGs imported in this cell so batch shuffling is repeatable.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Start from the pretrained sentence encoder and fine-tune it below.
model = SentenceTransformer('all-MiniLM-L6-v2')


# Attach the query text used for training to every train row.
df_merged = df_task_1_train.merge(
    unique_queries_train[['query_id', 'cleaned_query_text']],
    on='query_id',
    how='left',
)
# Attach the product text. unique_products holds one row per
# (product_id, product_locale), so the join must use BOTH keys: joining on
# product_id alone duplicates a train row for every locale that shares the id
# and pairs the query with another locale's product text.
# `validate` makes pandas raise if the right side is ever not unique on the keys.
df_merged = df_merged.merge(
    unique_products[['product_id', 'product_locale', 'cleaned_combined_text']],
    on=['product_id', 'product_locale'],
    how='left',
    validate='many_to_one',
)


# One (query, product text) pair per train row.
# NOTE(review): every train row is used as a pair, with no filter on any
# relevance label. MultipleNegativesRankingLoss treats each supplied pair as a
# positive match — confirm whether only relevant pairs should be kept.
train_examples = [
    InputExample(texts=[q, p]) for q, p in zip(df_merged['cleaned_query_text'], df_merged['cleaned_combined_text'])
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
train_loss = losses.MultipleNegativesRankingLoss(model=model)


# Fine tune conservatively
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=2,
    warmup_steps=100
)

# Save the trained model
model.save('/content/drive/My Drive/semantic_search_model5')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/24790 [00:00<?, ?it/s]

Iteration:   0%|          | 0/24790 [00:00<?, ?it/s]

## Perform Embedding on All Products and Test Queries

In [None]:
# Encoder settings shared by both encoding passes below.
encode_kwargs = {'batch_size': 64, 'show_progress_bar': True}

# Products (all of them): encode the text column, then store one vector per row.
product_texts = unique_products['cleaned_combined_text'].to_list()
product_embeddings = model.encode(product_texts, **encode_kwargs)
unique_products['product_embedding'] = [vector for vector in product_embeddings]


# Test queries: same procedure on the query text column.
query_texts_test = unique_queries_test['cleaned_query_text'].to_list()
query_embeddings_test = model.encode(query_texts_test, **encode_kwargs)
unique_queries_test['query_embedding'] = [vector for vector in query_embeddings_test]


Batches:   0%|          | 0/13809 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_products['product_embedding'] = list(product_embeddings)


Batches:   0%|          | 0/227 [00:00<?, ?it/s]

## Create Vector Index

In [None]:
import faiss

# Stack the per-row embedding vectors into a single 2-D matrix, one row per
# product, in the same row order as unique_products.
product_embeddings_np = np.vstack(unique_products['product_embedding'].to_numpy())

# Initialize the FAISS index: a flat L2 index sized to the embedding width,
# then load every product vector into it.
dimension = product_embeddings_np.shape[-1]
index = faiss.IndexFlatL2(dimension)
index.add(product_embeddings_np)


## Evaluate Search

In [None]:
import numpy as np
import time

# Map each query_id to the list of product_ids that appear with it in df_task_1.
# NOTE(review): no relevance-label filter is applied here, so every product
# listed for a query counts as "relevant" in the evaluation — confirm intent.
relevant_products = {
    query_id: product_ids.tolist()
    for query_id, product_ids in df_task_1.groupby('query_id')['product_id']
}

# Evaluation function
def evaluate_search(index, unique_queries, relevant_products, unique_products):
    """Score top-10 retrieval for a set of queries with Hits@1/5/10 and MRR.

    Args:
        index: Object exposing ``search(query_matrix, k=10)`` that returns
            ``(distances, positions)``, where ``positions[i]`` holds the row
            positions of the nearest products for query ``i``.
        unique_queries: DataFrame with a ``query_id`` column and a
            ``query_embedding`` column holding one vector per row.
        relevant_products: Mapping from query_id to an iterable of product_ids
            counted as correct for that query. Queries missing from the
            mapping score zero.
        unique_products: DataFrame with a ``product_id`` column whose row order
            matches the order in which vectors were added to ``index``.

    Returns:
        Tuple ``(hits_at_1, hits_at_5, hits_at_10, mrr)``. Each Hits@N is the
        fraction of queries with at least one relevant product in the top N;
        MRR is the mean reciprocal rank of the first relevant product (0 for a
        query with none in the top 10). All four are 0.0 when there are no
        queries.

    Side effects:
        Prints total and per-query wall-clock time (skipped for empty input).
    """
    start_time = time.time()

    num_queries = len(unique_queries)
    if num_queries == 0:
        # Nothing to evaluate; also avoids np.vstack on an empty sequence and
        # the divisions by num_queries below.
        return 0.0, 0.0, 0.0, 0.0

    query_embeddings = np.vstack(unique_queries['query_embedding'].values)
    query_ids = unique_queries['query_id'].values

    # Positional lookup table: row position in the index -> product_id.
    # Built once, instead of materialising a DataFrame slice per query.
    product_ids = unique_products['product_id'].to_numpy()

    _, neighbor_positions = index.search(query_embeddings, k=10)

    hits_at_1 = hits_at_5 = hits_at_10 = 0
    mrr = 0.0

    for query_id, positions in zip(query_ids, neighbor_positions):
        relevant_ids = set(relevant_products.get(query_id, []))
        if not relevant_ids:
            continue

        # FAISS pads missing neighbours with -1. Indexing with -1 would
        # silently select the LAST product, so drop the padding first.
        positions = np.asarray(positions)
        retrieved_product_ids = product_ids[positions[positions >= 0]]

        # Rank (1-based) of the first relevant product, if any.
        first_hit_rank = next(
            (
                rank
                for rank, product_id in enumerate(retrieved_product_ids, start=1)
                if product_id in relevant_ids
            ),
            None,
        )
        if first_hit_rank is None:
            continue

        # A hit within the top N exists exactly when the first hit ranks <= N.
        hits_at_1 += first_hit_rank <= 1
        hits_at_5 += first_hit_rank <= 5
        hits_at_10 += first_hit_rank <= 10
        mrr += 1 / first_hit_rank

    hits_at_1 /= num_queries
    hits_at_5 /= num_queries
    hits_at_10 /= num_queries
    mrr /= num_queries

    end_time = time.time()
    total_time = end_time - start_time
    time_per_query = total_time / num_queries

    print(f'Total time: {total_time:.3f} seconds, Time per query: {time_per_query:.3f} seconds')

    return hits_at_1, hits_at_5, hits_at_10, mrr

# Score retrieval on the held-out test queries and report the four metrics.
test_metrics = evaluate_search(index, unique_queries_test, relevant_products, unique_products)
hits_at_1, hits_at_5, hits_at_10, mrr = test_metrics
print(f'Hits@1: {hits_at_1:.3f}, Hits@5: {hits_at_5:.3f}, Hits@10: {hits_at_10:.3f}, MRR: {mrr:.3f}')


Total time: 149.173 seconds, Time per query: 0.010 seconds
Hits@1: 0.452, Hits@5: 0.663, Hits@10: 0.737, MRR: 0.543
