## Reference Videos

In [None]:
# https://www.youtube.com/watch?v=o-pZk5R0TZg

## Imports

In [None]:
# Imports
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

## Load the data and split it into the smaller (Task 1) and larger (Task 2) datasets

In [None]:
# Source code: https://github.com/amazon-science/esci-data/blob/main/README.md

# Load the three dataset files; they are read from the current working directory.
df_examples = pd.read_parquet('shopping_queries_dataset_examples.parquet')
df_products = pd.read_parquet('shopping_queries_dataset_products.parquet')
df_sources = pd.read_csv("shopping_queries_dataset_sources.csv")

# Attach product columns to every example row. The left join keeps example rows
# even when no product matches on (locale, id); their product columns are NaN.
df_examples_products = pd.merge(
    df_examples,
    df_products,
    how='left',
    left_on=['product_locale','product_id'],
    right_on=['product_locale', 'product_id']
)

# Smaller dataset (Task 1): small-version rows for U.S. products that have a title.
# The filter must run on the merged frame: no variable named `df` exists in this
# notebook, so the previous `df[...]` raised NameError on a fresh kernel.
df_task_1 = df_examples_products[
    (df_examples_products["small_version"] == 1) &
    (df_examples_products["product_locale"] == "us") &
    (df_examples_products["product_title"].notnull())
]

# Task 2: larger dataset, split into train / test via the `split` column.
df_task_2 = df_examples_products[df_examples_products["large_version"] == 1]
df_task_2_train = df_task_2[df_task_2["split"] == "train"]
df_task_2_test = df_task_2[df_task_2["split"] == "test"]

## Initial Model: Smaller sample set, only looking at exact matches

In [None]:
# Keep only Task 1 rows that belong to the train split and are labelled 'E'.
# NOTE: this rebinds `df_task_1`, so every later cell sees the narrowed frame.
df_task_1 = df_task_1[
    (df_task_1["split"] == "train") & (df_task_1['esci_label'] == 'E')
]

sample_df = df_task_1.copy()
queries = sample_df['query'].values
products = sample_df['product_title'].values

# TF-IDF vectors: the vocabulary and IDF weights are learned from the queries
# only, then the product titles are projected onto that same vocabulary.
vectorizer = TfidfVectorizer()
query_vecs = vectorizer.fit_transform(queries)
product_vecs = vectorizer.transform(products)

# For each query, print the three product titles with the highest cosine similarity.
for row, query in enumerate(queries):
    scores = cosine_similarity(query_vecs[row], product_vecs).flatten()
    best_three = scores.argsort()[::-1][:3]  # indices in descending score order
    print(f"\nTop matches for: '{query}'")
    for match in best_three:
        print(f"- {products[match]} (score: {scores[match]:.4f})")

## Incorporate Sentence Transformers

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


# Smaller Sample + Combine Product Metadata
sample_df = df_task_1.sample(n=20, random_state=42).copy()

# Concatenate the text fields into one string per product. Every column is read
# from `sample_df` itself: the bullet points were previously taken from
# `train_df`, which is only created in a later cell (NameError on a fresh
# kernel) and is a different sample, so index alignment would produce NaN text.
sample_df['full_product_text'] = (
    sample_df['product_title'].fillna('') + ' ' +
    sample_df['product_description'].fillna('') + ' ' +
    sample_df['product_brand'].fillna('') + ' ' +
    sample_df['product_bullet_point'].fillna('')
)

queries = sample_df['query'].values
products = sample_df['full_product_text'].values

# Use SentenceTransformer for Semantic Embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

query_vecs = model.encode(queries, convert_to_tensor=True)
product_vecs = model.encode(products, convert_to_tensor=True)

# Convert the tensors to NumPy once instead of on every loop iteration.
query_mat = query_vecs.cpu().numpy()
product_mat = product_vecs.cpu().numpy()

# Score, Normalize, Recommend
for i in range(len(queries)):
    scores = cosine_similarity(query_mat[i].reshape(1, -1), product_mat).flatten()
    # Min-max normalize this query's scores to [0, 1]; the 1e-8 term avoids a
    # division by zero when every score is identical.
    scores = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8)

    ranked_indices = scores.argsort()[::-1]
    print(f"\nTop matches for: '{queries[i]}'")
    for idx in ranked_indices[:3]:
        print(f"- {sample_df.iloc[idx]['product_title']} (score: {scores[idx]:.4f})")


## Incorporate training and test

In [None]:
# Build the Task 1 train / test splits used below. They were never defined in
# this notebook, so the cell raised NameError. `df_task_1` cannot be reused
# because an earlier cell narrows it to train-split rows, leaving no test rows;
# the splits are therefore rebuilt from the merged frame.
# NOTE(review): no esci_label filter is applied here - add one if only exact
# matches ('E') should be used as candidates / queries.
task_1_rows = df_examples_products[
    (df_examples_products["small_version"] == 1) &
    (df_examples_products["product_locale"] == "us") &
    (df_examples_products["product_title"].notnull())
]
df_task_1_train = task_1_rows[task_1_rows["split"] == "train"]
df_task_1_test = task_1_rows[task_1_rows["split"] == "test"]

# Candidate products: a fixed-seed sample of train rows, with the text fields
# concatenated into a single string per product.
train_df = df_task_1_train.sample(n=30, random_state=42).copy()
train_df['full_product_text'] = (
    train_df['product_title'].fillna('') + ' ' +
    train_df['product_description'].fillna('') + ' ' +
    train_df['product_brand'].fillna('') + ' ' +
    train_df['product_bullet_point'].fillna('')
)
train_products = train_df['full_product_text'].values
train_titles = train_df['product_title'].values  # for display

# Prepare Test Queries
test_df = df_task_1_test.sample(n=5, random_state=24).copy()
test_queries = test_df['query'].values

# Embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')
product_vecs = model.encode(train_products, convert_to_tensor=True)
query_vecs = model.encode(test_queries, convert_to_tensor=True)

# Convert the tensors to NumPy once instead of on every loop iteration.
product_mat = product_vecs.cpu().numpy()
query_mat = query_vecs.cpu().numpy()

# Recommend Products for Each Query
for i, query in enumerate(test_queries):
    scores = cosine_similarity(query_mat[i].reshape(1, -1), product_mat).flatten()

    # Min-max normalize this query's scores to [0, 1]; the 1e-8 term avoids a
    # division by zero when every score is identical.
    scores = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8)
    ranked_indices = scores.argsort()[::-1]

    print(f"\nTop matches for test query: '{query}'")
    for idx in ranked_indices[:3]:
        print(f"- {train_titles[idx]} (score: {scores[idx]:.4f})")

## Trying to use Recall and MRR (This doesn't work)

In [None]:
# === Filter for US, Exact Matches, Titles Present ===
# The filter runs on the merged frame; no variable named `df` exists in this
# notebook, so the previous `df[...]` raised NameError on a fresh kernel.
df_task_1 = df_examples_products[
    (df_examples_products["small_version"] == 1) &
    (df_examples_products["product_locale"] == "us") &
    (df_examples_products["product_title"].notnull()) &
    (df_examples_products["esci_label"] == "E")
]

# === Full Train and Test Sets ===
df_train = df_task_1[df_task_1["split"] == "train"].copy()
df_test = df_task_1[df_task_1["split"] == "test"].sample(n=20, random_state=24).copy()

# === Build full product metadata ===
# Both frames are independent copies, so adding a column is safe.
for frame in (df_train, df_test):
    frame['full_product_text'] = (
        frame['product_title'].fillna('') + ' ' +
        frame['product_description'].fillna('') + ' ' +
        frame['product_brand'].fillna('')
    )

# === Build the retrieval catalogue ===
# The ground-truth ids come from test rows. With a catalogue built from train
# rows only, a query could score a hit only if its product also happened to
# appear in the train split, so the sampled test products are added here.
# Rows are de-duplicated on product_id so one product cannot occupy several of
# the top-k slots (train rows listed first, so their copy is the one kept).
df_catalog = (
    pd.concat([df_train, df_test])
    .drop_duplicates(subset="product_id")
    .reset_index(drop=True)
)

# === SentenceTransformer Embeddings ===
model = SentenceTransformer('all-MiniLM-L6-v2')
product_vecs = model.encode(df_catalog['full_product_text'].values, convert_to_tensor=True)
product_ids = df_catalog['product_id'].values

query_vecs = model.encode(df_test['query'].values, convert_to_tensor=True)
true_ids = df_test['product_id'].values

# === Evaluate Recall@3 and MRR@3 ===
k = 3
recall_hits = 0
reciprocal_ranks = []

# One similarity matrix (n_queries x n_products) instead of converting the
# tensors and calling cosine_similarity on every iteration.
similarity = cosine_similarity(query_vecs.cpu().numpy(), product_vecs.cpu().numpy())

for i, query in enumerate(df_test['query'].values):
    scores = similarity[i]

    ranked_indices = scores.argsort()[::-1][:k]
    retrieved_ids = product_ids[ranked_indices]

    print(f"\n🔍 Query: {query}")
    print("Top retrieved product IDs:", list(retrieved_ids))
    print("True product ID:", true_ids[i])

    # Positions (0-based) in the top-k list where the true product appears.
    hit_positions = np.flatnonzero(retrieved_ids == true_ids[i])
    if hit_positions.size:
        recall_hits += 1
        reciprocal_ranks.append(1.0 / (hit_positions[0] + 1))
    else:
        reciprocal_ranks.append(0.0)

# Each test row contributes a single true product, so Recall@k here is the
# fraction of test rows whose product appears in the top k.
recall_at_k = recall_hits / len(df_test)
mrr_at_k = np.mean(reciprocal_ranks)

# NOTE: the label below is unchanged; the catalogue is now the train rows plus
# the sampled test products.
print(f"\n✨ Evaluation Results (Full Train Set):")
print(f"Recall@{k}: {recall_at_k:.4f}")
print(f"MRR@{k}:    {mrr_at_k:.4f}")