In [None]:
import pandas as pd
import os
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
import numpy as np

In [None]:
# Fetch the dataset repository into the current working directory.
# Later cells read from the absolute path /content/esci-shopping-queries/data,
# so this cell has to run with /content as the working directory
# (presumably the Colab default — confirm when running elsewhere).
# NOTE(review): git refuses to clone into an existing non-empty directory, so
# re-running this cell in the same session is expected to fail.
!git clone https://github.com/sarahlawlis/esci-shopping-queries.git

Cloning into 'esci-shopping-queries'...
remote: Enumerating objects: 105, done.[K
remote: Counting objects: 100% (105/105), done.[K
remote: Compressing objects: 100% (80/80), done.[K
remote: Total 105 (delta 33), reused 65 (delta 13), pack-reused 0 (from 0)[K
Receiving objects: 100% (105/105), 628.00 KiB | 2.35 MiB/s, done.
Resolving deltas: 100% (33/33), done.
Filtering content: 100% (3/3), 1.08 GiB | 28.68 MiB/s, done.


### 1. Preprocessing/Preparation of Data

In [None]:
# Show which dataset files the cloned repository provides
os.listdir(path='/content/esci-shopping-queries/data')

['shopping_queries_dataset_sources.csv',
 'shopping_queries_dataset_products.parquet',
 'shopping_queries_dataset_examples.parquet']

In [None]:
# Read the three dataset files from the cloned repository into DataFrames:
# the examples and products tables are parquet, the sources table is CSV.
examples_df = pd.read_parquet(
    path='/content/esci-shopping-queries/data/shopping_queries_dataset_examples.parquet'
)
products_df = pd.read_parquet(
    path='/content/esci-shopping-queries/data/shopping_queries_dataset_products.parquet'
)
sources_df = pd.read_csv(
    filepath_or_buffer='/content/esci-shopping-queries/data/shopping_queries_dataset_sources.csv'
)


In [None]:
# Attach product attributes to every example row.
# Left join: each example is kept even when no product matches. The join key
# is the (product_locale, product_id) pair, which carries the same column
# names in both frames.
examples_products = examples_df.merge(
    products_df,
    how='left',
    on=['product_locale', 'product_id'],
)

In [None]:
# Keep only the rows whose locale is 'us'.
# The name is rebound to the filtered frame, so the unfiltered merge result is
# no longer reachable afterwards; re-running the cell yields the same rows.
examples_products = examples_products.loc[
    lambda frame: frame['product_locale'] == 'us'
]

### 2. Fine Tune Sentence Transformer without Model Training

In [None]:
# Load the tokenizer and model
# Both are created from the same "bert-base-uncased" checkpoint name, so the
# tokenizer's output is what this model expects as input. Later cells call
# `tokenizer.tokenize(...)` / `tokenizer(...)` and read the model's
# `last_hidden_state`.
# NOTE(review): the section heading talks about a sentence transformer, but the
# checkpoint loaded here is "bert-base-uncased" — confirm this is intended.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")



In [None]:
# Step 1: Tokenize your data and identify domain-specific vocabulary
def get_domain_specific_vocabulary(texts, tokenizer):
    """Collect the words in ``texts`` that the tokenizer cannot keep as one token.

    A WordPiece tokenizer only ever emits pieces taken from its own
    vocabulary (or its unknown token), so checking each emitted piece for
    vocabulary membership can never find anything. Instead, a word counts as
    domain-specific when the tokenizer had to break it into a head piece plus
    one or more ``##`` continuation pieces; the pieces are glued back together
    to recover the word.

    Args:
        texts: Iterable of strings. Empty or ``None`` entries are skipped and
            repeated entries are tokenized only once.
        tokenizer: Object with a ``tokenize(text)`` method that returns a list
            of string pieces and marks continuation pieces with a leading
            ``##``. If it exposes ``unk_token``, pieces equal to it are ignored.

    Returns:
        Sorted list of the unique reassembled words, spelled the way the
        tokenizer emitted them (i.e. after whatever normalisation it applies).
    """
    continuation_prefix = "##"
    unk_token = getattr(tokenizer, "unk_token", None)
    domain_vocab = set()

    # dict.fromkeys drops repeated texts while keeping first-seen order, so a
    # text that occurs on many rows is tokenized a single time.
    for text in dict.fromkeys(texts):
        if not text:
            continue

        word_pieces = []  # pieces of the word currently being reassembled
        for token in tokenizer.tokenize(text):
            if token.startswith(continuation_prefix):
                # Continuation of the current word; ignore a stray one that
                # has no head piece to attach to.
                if word_pieces:
                    word_pieces.append(token[len(continuation_prefix):])
                continue

            # A new word starts here: the previous word is complete, and it is
            # out-of-vocabulary only if it needed more than one piece.
            if len(word_pieces) > 1:
                domain_vocab.add("".join(word_pieces))
            word_pieces = [] if token == unk_token else [token]

        # Flush the last word of the text.
        if len(word_pieces) > 1:
            domain_vocab.add("".join(word_pieces))

    # Sorted so that repeated runs produce the same ordering.
    return sorted(domain_vocab)

In [None]:
# Pull every product title out of the merged frame (missing titles become ""),
# then scan the titles for domain-specific vocabulary.
texts = examples_products['product_title'].fillna("").to_list()
domain_vocab = get_domain_specific_vocabulary(texts=texts, tokenizer=tokenizer)

KeyboardInterrupt: 

In [None]:
# Step 2: Generate embeddings for the domain-specific vocabulary
def get_embeddings_for_vocab(vocab, tokenizer, model):
    """Embed each vocabulary entry with the model's first-position hidden state.

    Every word is tokenized on its own, passed through the model, and the
    hidden state at sequence position 0 (the [CLS] position for BERT-style
    tokenizers) is stored as that word's embedding.

    Args:
        vocab: Iterable of strings to embed.
        tokenizer: Callable used as ``tokenizer(word, return_tensors="pt")``
            that returns a mapping of input name to tensor.
        model: ``torch.nn.Module`` called as ``model(**inputs)`` whose output
            exposes ``last_hidden_state``. It is switched to eval mode as a
            side effect and left that way.

    Returns:
        Dict mapping each word to a 1-D NumPy array. An empty ``vocab`` gives
        an empty dict.
    """
    model.eval()  # inference mode for layers such as dropout

    # Run on whatever device the model lives on so that the function also works
    # after ``model.to("cuda")``; a parameterless model is treated as CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    embeddings = {}
    with torch.no_grad():
        for word in vocab:
            # Tokenize and convert to tensors placed on the model's device
            inputs = tokenizer(word, return_tensors="pt")
            inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
            outputs = model(**inputs)
            # Use the [CLS] token representation as the embedding; .cpu() is a
            # no-op on CPU and is required before .numpy() on an accelerator.
            cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze(0).cpu().numpy()
            embeddings[word] = cls_embedding
    return embeddings

# Embed every word found by the vocabulary scan with the loaded model.
domain_embeddings = get_embeddings_for_vocab(
    vocab=domain_vocab,
    tokenizer=tokenizer,
    model=model,
)

NameError: name 'domain_vocab' is not defined

In [None]:
# Step 3: Use these embeddings to enrich domain knowledge
# Sanity check: print the first five words together with the first ten
# components of each embedding.
for word in list(domain_embeddings)[:5]:
    embedding = domain_embeddings[word]
    print(f"Word: {word}\nEmbedding: {embedding[:10]}...")
