In [1]:
%pip install pandas torch transformers scikit-learn numpy "dask[dataframe]"

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [9]:
import pandas as pd
import os
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
import numpy as np
import dask.dataframe as dd

In [2]:
# Cloning the Git Repository

In [4]:
!git clone https://github.com/sarahlawlis/esci-shopping-queries.git

fatal: destination path 'esci-shopping-queries' already exists and is not an empty directory.


### 1. Preprocessing/Preparation of Data

In [None]:
# Load Data and Create DataFrames

In [None]:
# Directory inside the cloned repository that holds the three dataset files
data_dir = os.path.join('..', 'Practicum_Code', 'esci-shopping-queries', 'data')

examples_path = os.path.join(data_dir, 'shopping_queries_dataset_examples.parquet')
products_path = os.path.join(data_dir, 'shopping_queries_dataset_products.parquet')
sources_path = os.path.join(data_dir, 'shopping_queries_dataset_sources.csv')

# Open each file as a Dask DataFrame (two Parquet files, one CSV)
examples = dd.read_parquet(examples_path)
products = dd.read_parquet(products_path)
sources = dd.read_csv(sources_path)


In [None]:
# Merge the examples and products datasets.
# A left join keeps every example row; both frames share the key column names,
# so a single `on=` list is equivalent to identical left_on/right_on lists.
examples_products = dd.merge(
    examples,
    products,
    how='left',
    on=['product_locale', 'product_id']
)

# Filter to only 'us' locale
examples_products = examples_products[examples_products['product_locale'] == 'us']

# Filter for large versions (task_2)
task_2 = examples_products[examples_products['large_version'] == 1]

# Define label mapping (ESCI letter -> integer class id)
label_mapping = {'E': 0, 'S': 1, 'C': 2, 'I': 3}

# Map labels to integers partition by partition.
# `meta` must describe what each partition really returns: mapping through a
# dict of Python ints yields an int64 Series in pandas, so the metadata says
# int64 (the previous 'int32' declaration disagreed with the computed data).
# NOTE(review): a label missing from `label_mapping` would map to NaN and turn
# the computed column into floats - assumes every esci_label is E/S/C/I.
task_2['encoded_labels'] = task_2['esci_label'].map_partitions(
    lambda labels: labels.map(label_mapping),
    meta=('encoded_labels', 'int64')
)

# Split into training and testing sets and materialise each one as a pandas
# DataFrame with .compute()
task_2_train = task_2[task_2['split'] == 'train'].compute()
task_2_test = task_2[task_2['split'] == 'test'].compute()


In [None]:
# Keep only the rows whose product_locale is 'us'
is_us_locale = examples_products['product_locale'] == 'us'
examples_products = examples_products[is_us_locale]

### 2. Fine-Tune Sentence Transformer without Model Training

In [None]:
# Checkpoint names of the two pretrained encoders used in this notebook
bert_checkpoint = "distilbert-base-uncased"
roberta_checkpoint = "distilroberta-base"

# DistilBERT: tokenizer first, then the model
tokenizer_bert = AutoTokenizer.from_pretrained(bert_checkpoint)
model_bert = AutoModel.from_pretrained(bert_checkpoint)

# DistilRoBERTa: tokenizer first, then the model
tokenizer_roberta = AutoTokenizer.from_pretrained(roberta_checkpoint)
model_roberta = AutoModel.from_pretrained(roberta_checkpoint)

In [None]:
# Step 1: Tokenize your data and identify domain-specific vocabulary
def _tokens_missing_from_vocab(text, tokenizer, vocab, skipped_prefix):
    """Return the tokens of `text` that are not keys of `vocab`, ignoring tokens that start with `skipped_prefix`."""
    return {
        token
        for token in tokenizer.tokenize(text)
        if not token.startswith(skipped_prefix) and token not in vocab
    }


def get_domain_specific_vocabulary(texts, tokenizer_bert, tokenizer_roberta):
    """Compare which tokens each tokenizer produces that are absent from its own vocabulary.

    Args:
        texts: Iterable of strings to tokenize. It is consumed in a single pass,
            so a generator works as well as a list.
        tokenizer_bert: Tokenizer exposing `tokenize(text)` and a `vocab` mapping.
            Tokens starting with "##" are ignored.
        tokenizer_roberta: Tokenizer exposing `tokenize(text)` and a `vocab` mapping.
            Tokens starting with "Ġ" are ignored.

    Returns:
        dict with three alphabetically sorted lists of tokens:
            "unique_to_bert": flagged only for `tokenizer_bert`,
            "unique_to_roberta": flagged only for `tokenizer_roberta`,
            "common_mismatches": flagged for both tokenizers.

    NOTE(review): tokens returned by `tokenize` normally come from the
    tokenizer's own vocabulary, so these sets may well come out empty - confirm
    that this membership test is the intended "domain-specific" check.
    NOTE(review): in byte-level BPE vocabularies "Ġ" usually marks a token that
    follows a space rather than a continuation piece - confirm the skip rule
    for the RoBERTa tokenizer.
    """
    # Read each vocabulary once: `vocab` may be a property that rebuilds the
    # whole mapping on every access, which is far too slow inside a per-token loop.
    vocab_bert = tokenizer_bert.vocab
    vocab_roberta = tokenizer_roberta.vocab

    domain_vocab_bert = set()
    domain_vocab_roberta = set()

    for text in texts:
        domain_vocab_bert |= _tokens_missing_from_vocab(text, tokenizer_bert, vocab_bert, "##")
        domain_vocab_roberta |= _tokens_missing_from_vocab(text, tokenizer_roberta, vocab_roberta, "Ġ")

    # Sorted output keeps the lists (and any `[:10]` preview of them) identical
    # from one kernel session to the next; plain set ordering is not stable.
    return {
        "unique_to_bert": sorted(domain_vocab_bert - domain_vocab_roberta),
        "unique_to_roberta": sorted(domain_vocab_roberta - domain_vocab_bert),
        "common_mismatches": sorted(domain_vocab_bert & domain_vocab_roberta)
    }


In [None]:
# Materialise the Dask frame so the title column can be handled with pandas
examples_products_pd = examples_products.compute()

# Product titles as a plain list of strings (missing titles become "")
product_titles = examples_products_pd['product_title']
texts = product_titles.fillna("").tolist()

# Run both tokenizers over the titles and gather the mismatch lists
domain_vocab_mismatches = get_domain_specific_vocabulary(texts, tokenizer_bert, tokenizer_roberta)

# Preview the first ten entries of each mismatch list
mismatch_previews = (
    ("Unique to DistilBERT:", "unique_to_bert"),
    ("Unique to DistilRoBERTa:", "unique_to_roberta"),
    ("Common mismatches:", "common_mismatches"),
)
for preview_label, mismatch_key in mismatch_previews:
    print(preview_label, domain_vocab_mismatches[mismatch_key][:10])


In [None]:
def get_embeddings_for_vocab(vocab, tokenizer, model):
    """Embed each word in `vocab` with `model` and return the vectors keyed by word.

    Each word is tokenized and passed through the model on its own; the hidden
    state at sequence position 0 (the position of the tokenizer's leading
    special token, e.g. [CLS] or <s>) is stored as that word's embedding.

    Args:
        vocab: Iterable of strings to embed.
        tokenizer: Callable as `tokenizer(word, return_tensors="pt")`, returning
            a mapping of keyword name -> tensor accepted by `model`.
        model: torch module whose output exposes `last_hidden_state`.
            It is switched to eval mode as a side effect.

    Returns:
        dict mapping each word to a 1-D NumPy array (the position-0 hidden state).
        An empty `vocab` yields an empty dict.
    """
    model.eval()

    # Run on whatever device the model's weights live on; without this the
    # forward pass fails as soon as the model has been moved off the CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    embeddings = {}
    with torch.no_grad():
        for word in vocab:
            # Tokenize and convert to tensors
            inputs = tokenizer(word, return_tensors="pt")
            if device is not None:
                inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
            outputs = model(**inputs)
            # Position 0 of the sequence, with the batch dimension of size 1 dropped
            first_token_state = outputs.last_hidden_state[:, 0, :].squeeze(0)
            # .cpu() is required before .numpy() when the tensor is on an accelerator
            embeddings[word] = first_token_state.detach().cpu().numpy()
    return embeddings

# Generate embeddings for unique and common mismatches with both models.
# `domain_vocab_mismatches` was already computed from the same `texts` in the
# previous cell, so it is reused here; calling get_domain_specific_vocabulary
# again would tokenize the whole corpus a second time for the same result.

# Embeddings for terms unique to DistilBERT
bert_embeddings = get_embeddings_for_vocab(domain_vocab_mismatches["unique_to_bert"], tokenizer_bert, model_bert)

# Embeddings for terms unique to DistilRoBERTa
roberta_embeddings = get_embeddings_for_vocab(domain_vocab_mismatches["unique_to_roberta"], tokenizer_roberta, model_roberta)

# Embeddings for common mismatches (both models)
common_embeddings_bert = get_embeddings_for_vocab(domain_vocab_mismatches["common_mismatches"], tokenizer_bert, model_bert)
common_embeddings_roberta = get_embeddings_for_vocab(domain_vocab_mismatches["common_mismatches"], tokenizer_roberta, model_roberta)

In [None]:
# Step 3: Use these embeddings to enrich domain knowledge

# Each entry pairs a heading with the embedding dict to preview beneath it
embedding_previews = (
    ("Embeddings unique to DistilBERT:", bert_embeddings),
    ("\nEmbeddings unique to DistilRoBERTa:", roberta_embeddings),
    ("\nCommon mismatches embeddings from DistilBERT:", common_embeddings_bert),
    ("\nCommon mismatches embeddings from DistilRoBERTa:", common_embeddings_roberta),
)

for heading, embedding_table in embedding_previews:
    print(heading)
    # Show the first five words, with the first 10 values of each embedding
    for word, embedding in list(embedding_table.items())[:5]:
        print(f"Word: {word}\nEmbedding: {embedding[:10]}...")

