In [1]:
import pandas as pd
import os
import torch
from transformers import AutoTokenizer, AutoModel

In [2]:
# Clone the repository whose data/ folder holds the dataset files loaded below.
# NOTE(review): later cells read from /content/esci-shopping-queries, so this
# presumably relies on the notebook's working directory being /content — confirm.
!git clone https://github.com/sarahlawlis/esci-shopping-queries.git

Cloning into 'esci-shopping-queries'...
remote: Enumerating objects: 39, done.[K
remote: Counting objects: 100% (39/39), done.[K
remote: Compressing objects: 100% (27/27), done.[K
remote: Total 39 (delta 7), reused 30 (delta 4), pack-reused 0 (from 0)[K
Receiving objects: 100% (39/39), 6.50 KiB | 6.50 MiB/s, done.
Resolving deltas: 100% (7/7), done.
Filtering content: 100% (3/3), 1.08 GiB | 25.96 MiB/s, done.


### 1. Preprocessing/Preparation of Data

In [3]:
# Show the file names present in the cloned repository's data directory
os.listdir(path='/content/esci-shopping-queries/data')

['shopping_queries_dataset_products.parquet',
 'shopping_queries_dataset_sources.csv',
 'shopping_queries_dataset_examples.parquet']

In [4]:
# Examples table (parquet)
examples_df = pd.read_parquet(
    path='/content/esci-shopping-queries/data/shopping_queries_dataset_examples.parquet'
)

# Products table (parquet)
products_df = pd.read_parquet(
    path='/content/esci-shopping-queries/data/shopping_queries_dataset_products.parquet'
)

# Sources table (CSV)
sources_df = pd.read_csv(
    filepath_or_buffer='/content/esci-shopping-queries/data/shopping_queries_dataset_sources.csv'
)


In [5]:
# Left-join the products table onto the examples table. Rows are matched on
# the (product_locale, product_id) pair, which carries the same column names
# in both frames; every example row is kept.
examples_products = examples_df.merge(
    products_df,
    how='left',
    on=['product_locale', 'product_id'],
)

In [6]:
# Keep only the rows whose product_locale is 'us'
examples_products = examples_products.loc[examples_products['product_locale'].eq('us')]

In [7]:
# Restrict to rows flagged large_version == 1, then partition them into
# train and test frames according to the 'split' column
task_2 = examples_products.loc[examples_products['large_version'].eq(1)]
task_2_train = task_2.loc[task_2['split'].eq('train')]
task_2_test = task_2.loc[task_2['split'].eq('test')]

### 2. Modeling: Frozen Base Model with a Classifier Trained on Its Embedding Vectors

In [8]:
# Checkpoint name of the pre-trained BERT-like encoder to use
model_name = "distilbert-base-uncased"

# Fetch the matching tokenizer first, then the encoder itself
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
model = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [9]:
# Freeze the encoder: turn off gradient tracking for every parameter at once.
# The trailing semicolon keeps the cell from echoing the returned module.
model.requires_grad_(False);

In [10]:
# Get Embeddings for Query and Product Title
def get_embeddings(text, tokenizer, model):
    """Embed text with the encoder and mean-pool over the real (non-pad) tokens.

    Args:
        text: Input passed straight to ``tokenizer`` (e.g. a string or a list
            of strings).
        tokenizer: Callable invoked with ``return_tensors="pt"``, padding and
            truncation to 128 tokens; its result is unpacked into ``model``.
        model: Callable whose result exposes ``last_hidden_state`` with shape
            (batch_size, sequence_length, hidden_size).

    Returns:
        A tensor of shape (batch_size, hidden_size) holding one pooled
        embedding per input text.
    """
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)

    # Inference only: no gradients are needed for the encoder pass
    with torch.no_grad():
        outputs = model(**inputs)

    embeddings = outputs.last_hidden_state  # (batch_size, sequence_length, hidden_size)

    attention_mask = inputs.get("attention_mask")
    if attention_mask is None:
        # No mask available: fall back to a plain average over the sequence
        return torch.mean(embeddings, dim=1)

    # padding=True pads shorter texts in a batch; a plain mean would average
    # those pad positions into the embedding, so weight each token by the mask.
    mask = attention_mask.unsqueeze(-1).to(embeddings.dtype)  # (batch, seq, 1)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    pooled_embeddings = (embeddings * mask).sum(dim=1) / token_counts

    return pooled_embeddings


In [11]:
# A sample query / product-title pair to push through the pipeline
query = "wireless headphones"
product_title = "Bluetooth wireless noise-canceling headphones"

# Embed each text separately with the frozen encoder
query_embedding = get_embeddings(query, tokenizer, model)
product_embedding = get_embeddings(product_title, tokenizer, model)

# Join the two embeddings side by side along the feature dimension (dim 1)
combined_embedding = torch.cat([query_embedding, product_embedding], dim=1)


In [13]:
import torch.nn as nn

# Define the classifier
class Classifier(nn.Module):
    """Feed-forward head: Linear -> ReLU -> Dropout -> Linear.

    Args:
        input_size: Number of input features per example.
        hidden_size: Width of the hidden fully connected layer.
        num_classes: Number of output logits per example.
        dropout_rate: Dropout probability applied after the ReLU
            (defaults to 0.1).
    """

    def __init__(self, input_size, hidden_size, num_classes, dropout_rate=0.1):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)  # Fully connected layer
        self.dropout = nn.Dropout(dropout_rate)  # Regularization
        self.fc2 = nn.Linear(hidden_size, num_classes)  # Classification layer (output)

    def forward(self, x):
        """Return the unnormalized class scores (logits) for ``x``."""
        hidden = torch.relu(self.fc1(x))
        return self.fc2(self.dropout(hidden))

# Build the classifier head. Its input width is the sum of the two embedding
# widths, i.e. the width of the concatenated vector.
input_size = query_embedding.shape[1] + product_embedding.shape[1]
hidden_size = 128  # architecture diagram
num_classes = 2  # Binary classification

classifier = Classifier(
    input_size=input_size,
    hidden_size=hidden_size,
    num_classes=num_classes,
)

# Run the example pair through the untrained classifier and show the raw scores
output = classifier(combined_embedding)
print(output)


tensor([[-0.0755, -0.0751]], grad_fn=<AddmmBackward0>)


In [14]:
# Cross-entropy loss; Adam is handed only the classifier's parameters
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=classifier.parameters(), lr=1e-3)

# Toy training loop that repeatedly fits the single example pair built above
# (a real run would iterate over batches of the dataset)
num_epochs = 10
for epoch in range(num_epochs):
    classifier.train()

    # Dummy target (1 if matching, 0 if not matching) - replace with actual labels
    target = torch.tensor([1])

    # Forward pass, then the loss against the target
    outputs = classifier(combined_embedding)
    loss = criterion(outputs, target)

    # Clear old gradients, backpropagate, update the weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")


Epoch [1/10], Loss: 0.7186
Epoch [2/10], Loss: 0.3526
Epoch [3/10], Loss: 0.1715
Epoch [4/10], Loss: 0.0979
Epoch [5/10], Loss: 0.0356
Epoch [6/10], Loss: 0.0278
Epoch [7/10], Loss: 0.0145
Epoch [8/10], Loss: 0.0052
Epoch [9/10], Loss: 0.0060
Epoch [10/10], Loss: 0.0022
