In [1]:
import torch
from transformers import DistilBertModel, DistilBertTokenizer
from transformers import RobertaTokenizer, RobertaModel
from transformers import AutoModel, AutoTokenizer 
import os
import pandas as pd
import dask.dataframe as dd
import numpy as np

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (
Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



# Load Data

In [31]:
examples_path = os.path.join('.', 'esci-shopping-queries/data', 'shopping_queries_dataset_examples.parquet')
products_path = os.path.join('.', 'esci-shopping-queries/data', 'shopping_queries_dataset_products.parquet')
sources_path = os.path.join('.', 'esci-shopping-queries/data', 'shopping_queries_dataset_sources.csv')

examples = dd.read_parquet(examples_path)
products = dd.read_parquet(products_path)
sources = dd.read_csv(sources_path)

In [32]:
examples_products = dd.merge(
    examples,
    products,
    how='left',
    left_on=['product_locale','product_id'],
    right_on=['product_locale', 'product_id']
)

examples_products = examples_products[examples_products['product_locale'] == 'us']

task_2 = examples_products[examples_products['large_version'] == 1]

# another thing that I changed 
# encoding the esci labels 
label_mapping = {'E': 0, 
                 'S': 1, 
                 'C': 2, 
                 'I': 3}

task_2['encoded_labels'] = task_2['esci_label'].map(label_mapping).astype(int)


task_2_train = task_2[task_2['split'] == 'train']
task_2_test = task_2[task_2['split'] == 'test']

In [33]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [34]:
tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')
model = AutoModel.from_pretrained('distilroberta-base').to(device)

for param in model.parameters():
    param.requires_grad = False

def generate_embeddings(texts):
    batch_size = 16  # Adjust this size
    embeddings = []

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        inputs = tokenizer(batch.tolist(), return_tensors='pt', padding=True, truncation=True, max_length=512)
        inputs = {key: value.to(device) for key, value in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        # adjusting this for max pooling 
        batch_embeddings, _ = torch.max(outputs.last_hidden_state, dim=1)
        batch_embeddings = batch_embeddings.cpu().numpy()
        embeddings.append(batch_embeddings)

    return np.vstack(embeddings)

def process_partition(partition):
    query_embeddings = generate_embeddings(partition['query'])
    product_title_embeddings = generate_embeddings(partition['product_title'])

    combined = torch.cat((torch.tensor(query_embeddings), torch.tensor(product_title_embeddings)), dim=1).numpy()
    
    print(f'Combined shape: {combined.shape}')  # Expecting (n, 1536)

    result = pd.DataFrame(combined, index=partition.index, columns=[f'embedding_{i}' for i in range(combined.shape[1])])

    return result



In [35]:
# creating a data frame and enerate column names
meta = pd.DataFrame(columns=[f'embedding_{i}' for i in range(2 * 768)], dtype='float64')

In [36]:
# computes the total number of rows in the df
total_rows = task_2_train.shape[0].compute()

# calculates the fraction of rows needed to sample 10000 
sample_fraction = 10000 / total_rows

# samples a fraction of the df 
task_2_train_sample = task_2_train.sample(frac=sample_fraction, random_state=42)

In [37]:
# replicating the same as above but with the test data 
# computes the total number of rows in the df
total_rows2 = task_2_test.shape[0].compute()

# calculates the fraction of rows needed to sample 32719
sample_fraction2 = 10000 / total_rows2

# samples a fraction of the df 
task_2_test_sample = task_2_test.sample(frac=sample_fraction2, random_state=42)

In [9]:
task_2_train_sample

Unnamed: 0_level_0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,split,product_title,product_description,product_bullet_point,product_brand,product_color,encoded_labels
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
,int64,string,int64,string,string,string,int64,int64,string,string,string,string,string,string,int32
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [38]:
result = task_2_train_sample.map_partitions(process_partition, meta=meta)

In [39]:
result = result.compute()

Combined shape: (10000, 1536)


In [41]:
result2 = task_2_test_sample.map_partitions(process_partition, meta=meta)

result2 = result2.compute()

Combined shape: (10000, 1536)


In [42]:
result

Unnamed: 0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_1526,embedding_1527,embedding_1528,embedding_1529,embedding_1530,embedding_1531,embedding_1532,embedding_1533,embedding_1534,embedding_1535
1322108,0.108705,0.234878,0.139451,0.315206,0.575189,-0.020448,0.148328,0.270037,0.075255,0.128139,...,0.182917,0.098517,0.072425,0.049298,0.214076,0.081951,0.728155,0.395789,0.097055,0.106109
686437,0.128653,0.085391,0.074830,0.089813,0.085738,-0.105572,0.078860,0.257313,0.048456,-0.021896,...,0.269260,0.137719,0.355748,0.081922,0.274871,0.236418,0.301538,0.465827,0.222930,0.247738
2135583,0.111750,0.206983,0.077644,-0.041382,0.817182,0.118343,0.085018,0.270429,0.149541,0.193659,...,0.307988,0.080208,0.200824,0.147451,0.306865,0.176306,0.818602,0.228154,0.178096,0.188407
1566068,0.042544,0.328947,0.044401,0.084752,1.112144,-0.033281,0.045591,0.149777,0.165219,0.013646,...,0.365519,0.049413,0.259212,0.177284,0.178866,0.166594,0.643010,0.622623,0.218156,0.154736
2075274,0.111251,0.145150,0.067874,0.081204,0.890214,-0.038702,0.101804,0.204691,0.066338,-0.024227,...,0.086346,0.180397,0.096318,0.155746,0.262609,0.157427,0.296502,0.315046,0.140594,0.244536
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1656352,0.238125,0.185477,0.054519,0.261465,0.728569,-0.099423,0.199211,0.144787,0.068052,0.287547,...,0.617471,0.175846,0.573132,0.403009,0.477425,0.282736,0.492554,0.791985,0.324376,0.293089
603069,0.063194,0.217883,0.141870,0.170465,0.655924,-0.002001,0.123496,0.135748,0.042142,0.233673,...,0.195427,0.117208,0.421813,0.292528,0.437827,0.301911,0.611398,0.698750,0.273276,0.311094
1815523,0.202807,0.227380,0.123965,0.099034,1.005392,0.480051,0.136926,0.050289,0.104759,0.291602,...,0.215847,0.118625,0.004812,0.085508,0.313142,0.163584,0.459325,0.330234,0.162432,0.273381
1732044,0.020295,0.086303,0.073896,0.338815,0.428603,0.239288,-0.016330,0.142109,0.039531,0.023234,...,0.202253,0.121261,0.296222,0.341769,0.307535,0.098422,0.500768,0.408014,0.281700,0.286708


In [43]:
# creating a subset that only has the indicies and the coresponding label 
task_2_train = task_2_train.compute()
task_2_test = task_2_test.compute()

type(task_2_train)
type(task_2_test)

pandas.core.frame.DataFrame

In [44]:
type(result)

pandas.core.frame.DataFrame

In [45]:
type(result2)

pandas.core.frame.DataFrame

Creating the Multi-Layer Preceptron model

In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import f1_score
from torch.utils.data import DataLoader, Dataset
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [46]:
# building the Multi-Layer Preceptron model
class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.seq = nn.Sequential(
            nn.Linear(input_size, hidden_size),  
            nn.Dropout(0.1),  
            nn.Linear(hidden_size, num_classes)  
        )

    def forward(self, x):
        return self.seq(x)

In [47]:
# inputting the parameters

# the size of the concatenated embeddings(768 + 768)
input_size = 1536  
hidden_size = 128
# number of classes Exact, Substitute, Complement, Irrelevant
num_classes = 4 

# initialize the model, loss, and optimizer
model = MLP(input_size, hidden_size, num_classes).to(device)
criterion = nn.CrossEntropyLoss()

# for the training hyperparameter configuration 
# set the 4 epochs and Adam optimizer with values 
# epsilon (1e-8), learning rate (5e-5) and weight decay (0.01)
optimizer = optim.Adam(model.parameters(), lr=5e-5, eps=1e-8, weight_decay=0.01)


creating the data loader (train/test loader) to pass through the model 

In [48]:
# creating the data loader with the accurate indexes 

# making sure the indicies are in the same data format 
subset_indices = result.index
subset_indices = subset_indices.astype(int)
task_2_train_indices = task_2_train.index.astype(int)

# getting the indexes that were used in the sample group of embeddings
# maing sure they are also in the og training set 
valid_indices = task_2_train_indices[task_2_train_indices.isin(subset_indices)]

# making subset labels which filters the train dataset to get the labels the correspond to the embeddings 
subset_labels = task_2_train.loc[valid_indices, 'encoded_labels'] 
# make it into a dataframe
subset_labels = subset_labels.to_frame()

In [49]:
# doing the same with the test 
# creating the data loader with the accurate indexes 

# making sure the indicies are in the same data format 
subset_indices2 = result2.index
subset_indices2 = subset_indices2.astype(int)
task_2_test_indices = task_2_test.index.astype(int)

# getting the indexes that were used in the sample group of embeddings
# maing sure they are also in the og training set 
valid_indices2 = task_2_test_indices[task_2_test_indices.isin(subset_indices2)]

# making subset labels which filters the train dataset to get the labels the correspond to the embeddings 
subset_labels2 = task_2_test.loc[valid_indices2, 'encoded_labels'] 
# make it into a dataframe
subset_labels2 = subset_labels2.to_frame()

In [50]:
result = result.sort_index()

In [51]:
result2 = result2.sort_index()

In [52]:
print(subset_labels)

         encoded_labels
196                   0
287                   0
795                   0
835                   1
1448                  3
...                 ...
2260438               2
2260686               3
2261045               0
2261125               0
2567053               3

[10000 rows x 1 columns]


In [53]:
print(subset_labels2)

         encoded_labels
410                   0
429                   3
560                   0
580                   0
581                   1
...                 ...
2258965               0
2259219               0
2260170               0
2260571               0
2533306               0

[10000 rows x 1 columns]


In [54]:
print(subset_labels['encoded_labels'].iloc[196])

0


In [55]:
print(subset_labels2['encoded_labels'].iloc[410])

1


In [57]:
print(type(subset_labels))

<class 'pandas.core.frame.DataFrame'>


In [58]:
class ESCIDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings.values
        # this should be (size, 1563)
        print("Shape of embeddings:", self.embeddings.shape)
        self.labels = labels   

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.embeddings[idx], self.labels[idx]

# create DataLoader
train_dataset = ESCIDataset(embeddings=result, labels=subset_labels['encoded_labels'].values)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True) # adjust the batch size as needed 

Shape of embeddings: (10000, 1536)


In [59]:
# making the test loader 
test_dataset = ESCIDataset(embeddings=result2, labels=subset_labels2['encoded_labels'].values)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False) 

Shape of embeddings: (10000, 1536)


In [60]:
print("Length of embeddings:", len(train_dataset.embeddings))
print("Length of labels:", len(train_dataset.labels))

Length of embeddings: 10000
Length of labels: 10000


In [61]:
# these should both be 'numpy.ndarray' or theres a problem 
print("Type of embeddings:", type(train_dataset.embeddings))
print("Type of labels:", type(train_dataset.labels))

Type of embeddings: <class 'numpy.ndarray'>
Type of labels: <class 'numpy.ndarray'>


In [177]:
# look at the samples to double check everythign is looking right 
for i in range(5):  
    embedding, label = train_dataset[i]
    print(f"sample {i} - embedding: {embedding}, label: {label}")

Sample 0 - Embedding: [0.17990594 0.10993975 0.12278097 ... 0.43050417 0.19597918 0.21272719], Label: 0
Sample 1 - Embedding: [0.10828137 0.07075226 0.07835397 ... 0.52852386 0.23825216 0.30547935], Label: 0
Sample 2 - Embedding: [0.21993501 0.3172615  0.04313886 ... 0.7421779  0.23599522 0.31108263], Label: 0
Sample 3 - Embedding: [0.10928485 0.3230517  0.13458526 ... 0.43360886 0.22882652 0.26526317], Label: 1
Sample 4 - Embedding: [0.11612305 0.11765453 0.06042268 ... 0.5264007  0.28494015 0.25032914], Label: 3


In [178]:
# making sure the batch sizes look correct 
for embeddings, labels in train_loader:
    print(f"Batch shape: {embeddings.shape}")  
    print(f"Labels shape: {labels.shape}")  

Batch shape: torch.Size([32, 1536])
Labels shape: torch.Size([32])
Batch shape: torch.Size([32, 1536])
Labels shape: torch.Size([32])
Batch shape: torch.Size([32, 1536])
Labels shape: torch.Size([32])
Batch shape: torch.Size([32, 1536])
Labels shape: torch.Size([32])
Batch shape: torch.Size([32, 1536])
Labels shape: torch.Size([32])
Batch shape: torch.Size([32, 1536])
Labels shape: torch.Size([32])
Batch shape: torch.Size([32, 1536])
Labels shape: torch.Size([32])
Batch shape: torch.Size([32, 1536])
Labels shape: torch.Size([32])
Batch shape: torch.Size([32, 1536])
Labels shape: torch.Size([32])
Batch shape: torch.Size([32, 1536])
Labels shape: torch.Size([32])
Batch shape: torch.Size([32, 1536])
Labels shape: torch.Size([32])
Batch shape: torch.Size([32, 1536])
Labels shape: torch.Size([32])
Batch shape: torch.Size([32, 1536])
Labels shape: torch.Size([32])
Batch shape: torch.Size([32, 1536])
Labels shape: torch.Size([32])
Batch shape: torch.Size([32, 1536])
Labels shape: torch.Size([

In [62]:
# training loop
# set the 4 epochs as defined in the paper 
def train_model(model, train_loader, criterion, optimizer, num_epochs=4):
    model.train()  # set model to training mode
    for epoch in range(num_epochs):
        epoch_loss = 0
        for batch_idx, (embeddings, labels) in enumerate(train_loader):
            embeddings, labels = embeddings.to(device), labels.to(device)

            optimizer.zero_grad()  # Clear previous gradients
            outputs = model(embeddings.float())  # Forward pass
            # converting the labels to long in order to 
            labels = labels.long()
            # calculate the loss 
            loss = criterion(outputs, labels) 
            # backpropogation 
            loss.backward() 
            # updating the weights 
            optimizer.step()  

            # add up the loss 
            epoch_loss += loss.item()  

        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss / len(train_loader):.4f}")

# run the training model with the 10000 samples 
train_model(model, train_loader, criterion, optimizer)


Epoch 1/4, Loss: 0.8834
Epoch 2/4, Loss: 0.8547
Epoch 3/4, Loss: 0.8471
Epoch 4/4, Loss: 0.8434


In [63]:
# evaluation and output the f1 score 
def evaluate_model(test_loader, model):
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.numpy())
            
    # evaluate on the f1 score with micro averages
    return f1_score(all_labels, all_preds, average='micro')

getting preliminalry results 

In [64]:
# Evaluate the model
f1 = evaluate_model(test_loader, model)
print(f'Micro F1 Score: {f1:.4f}')

Micro F1 Score: 0.6519
