In [1]:
import torch
from transformers import DistilBertModel, DistilBertTokenizer
from transformers import RobertaTokenizer, RobertaModel
from transformers import AutoModel, AutoTokenizer 
import os
import pandas as pd
import dask.dataframe as dd
import numpy as np

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (
Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



# Load Data

In [2]:
examples_path = os.path.join('.', 'esci-shopping-queries/data', 'shopping_queries_dataset_examples.parquet')
products_path = os.path.join('.', 'esci-shopping-queries/data', 'shopping_queries_dataset_products.parquet')
sources_path = os.path.join('.', 'esci-shopping-queries/data', 'shopping_queries_dataset_sources.csv')

examples = dd.read_parquet(examples_path)
products = dd.read_parquet(products_path)
sources = dd.read_csv(sources_path)

In [3]:
examples_products = dd.merge(
    examples,
    products,
    how='left',
    left_on=['product_locale','product_id'],
    right_on=['product_locale', 'product_id']
)

examples_products = examples_products[examples_products['product_locale'] == 'us']

task_2 = examples_products[examples_products['large_version'] == 1]

# another thing that I changed 
# encoding the esci labels 
label_mapping = {'E': 0, 
                 'S': 1, 
                 'C': 2, 
                 'I': 3}

task_2['encoded_labels'] = task_2['esci_label'].map(label_mapping).astype(int)


task_2_train = task_2[task_2['split'] == 'train']
task_2_test = task_2[task_2['split'] == 'test']

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')
model = AutoModel.from_pretrained('distilroberta-base').to(device)

for param in model.parameters():
    param.requires_grad = False

def generate_embeddings(texts):
    batch_size = 16  # Adjust this size
    embeddings = []

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        inputs = tokenizer(batch.tolist(), return_tensors='pt', padding=True, truncation=True, max_length=512)
        inputs = {key: value.to(device) for key, value in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        # adjusting this for max pooling 
        batch_embeddings, _ = torch.max(outputs.last_hidden_state, dim=1)
        batch_embeddings = batch_embeddings.cpu().numpy()
        embeddings.append(batch_embeddings)

    return np.vstack(embeddings)

def process_partition(partition):
    query_embeddings = generate_embeddings(partition['query'])
    product_title_embeddings = generate_embeddings(partition['product_title'])

    combined = torch.cat((torch.tensor(query_embeddings), torch.tensor(product_title_embeddings)), dim=1).numpy()
    
    print(f'Combined shape: {combined.shape}')  # Expecting (n, 1536)

    result = pd.DataFrame(combined, index=partition.index, columns=[f'embedding_{i}' for i in range(combined.shape[1])])

    return result



In [6]:
# creating a data frame and enerate column names
meta = pd.DataFrame(columns=[f'embedding_{i}' for i in range(2 * 768)], dtype='float64')

In [7]:
# computes the total number of rows in the df
total_rows = task_2_train.shape[0].compute()

# calculates the fraction of rows needed to sample 10000 
sample_fraction = 10000 / total_rows

# samples a fraction of the df 
task_2_train_sample = task_2_train.sample(frac=sample_fraction, random_state=42)

In [88]:
task_2_train_sample

Unnamed: 0_level_0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,split,product_title,product_description,product_bullet_point,product_brand,product_color,encoded_labels
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
,int64,string,int64,string,string,string,int64,int64,string,string,string,string,string,string,int32
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [89]:
result = task_2_train_sample.map_partitions(process_partition, meta=meta)

In [90]:
result = result.compute()

Combined shape: (10000, 1536)


In [91]:
result

Unnamed: 0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_1526,embedding_1527,embedding_1528,embedding_1529,embedding_1530,embedding_1531,embedding_1532,embedding_1533,embedding_1534,embedding_1535
1322108,0.108705,0.234877,0.139450,0.315206,0.575190,-0.020448,0.148328,0.270037,0.075255,0.128139,...,0.182917,0.098517,0.072425,0.049298,0.214076,0.081951,0.728155,0.395790,0.097055,0.106109
686437,0.128653,0.085390,0.074830,0.089813,0.085738,-0.105572,0.078860,0.257313,0.048456,-0.021896,...,0.269261,0.137719,0.355748,0.081922,0.274871,0.236418,0.301538,0.465826,0.222930,0.247738
2135583,0.111750,0.206983,0.077643,-0.041382,0.817184,0.118343,0.085018,0.270429,0.149541,0.193659,...,0.307988,0.080208,0.200823,0.147451,0.306865,0.176306,0.818601,0.228154,0.178096,0.188407
1566068,0.042544,0.328948,0.044401,0.084752,1.112144,-0.033281,0.045591,0.149777,0.165218,0.013646,...,0.365519,0.049413,0.259212,0.177283,0.178866,0.166594,0.643011,0.622622,0.218156,0.154736
2075274,0.111251,0.145150,0.067874,0.081204,0.890214,-0.038702,0.101804,0.204692,0.066338,-0.024227,...,0.086346,0.180397,0.096318,0.155747,0.262609,0.157427,0.296501,0.315046,0.140594,0.244535
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1656352,0.238125,0.185478,0.054519,0.261466,0.728569,-0.099423,0.199211,0.144787,0.068051,0.287547,...,0.617471,0.175846,0.573133,0.403009,0.477426,0.282736,0.492554,0.791985,0.324375,0.293089
603069,0.063194,0.217883,0.141870,0.170465,0.655925,-0.002001,0.123496,0.135748,0.042142,0.233673,...,0.195427,0.117208,0.421813,0.292528,0.437827,0.301911,0.611399,0.698749,0.273277,0.311094
1815523,0.202807,0.227380,0.123965,0.099034,1.005392,0.480051,0.136926,0.050289,0.104759,0.291602,...,0.215847,0.118625,0.004812,0.085508,0.313142,0.163584,0.459325,0.330234,0.162431,0.273381
1732044,0.020294,0.086303,0.073896,0.338815,0.428603,0.239287,-0.016330,0.142109,0.039532,0.023233,...,0.202253,0.121261,0.296222,0.341769,0.307535,0.098423,0.500768,0.408014,0.281700,0.286709


Creating the Multi-Layer Preceptron model

In [93]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import f1_score
from torch.utils.data import DataLoader, Dataset
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [95]:
# building the Multi-Layer Preceptron model
class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.seq = nn.Sequential(
            nn.Linear(input_size, hidden_size),  
            nn.Dropout(0.1),  
            nn.Linear(hidden_size, num_classes)  
        )

    def forward(self, x):
        return self.seq(x)

In [96]:
# inputting the parameters

# the size of the concatenated embeddings(768 + 768)
input_size = 1536  
hidden_size = 128
# number of classes Exact, Substitute, Complement, Irrelevant
num_classes = 4 

# initialize the model, loss, and optimizer
model = MLP(input_size, hidden_size, num_classes).to(device)
criterion = nn.CrossEntropyLoss()

# for the training hyperparameter configuration 
# set the 4 epochs and Adam optimizer with values 
# epsilon (1e-8), learning rate (5e-5) and weight decay (0.01)
optimizer = optim.Adam(model.parameters(), lr=5e-5, eps=1e-8, weight_decay=0.01)


creating the data loader (train/test loader) to pass through the model 

In [None]:
class EmbeddingDataset(Dataset):
    def __init__(self, embeddings_df, labels_df, embedding_columns):
        self.embeddings_df = embeddings_df
        self.labels_df = labels_df
        self.embedding_columns = embedding_columns
    
    def __len__(self):
        return len(self.labels_df)
    
    def __getitem__(self, idx):
        # Get the embedding row as a numpy array
        embedding = self.embeddings_df.loc[idx, self.embedding_columns].values
        
        # Get the corresponding label
        label = self.labels_df.loc[idx, 'encoded_labels']
        
        # Convert embedding to torch tensor
        embedding_tensor = torch.tensor(embedding, dtype=torch.float32)
        label_tensor = torch.tensor(label, dtype=torch.long)
        
        return embedding_tensor, label_tensor

In [None]:
# Prepping the data
embeddings = []
labels = []

# Iterate through the result DataFrame which contains embeddings
for i, row in result.iterrows():
    # Get the embedding from the current row
    embedding = row[embedding_columns].values  
    embeddings.append(embedding)
    
    # Get the label by using the 'index' column instead of idx
    index_value = row['index']  # Access the value in the 'index' column
    label = task_2_train_sample.loc[task_2_train_sample['index'] == index_value, 'encoded_labels'].values[0]
    labels.append(label)

# Convert lists to tensors
embeddings_tensor = torch.tensor(embeddings, dtype=torch.float32)
labels_tensor = torch.tensor(labels, dtype=torch.long)


In [None]:
# Create Dataset instances for train and test sets
train_dataset = EmbeddingDataset(result, task_2_train, embedding_columns)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [37]:
# training loop
# set the 4 epochs as defined in the paper 
def train_model(train_loader, model, criterion, optimizer, epochs=4):
    model.train()
    for epoch in range(epochs):
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            
            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            
            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

In [38]:
# evaluation and output the f1 score 
def evaluate_model(test_loader, model):
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.numpy())
    # evaluate on the f1 score with micro averages
    return f1_score(all_labels, all_preds, average='micro')

getting preliminalry results 

In [None]:
# Train the model
train_model(train_loader, model, criterion, optimizer)

In [None]:
# Evaluate the model
f1 = evaluate_model(test_loader, model)
print(f'Micro F1 Score: {f1:.4f}')