In [15]:
import sys

In [16]:
import torch

In [17]:
from transformers import DistilBertModel, DistilBertTokenizer
from transformers import RobertaTokenizer, RobertaModel
from transformers import AutoModel, AutoTokenizer 

In [18]:
import os

In [19]:
import pandas as pd

In [20]:
import dask.dataframe as dd

In [21]:
import numpy as np

In [22]:
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import f1_score
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
from collections import Counter

# Load Data

In [23]:
examples_path = os.path.join('.', 'data', 'shopping_queries_dataset_examples.parquet')
products_path = os.path.join('.', 'data', 'shopping_queries_dataset_products.parquet')
sources_path = os.path.join('.', 'data', 'shopping_queries_dataset_sources.csv')

examples = dd.read_parquet(examples_path)
products = dd.read_parquet(products_path)
sources = dd.read_csv(sources_path)

In [24]:
examples_products = dd.merge(
    examples,
    products,
    how='left',
    left_on=['product_locale','product_id'],
    right_on=['product_locale', 'product_id']
)

examples_products = examples_products[examples_products['product_locale'] == 'us']

task_2 = examples_products[examples_products['large_version'] == 1]

# another thing that I changed 
# encoding the esci labels 
label_mapping = {'E': 0, 
                 'S': 1, 
                 'C': 2, 
                 'I': 3}

task_2['encoded_labels'] = task_2['esci_label'].map(label_mapping).astype(int)


task_2_train = task_2[task_2['split'] == 'train']
task_2_test = task_2[task_2['split'] == 'test']

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=('esci_label', 'float64'))



In [25]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [26]:
tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')
model = AutoModel.from_pretrained('distilroberta-base').to(device)

for param in model.parameters():
    param.requires_grad = False

def generate_embeddings(texts):
    batch_size = 128  # adjust this size
    embeddings = []

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        inputs = tokenizer(batch.tolist(), return_tensors='pt', padding=True, truncation=True, max_length=512)
        inputs = {key: value.to(device) for key, value in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(batch_embeddings)

    return np.vstack(embeddings)

def process_partition(partition):
    query_embeddings = generate_embeddings(partition['query'])
    product_title_embeddings = generate_embeddings(partition['product_title'])

    combined = torch.cat((torch.tensor(query_embeddings), torch.tensor(product_title_embeddings)), dim=1).numpy()
    
    print(f'Combined shape: {combined.shape}')  # Expecting (n, 1536)

    result = pd.DataFrame(combined, index=partition.index, columns=[f'embedding_{i}' for i in range(combined.shape[1])])

    return result

In [27]:
# creating a data frame and enerate column names
meta = pd.DataFrame(columns=[f'embedding_{i}' for i in range(2 * 768)], dtype='float64')

# Sampling the Data (Ignore when looking at full set) 

In [14]:
# computes the total number of rows in the df
total_rows = task_2_train.shape[0].compute()

# calculates the fraction of rows needed to sample 10000 
sample_fraction = 10000 / total_rows

# samples a fraction of the df 
task_2_train_sample = task_2_train.sample(frac=sample_fraction, random_state=42)

In [16]:
# replicating the same as above but with the test data 
# computes the total number of rows in the df
total_rows2 = task_2_test.shape[0].compute()

# calculates the fraction of rows needed to sample 
sample_fraction2 = 10000 / total_rows2

# samples a fraction of the df 
task_2_test_sample = task_2_test.sample(frac=sample_fraction2, random_state=42)

# Computing the Embeddings (Ignore after reading to CSV/NUMPY) 

In [21]:
# Computing the Embeddings for Train Data 
result = task_2_train.map_partitions(process_partition, meta=meta)
result = result.compute()

In [23]:
# Read it to a CSV 
result.to_csv('result_train_distilroberta.csv')

In [24]:
# Read it to a NUMPY 
result_array = result.to_numpy()
np.save('result_train_distilroberta.npy', result_array)

In [16]:
# Computing the Embeddings for the Test Data
result2 = task_2_test.map_partitions(process_partition, meta=meta)
result2 = result2.compute()

In [18]:
# Read it to a CSV 
result2.to_csv('result_test_distilroberta.csv')

In [19]:
# Read it to a NUMPY 
result2_array = result2.to_numpy()
np.save('result_test_distilroberta.npy', result2_array)

# Importing Embeddings and Checking Structure 

In [42]:
# Read in the pre-saved data 
# This will take a little bit 
result_train = pd.read_csv('result_train_distilroberta.csv', index_col=0)
result_test = pd.read_csv('result_test_distilroberta.csv', index_col=0)

In [51]:
print(result_train.head())

   embedding_0  embedding_1  embedding_2  embedding_3  embedding_4  \
0     0.012271     0.062899     -0.00294    -0.142622      0.07132   
1     0.012271     0.062899     -0.00294    -0.142622      0.07132   
2     0.012271     0.062899     -0.00294    -0.142622      0.07132   
3     0.012271     0.062899     -0.00294    -0.142622      0.07132   
4     0.012271     0.062899     -0.00294    -0.142622      0.07132   

   embedding_5  embedding_6  embedding_7  embedding_8  embedding_9  ...  \
0    -0.115269    -0.043993     0.063193     0.041939     -0.03229  ...   
1    -0.115269    -0.043993     0.063193     0.041939     -0.03229  ...   
2    -0.115269    -0.043993     0.063193     0.041939     -0.03229  ...   
3    -0.115269    -0.043993     0.063193     0.041939     -0.03229  ...   
4    -0.115269    -0.043993     0.063193     0.041939     -0.03229  ...   

   embedding_1526  embedding_1527  embedding_1528  embedding_1529  \
0        0.049869       -0.007802       -0.047120       -0.

In [52]:
print(result_test.head())

    embedding_0  embedding_1  embedding_2  embedding_3  embedding_4  \
32    -0.022568     0.055794     0.006754    -0.142604     0.093265   
33    -0.022568     0.055794     0.006754    -0.142604     0.093265   
34    -0.022568     0.055794     0.006754    -0.142604     0.093265   
35    -0.022568     0.055794     0.006754    -0.142604     0.093265   
36    -0.022568     0.055794     0.006754    -0.142604     0.093265   

    embedding_5  embedding_6  embedding_7  embedding_8  embedding_9  ...  \
32    -0.117611    -0.049071     0.056726      0.06399    -0.056176  ...   
33    -0.117611    -0.049071     0.056726      0.06399    -0.056176  ...   
34    -0.117611    -0.049071     0.056726      0.06399    -0.056176  ...   
35    -0.117611    -0.049071     0.056726      0.06399    -0.056176  ...   
36    -0.117611    -0.049071     0.056726      0.06399    -0.056176  ...   

    embedding_1526  embedding_1527  embedding_1528  embedding_1529  \
32        0.058187        0.039910       -0.03

In [None]:
# Read it in as a numpy array 
# result = np.load('result_train_distilroberta.npy')
# result2 = np.load('result_test_distilroberta.npy')

In [14]:
# Converting from DASK to Pandas 
task_2_train = task_2_train.compute()
task_2_test = task_2_test.compute()

In [55]:
# this should all be the same: pandas.core.frame.DataFrame
print(type(task_2_train))
print(type(task_2_test))
print(type(result_train))
print(type(result_test))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


# Creating the Multi-Layer Preceptron Model

In [79]:
# mlp with maxpooling 
class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)  
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)  
        self.dropout = nn.Dropout(0.1)
        self.fc2 = nn.Linear(hidden_size // 2, num_classes)

    def forward(self, x):
        # flatten the input 
        x = torch.flatten(x, 1) 
        x = self.fc1(x)
        x = self.relu(x)
        x = x.unsqueeze(1)
        x = self.pool(x)  
        # flatten the pooled output 
        x = x.view(x.size(0), -1) 
        # apply the 10% dropout 
        x = self.dropout(x)
        x = self.fc2(x)
        
        return x

In [80]:
# Inputting the Parameters

# Size of the concatenated embeddings(768 + 768)
input_size = 1536  
hidden_size = 128
# number of classes Exact, Substitute, Complement, Irrelevant (4)
num_classes = 4 

# initialize the model, loss, and optimizer
model = MLP(input_size, hidden_size, num_classes).to(device)
criterion = nn.CrossEntropyLoss()

# for the training hyperparameter configuration 
# Adam Optimizer with parameters: epsilon (1e-8), learning rate (5e-5) and weight decay (0.01)
optimizer = optim.Adam(model.parameters(), lr=5e-5, eps=1e-8, weight_decay=0.01)


# Data Loading 

## Labeling Indicies

In [30]:
task_2_train_indices = task_2_train.index.astype(int)
subset_labels = task_2_train['encoded_labels']
subset_labels = subset_labels.to_frame()

In [35]:
task_2_test_indices = task_2_test.index.astype(int)
subset_labels2 = task_2_test['encoded_labels']
subset_labels2 = subset_labels2.to_frame()

In [32]:
# not needed if not sampling 
result_train = result_train.sort_index()
result_test = result_test.sort_index()

In [36]:
# checking the training labels 
print(subset_labels)

         encoded_labels
0                     3
1                     0
2                     0
3                     0
4                     0
...                 ...
1818820               3
1818821               0
1818822               0
1818823               3
1818824               0

[1393063 rows x 1 columns]


In [37]:
# checking the test labels 
print(subset_labels2)

         encoded_labels
32                    3
33                    3
34                    0
35                    1
36                    1
...                 ...
1818788               0
1818789               3
1818790               3
1818791               3
1818792               0

[425762 rows x 1 columns]


In [58]:
class ESCIDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings.values
        # this should be (size, 1563)
        print("Shape of embeddings:", self.embeddings.shape)
        self.labels = labels   

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.embeddings[idx], self.labels[idx]

# create DataLoader
train_dataset = ESCIDataset(embeddings=result_train, labels=subset_labels['encoded_labels'].values)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True) # adjust the batch size as needed 

Shape of embeddings: (1393063, 1536)


In [59]:
# making the test loader 
test_dataset = ESCIDataset(embeddings=result_test, labels=subset_labels2['encoded_labels'].values)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False) 

Shape of embeddings: (425762, 1536)


In [52]:
# these should be the same length 
print("Length of embeddings:", len(train_dataset.embeddings))
print("Length of labels:", len(train_dataset.labels))

Length of embeddings: 1393063
Length of labels: 1393063


In [62]:
# these should both be 'numpy.ndarray' or theres a problem 
print("Type of embeddings:", type(train_dataset.embeddings))
print("Type of labels:", type(train_dataset.labels))

Type of embeddings: <class 'numpy.ndarray'>
Type of labels: <class 'numpy.ndarray'>


In [60]:
# look at the samples to double check everything is looking right 
for i in range(5):  
    embedding, label = train_dataset[i]
    print(f"sample {i} - embedding: {embedding}, label: {label}")

sample 0 - embedding: [ 0.01227102  0.06289895 -0.00293979 ... -0.03747227 -0.05104589
  0.02997449], label: 3
sample 1 - embedding: [ 0.01227102  0.06289895 -0.00293979 ... -0.00329658 -0.04561577
  0.02704949], label: 0
sample 2 - embedding: [ 0.01227102  0.06289895 -0.00293979 ...  0.00563772 -0.04765408
  0.0202062 ], label: 0
sample 3 - embedding: [ 0.01227102  0.06289895 -0.00293979 ... -0.05716165 -0.03097533
  0.01906974], label: 0
sample 4 - embedding: [ 0.01227102  0.06289895 -0.00293979 ... -0.00906342 -0.03858491
  0.04480164], label: 0


In [81]:
# training loop
def train_model(model, train_loader, criterion, optimizer, num_epochs=4):
    model.train()  # set model to training mode
    for epoch in range(num_epochs):
        epoch_loss = 0
        for batch_idx, (embeddings, labels) in enumerate(train_loader):
            embeddings, labels = embeddings.to(device), labels.to(device)

            optimizer.zero_grad()  
            outputs = model(embeddings.float()) 
            # converting the labels to long 
            labels = labels.long()
            # calculate the loss 
            loss = criterion(outputs, labels) 
            # backpropogation 
            loss.backward() 
            # updating the weights 
            optimizer.step()  

            # add up the loss 
            epoch_loss += loss.item()  

        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss / len(train_loader):.4f}")

train_model(model, train_loader, criterion, optimizer)


Epoch 1/4, Loss: 0.8615
Epoch 2/4, Loss: 0.8586
Epoch 3/4, Loss: 0.8577
Epoch 4/4, Loss: 0.8573


In [82]:
# evaluation and output the f1 score 
def evaluate_model(test_loader, model):
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for inputs, labels in test_loader:
            # needed to change this to floats 
            inputs = inputs.float().to(device)
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.numpy())
            
    # evaluate on the f1 score with micro averages
    return f1_score(all_labels, all_preds, average='micro')

In [83]:
# evaluating the model
f1 = evaluate_model(test_loader, model)
print(f'micro F1 Score: {f1:.4f}')

micro F1 Score: 0.6514


# Finding Mismatches

In [84]:
def evaluate_and_capture_mismatches(test_loader, model, task_2_test):
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.float().to(device)
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # convert task_2_test to pandas df if it's a dask df
    if hasattr(task_2_test, 'compute'):
        test_df = task_2_test[['query', 'product_title', 'encoded_labels']].compute()
    else:
        test_df = task_2_test[['query', 'product_title', 'encoded_labels']]

    test_df['predicted_label'] = all_preds
    test_df['true_label'] = all_labels
    
    mismatch_df = test_df[test_df['true_label'] != test_df['predicted_label']]
    
    return mismatch_df

mismatch_df = evaluate_and_capture_mismatches(test_loader, model, task_2_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['predicted_label'] = all_preds
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['true_label'] = all_labels


In [85]:
# count top 10 mismatches per query
mismatch_counts_per_query = mismatch_df['query'].value_counts().head(10) 
mismatch_counts_per_product = mismatch_df['product_title'].value_counts().head(10)

all_text = ' '.join(mismatch_df['query'].tolist() + mismatch_df['product_title'].tolist())
word_counts = Counter(all_text.split()).most_common(10)  # Top 10 common words

print("Top 10 queries with the most mismatches:\n", mismatch_counts_per_query)
print("\nTop 10 most common words in mismatched entries:\n", word_counts)

Top 10 queries with the most mismatches:
 query
fitbit charge 3                            65
apple earbuds                              60
firestick                                  56
airpods 2                                  53
dek pro                                    48
futon frames full size without mattress    48
kindle                                     46
keep grinding hat                          42
marvel against humanity game               42
shaggy dog board game                      42
Name: count, dtype: int64[pyarrow]

Top 10 most common words in mismatched entries:
 [('for', 66849), ('-', 45241), ('with', 39985), ('and', 38192), ('&', 19424), ('of', 15107), ('|', 11940), ('Black', 11808), ('without', 10697), ('to', 9763)]


# Using the alldistillroberta-v1

In [28]:
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-distilroberta-v1')
model = AutoModel.from_pretrained('sentence-transformers/all-distilroberta-v1').to(device)

for param in model.parameters():
    param.requires_grad = False

def generate_embeddings(texts):
    batch_size = 128  # adjust this size
    embeddings = []

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        inputs = tokenizer(batch.tolist(), return_tensors='pt', padding=True, truncation=True, max_length=512)
        inputs = {key: value.to(device) for key, value in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(batch_embeddings)

    return np.vstack(embeddings)

def process_partition(partition):
    query_embeddings = generate_embeddings(partition['query'])
    product_title_embeddings = generate_embeddings(partition['product_title'])

    combined = torch.cat((torch.tensor(query_embeddings), torch.tensor(product_title_embeddings)), dim=1).numpy()
    
    print(f'Combined shape: {combined.shape}')  # Expecting (n, 1536)

    result = pd.DataFrame(combined, index=partition.index, columns=[f'embedding_{i}' for i in range(combined.shape[1])])

    return result

In [29]:
# creating a data frame and enerate column names
meta = pd.DataFrame(columns=[f'embedding_{i}' for i in range(2 * 768)], dtype='float64')

# Ignore once embeddings are created 

In [42]:
# Computing the Embeddings for Train Data 
# result = task_2_train.map_partitions(process_partition, meta=meta)
# result = result.compute()

In [17]:
# Read it to a CSV 
result.to_csv('result_train_all-distilroberta-v1.csv')

In [18]:
# Read it to a NUMPY 
result_array = result.to_numpy()
np.save('result_train_all-distilroberta-v1.npy', result_array)

In [19]:
# Computing the Embeddings for the Test Data
result2 = task_2_test.map_partitions(process_partition, meta=meta)
result2 = result2.compute()

Combined shape: (425762, 1536)


In [20]:
# Read it to a CSV 
result2.to_csv('result_test_all-distilroberta-v1.csv')

In [21]:
# Read it to a NUMPY 
result2_array = result2.to_numpy()
np.save('result_test_all-distilroberta-v1.npy', result2_array)

# Data loading the all-distilroberta Data

In [41]:
# read in the train and test 
result_train = pd.read_csv('result_train_all-distilroberta-v1.csv', index_col=0)
result_test = pd.read_csv('result_test_all-distilroberta-v1.csv', index_col=0)

In [56]:
# Converting from DASK to Pandas 
# task_2_train = task_2_train.compute()
# task_2_test = task_2_test.compute()

In [34]:
# this should all be the same: pandas.core.frame.DataFrame
print(type(task_2_train))
print(type(task_2_test))
print(type(result_train))
print(type(result_test))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


# Establishing the MLP

In [44]:
class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)  
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)  
        self.dropout = nn.Dropout(0.1)
        self.fc2 = nn.Linear(hidden_size // 2, num_classes)

    def forward(self, x):
        # flatten the input 
        x = torch.flatten(x, 1) 
        x = self.fc1(x)
        x = self.relu(x)
        x = x.unsqueeze(1)
        x = self.pool(x)  
        # flatten the pooled output 
        x = x.view(x.size(0), -1) 
        # apply the 10% dropout 
        x = self.dropout(x)
        x = self.fc2(x)
        
        return x

In [45]:
# Inputting the Parameters

# Size of the concatenated embeddings(768 + 768)
input_size = 1536  
hidden_size = 128
# number of classes Exact, Substitute, Complement, Irrelevant (4)
num_classes = 4 

# initialize the model, loss, and optimizer
model = MLP(input_size, hidden_size, num_classes).to(device)
criterion = nn.CrossEntropyLoss()

# for the training hyperparameter configuration 
# Adam Optimizer with parameters: epsilon (1e-8), learning rate (5e-5) and weight decay (0.01)
optimizer = optim.Adam(model.parameters(), lr=5e-5, eps=1e-8, weight_decay=0.01)

# Labeling Indicies and Data loading

In [46]:
task_2_train_indices = task_2_train.index.astype(int)
subset_labels = task_2_train['encoded_labels']
subset_labels = subset_labels.to_frame()

In [47]:
task_2_test_indices = task_2_test.index.astype(int)
subset_labels2 = task_2_test['encoded_labels']
subset_labels2 = subset_labels2.to_frame()

In [48]:
# not needed if not sampling 
result_train = result_train.sort_index()
result_test = result_test.sort_index()

In [49]:
class ESCIDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings.values
        # this should be (size, 1563)
        print("Shape of embeddings:", self.embeddings.shape)
        self.labels = labels   

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.embeddings[idx], self.labels[idx]

# create DataLoader
train_dataset = ESCIDataset(embeddings=result_train, labels=subset_labels['encoded_labels'].values)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True) # adjust the batch size as needed 

Shape of embeddings: (1393063, 1536)


In [50]:
# making the test loader 
test_dataset = ESCIDataset(embeddings=result_test, labels=subset_labels2['encoded_labels'].values)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False) 

Shape of embeddings: (425762, 1536)


# Training and Testing 

In [51]:
# training loop
def train_model(model, train_loader, criterion, optimizer, num_epochs=4):
    model.train()  # set model to training mode
    for epoch in range(num_epochs):
        epoch_loss = 0
        for batch_idx, (embeddings, labels) in enumerate(train_loader):
            embeddings, labels = embeddings.to(device), labels.to(device)

            optimizer.zero_grad()  
            outputs = model(embeddings.float()) 
            # converting the labels to long 
            labels = labels.long()
            # calculate the loss 
            loss = criterion(outputs, labels) 
            # backpropogation 
            loss.backward() 
            # updating the weights 
            optimizer.step()  

            # add up the loss 
            epoch_loss += loss.item()  

        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss / len(train_loader):.4f}")

train_model(model, train_loader, criterion, optimizer)

Epoch 1/4, Loss: 0.8032
Epoch 2/4, Loss: 0.7879
Epoch 3/4, Loss: 0.7838
Epoch 4/4, Loss: 0.7817


In [52]:
# evaluation and output the f1 score 
def evaluate_model(test_loader, model):
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for inputs, labels in test_loader:
            # needed to change this to floats 
            inputs = inputs.float().to(device)
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.numpy())
            
    # evaluate on the f1 score with micro averages
    return f1_score(all_labels, all_preds, average='micro')

In [53]:
# evaluating the model
f1 = evaluate_model(test_loader, model)
print(f'micro F1 Score: {f1:.4f}')

micro F1 Score: 0.6577


# Finding Mismatches

In [54]:
def evaluate_and_capture_mismatches(test_loader, model, task_2_test):
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.float().to(device)
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # convert task_2_test to pandas df if it's a dask df
    if hasattr(task_2_test, 'compute'):
        test_df = task_2_test[['query', 'product_title', 'encoded_labels']].compute()
    else:
        test_df = task_2_test[['query', 'product_title', 'encoded_labels']]

    test_df['predicted_label'] = all_preds
    test_df['true_label'] = all_labels
    
    mismatch_df = test_df[test_df['true_label'] != test_df['predicted_label']]
    
    return mismatch_df

mismatch_df = evaluate_and_capture_mismatches(test_loader, model, task_2_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['predicted_label'] = all_preds
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['true_label'] = all_labels


In [55]:
# count top 10 mismatches per query
mismatch_counts_per_query = mismatch_df['query'].value_counts().head(10) 
mismatch_counts_per_product = mismatch_df['product_title'].value_counts().head(10)

all_text = ' '.join(mismatch_df['query'].tolist() + mismatch_df['product_title'].tolist())
word_counts = Counter(all_text.split()).most_common(10)  # Top 10 common words

print("Top 10 queries with the most mismatches:\n", mismatch_counts_per_query)
print("\nTop 10 most common words in mismatched entries:\n", word_counts)

Top 10 queries with the most mismatches:
 query
fitbit charge 3                            65
apple earbuds                              60
firestick                                  56
dek pro                                    48
futon frames full size without mattress    48
airpods 2                                  46
kindle                                     46
keep grinding hat                          42
shaggy dog board game                      42
apple earphones                            40
Name: count, dtype: int64[pyarrow]

Top 10 most common words in mismatched entries:
 [('for', 65863), ('-', 44285), ('with', 39118), ('and', 37200), ('&', 19004), ('of', 14631), ('Black', 11500), ('|', 11455), ('without', 10240), ('2', 9326)]
