## Part 2 - Neural Re-Ranking **40 points**

Implement 2 neural architectures based on the kernel-pooling paradigm to perform re-ranking in ``src/re_ranking.py`` (KNRM, TK)

- Implement: the 2 (KNRM, TK) model classes **20 points**
   - Show that you understood what happens by adding comments to the difficult parts of the model (what the tensor dimensions represent, what gets summed up, etc.)
- Implement: training process & result evaluation **10 points**
    - Including early stopping based on the validation set
	   - Use the **msmarco_tuples.validation.tsv** input to feed the neural models and **msmarco_qrels.txt** qrels to evaluate the output
- Evaluate: Compute a test set evaluation at the end  **10 points**
	- MS-MARCO sparse labels
	  - Use the **msmarco_tuples.test.tsv** input to feed the neural models and **msmarco_qrels.txt** qrels to evaluate the output
	- FiRA-2022 fine-grained labels on out-of-domain data
	  - Use the labels you created in part 1
	     - Use the **fira-2022.tuples.tsv** input to feed the neural models and your qrels from part 1 to evaluate the output
	  - Compare these results with our baseline label creation method
	     - Use the **fira-2022.tuples.tsv** input to feed the neural models and **fira-2022.baseline-qrels.tsv** qrels to evaluate the output
	  - Explore & describe the differences in metrics between the baseline and your label creation 

## Provided data:
* AllenNLP vocabulary (collection-specific, in two sizes; use the `_10` variant, i.e. a minimum of 10 occurrences in the collection, if you run into memory problems with the `_5` variant)
* train triples
* evaluation tuples (validation & test) with 2.000 queries each and the top 40 BM25 results per query, relevance judgments (qrels, one file covering both validation & test)

In [2]:
# Root directory that the data and model paths in this notebook are joined onto.
# Replace it with your own location, for example "/home/studio-lab-user/src/data_part2".
base_path = "../"

In [3]:
from typing import Dict, Iterator, List

import torch
import torch.nn as nn
from torch.autograd import Variable

from allennlp.modules.text_field_embedders import TextFieldEmbedder

import sys
sys.path.append('..')

from src.data_loading import IrTripleDatasetReader, IrLabeledTupleDatasetReader
import numpy as np


from allennlp.data.vocabulary import Vocabulary

from allennlp.modules.token_embedders import Embedding
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder

from src.data_loading import *
# from src.model_knrm import *
from src.model_tk import *
from allennlp.data.dataloader import PyTorchDataLoader
import pandas as pd

import math

from typing import Dict, Iterator, List

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import GELU


from allennlp.modules.text_field_embedders import TextFieldEmbedder

import os

from src.model_tk import TK

from src.BatchWordEmbedder import *

In [4]:
# Build every data path used below relative to base_path

import os

# Central lookup of the files used for MS MARCO training and evaluation.
# Every path is resolved against base_path; "model" is a plain label.
# NOTE(review): the validation file carries an "msmarco_" prefix while the test
# file does not -- confirm the test file name on disk.
config = dict(
    vocab_directory=os.path.join(base_path, "data/Part-2/allen_vocab_lower_10"),
    pre_trained_embedding=os.path.join(base_path, "data/Part-2/glove.42B.300d.txt"),
    model="tk",
    train_data=os.path.join(base_path, "data/Part-2/triples.train.tsv"),
    validation_data=os.path.join(base_path, "data/Part-2/msmarco_tuples.validation.tsv"),
    test_data=os.path.join(base_path, "data/Part-2/tuples.test.tsv"),
    qrels=os.path.join(base_path, "data/Part-2/msmarco_qrels.txt"),
)

In [5]:
# Number of training triples per batch; also handed to the TK model config.
BATCH_SIZE = 64

# Vocabulary and the 300-dimensional token embedding initialised from the
# pre-trained file named in config; index 0 is declared as the padding index
# and the embedding weights stay trainable.
vocab = Vocabulary.from_files(config["vocab_directory"])
tokens_embedder = Embedding(
    vocab=vocab,
    pretrained_file=config["pre_trained_embedding"],
    embedding_dim=300,
    trainable=True,
    padding_index=0,
)
word_embedder = BasicTextFieldEmbedder({"tokens": tokens_embedder})

# Lazily read the training triples (queries and documents capped at 128
# tokens each), index them with the vocabulary and batch them.
# After this statement _triple_reader holds the object returned by .read().
_triple_reader = IrTripleDatasetReader(
    lazy=True, max_doc_length=128, max_query_length=128
).read(config["train_data"])
_triple_reader.index_with(vocab)
loader = PyTorchDataLoader(_triple_reader, batch_size=BATCH_SIZE)

0it [00:00, ?it/s]

**Train the model**

In [6]:
# Prefer the GPU when PyTorch can see one; otherwise everything runs on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
batch_embedder = BatchWordEmbedder(word_embedder, device)

# Keyword arguments forwarded verbatim to the TK constructor.
tk_config = {
            "n_kernels": 11,
            "max_len": 128,
            "n_heads": 4,
            "num_layers": 2,
            "hidden_dim": 128,
            "word_embedding_dim": 300,
            "dropout": 0.1,
            "debug" : False,
            "batch_size" : BATCH_SIZE,
        }

model_tk = TK(**tk_config).to(device)

# Upper bound on the number of training triples handed to the training loop.
max_iter = 500000
# Half the number of batches contained in max_iter triples.
# NOTE(review): this halves the learning rate once mid-run only if the training
# loop calls scheduler.step() once per batch -- confirm in tk_training_loop.
step_size = max_iter//BATCH_SIZE//2

training_parameters = {
    "optimizer_lr" : 1 * 1e-3,
    "optimizer_weight_decay": 1e-5, # kept for reference only: NOT passed to Adam below
    "scheduler_step_size": step_size,
    "scheduler_gamma": 0.5, # each StepLR reduction halves the lr (10 reductions -> 0.5**10 ~= 0.00098 of the start value)
    "training_max_iter": max_iter # or None to train on the entire training set
}

# Adam without weight decay, combined with a step-wise learning-rate decay.
optimizer = torch.optim.Adam(model_tk.parameters(), lr = training_parameters["optimizer_lr"])
scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    gamma = training_parameters["scheduler_gamma"],
    step_size = training_parameters["scheduler_step_size"]
)

# Pairwise margin loss averaged over the batch. 'mean' replaces the deprecated
# alias 'elementwise_mean', which PyTorch maps to the same reduction but warns about.
loss_criterion = torch.nn.MarginRankingLoss(margin=1, reduction='mean').to(device)
print(device)

cpu


In [7]:
# Run a single epoch of training, capped at training_max_iter.
# The call returns two values, unpacked here as the trained model and the
# accumulated loss values.
model_tk_trained, loss_accumulator = tk_training_loop(
    model_tk,
    loader,
    optimizer,
    scheduler,
    loss_criterion,
    batch_embedder,
    device,
    training_parameters['training_max_iter'],
    epochs=1,
)

reading instances: 0it [00:00, ?it/s]



Epoch: 0, Batches: 0, Total triples: 0/500000, Average Loss: None, Current loss: [0.001]


KeyboardInterrupt: 

In [21]:
# Save the trained weights under base_path, i.e. at the same location the
# "Load the model" cell reads them from, and make sure the folder exists first.
model_save_path = os.path.join(base_path, "models/model_tk_trained_500k.pt")
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
torch.save(model_tk_trained.state_dict(), model_save_path)

**Load the model**

In [12]:
# Run on the GPU when PyTorch reports one, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Location of the weights written by the training section.
model_path = os.path.join(base_path, "models/model_tk_trained_500k.pt")

# Same constructor arguments as used for training, so that the stored
# state_dict matches the freshly built model. BATCH_SIZE must already be defined.
tk_config = dict(
    n_kernels=11,
    max_len=128,
    n_heads=4,
    num_layers=2,
    hidden_dim=128,
    word_embedding_dim=300,
    dropout=0.1,
    debug=False,
    batch_size=BATCH_SIZE,
)

# Build an untrained TK model on the chosen device ...
model_tk = TK(**tk_config).to(device)

# ... and overwrite its parameters with the stored ones. The load result is
# the cell's last expression so the key-matching report is displayed.
trained_weights = torch.load(model_path, map_location=device)
model_tk.load_state_dict(trained_weights)

<All keys matched successfully>

**MS Marco Sparse**

In [24]:
# Read the qrels file: one judgement per line, four whitespace-separated
# columns, of which the first (query id) and third (document id) are kept.
list_qrels = []
with open(config['qrels']) as f:
    # Iterate the file lazily instead of loading every line into memory first.
    for line in f:
        if not line.strip():
            # A blank (e.g. trailing) line would break the 4-way unpacking below.
            continue
        # Explicit throw-away names: binding `_` would shadow IPython's
        # last-output variable in the notebook namespace.
        query_id, _col2, doc_id, _col4 = line.split()
        list_qrels.append([query_id, doc_id])

# Both id columns are converted to integers.
df_qrels = pd.DataFrame(list_qrels, columns=['query_id', 'doc_id']).astype(
    {'query_id': int, 'doc_id': int}
)

In [25]:
# Embedder used for scoring: same word embedder as for training, but
# constructed with train=False.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
batch_embedder_eval = BatchWordEmbedder(word_embedder, device, train=False)

In [26]:
# Lazily read the labelled (query, document) tuples to re-rank, with queries
# and documents capped at 128 tokens each, and index them with the vocabulary.
# NOTE(review): this reads config["validation_data"] although the loader is
# named loader_test -- confirm which split this section is meant to score.
_triple_reader_eval = IrLabeledTupleDatasetReader(lazy=True, max_doc_length=128, max_query_length=128)
_triple_reader_eval = _triple_reader_eval.read(config["validation_data"])
_triple_reader_eval.index_with(vocab)
# Use the notebook-wide BATCH_SIZE (the value also given to the TK model
# config) instead of repeating the literal 64.
loader_test = PyTorchDataLoader(_triple_reader_eval, batch_size=BATCH_SIZE)

In [27]:
def evaluate_model(model, df_qrels, loader, batch_embedder, device):
    """Score every (query, document) pair from ``loader`` and rank them per query.

    Args:
        model: Callable scoring model; it is switched to eval mode and called
            with (query embedding, document embedding, query mask, document mask).
        df_qrels: Accepted for interface compatibility; not used here.
        loader: Iterable of batches; each batch provides 'query_id' and 'doc_id'.
        batch_embedder: Callable returning six values per batch, of which the
            1st, 2nd, 4th and 5th are forwarded to the model.
        device: Accepted for interface compatibility; not used here.

    Returns:
        DataFrame with columns query_id, doc_id, score and rank, sorted by
        query_id and then rank. Rank 1 is the highest score within a query;
        ties are broken by order of appearance.
    """
    model.eval()
    all_query_ids, all_doc_ids, all_scores = [], [], []

    # Pure inference: gradients are not needed.
    with torch.no_grad():
        for batch in loader:
            query_emb, doc_emb, _, query_mask, doc_mask, _ = batch_embedder(batch)
            batch_scores = model(query_emb, doc_emb, query_mask, doc_mask)
            all_query_ids += batch['query_id']
            all_doc_ids += batch['doc_id']
            # Flatten to one score per pair, moved to the CPU as numpy values.
            all_scores += list(batch_scores.cpu().numpy().flatten())

    ranked = pd.DataFrame({
        'query_id': all_query_ids,
        'doc_id': all_doc_ids,
        'score': all_scores,
    })

    # Position of each document inside its own query group, best score first.
    scores_per_query = ranked.groupby('query_id')['score']
    ranked['rank'] = scores_per_query.rank(ascending=False, method='first').astype(int)

    return ranked.sort_values(by=['query_id', 'rank'])

In [29]:
# Score and rank the tuples served by loader_test with the loaded TK model.
df_eval = evaluate_model(
    model=model_tk,
    df_qrels=df_qrels,
    loader=loader_test,
    batch_embedder=batch_embedder_eval,
    device=device,
)

reading instances: 0it [00:00, ?it/s]

In [30]:
# Define the specific paths relative to the base path
path_result = os.path.join(base_path, "data/results_part2/tk_msmarco_ranking_final.tsv")
path_baseline = config['qrels']

# Write the DataFrame to a TSV file
df_eval.to_csv(path_result, sep='\t', header=False, index=False)

In [31]:
from src.core_metrics import calculate_metrics_plain,load_ranking,load_qrels

# Load the ranking written above and the qrels, then compute the metrics.
# The metrics call is the last expression so its result is displayed.
ranking_run = load_ranking(path_result)
qrels_judgements = load_qrels(path_baseline)
calculate_metrics_plain(ranking_run, qrels_judgements)

{'MRR@10': 0.2317813492063492,
 'Recall@10': 0.4657083333333334,
 'QueriesWithNoRelevant@10': 1050,
 'QueriesWithRelevant@10': 950,
 'AverageRankGoldLabel@10': 3.5652631578947367,
 'MedianRankGoldLabel@10': 3.0,
 'MRR@20': 0.236783314466751,
 'Recall@20': 0.5342916666666666,
 'QueriesWithNoRelevant@20': 910,
 'QueriesWithRelevant@20': 1090,
 'AverageRankGoldLabel@20': 4.963302752293578,
 'MedianRankGoldLabel@20': 3.0,
 'MRR@1000': 0.23825244675891408,
 'Recall@1000': 0.574625,
 'QueriesWithNoRelevant@1000': 830,
 'QueriesWithRelevant@1000': 1170,
 'AverageRankGoldLabel@1000': 6.558119658119658,
 'MedianRankGoldLabel@1000': 4.0,
 'nDCG@3': 0.21571062590580198,
 'nDCG@5': 0.251545146323757,
 'nDCG@10': 0.2858396956732744,
 'nDCG@20': 0.3036647891248599,
 'nDCG@1000': 0.3121836066107091,
 'QueriesRanked': 2000,
 'MAP@1000': 0.23467543810810212}

**fira-2022.tuples.tsv input to feed the neural models and fira-2022.baseline-qrels.tsv for evaluation**

In [36]:
# FiRA evaluation setup. This rebinds the notebook-wide `config`: from here on
# config["validation_data"] and config["qrels"] point at the FiRA files, while
# the remaining entries repeat the MS MARCO values.
config = dict(
    vocab_directory=os.path.join(base_path, "data/Part-2/allen_vocab_lower_10"),
    pre_trained_embedding=os.path.join(base_path, "data/Part-2/glove.42B.300d.txt"),
    model="tk",
    train_data=os.path.join(base_path, "data/Part-2/triples.train.tsv"),
    validation_data=os.path.join(base_path, "data/Part-2/fira-22.tuples_mod.tsv"),
    test_data=os.path.join(base_path, "data/Part-2/tuples.test.tsv"),
    qrels=os.path.join(base_path, "data/Part-2/fira-22.baseline-qrels.tsv"),
)

In [37]:

# Collect the [query_id, doc_id] pair of every judgement in config['qrels'].
list_qrels = []

with open(config['qrels']) as qrels_file:
    for line in qrels_file:
        # split() without arguments splits on any run of whitespace
        # (tabs or spaces) and ignores leading/trailing whitespace.
        fields = line.split()
        if len(fields) < 4:
            # Report lines with fewer than four columns and move on.
            print(f"Warning: Skipping line with unexpected format: {line}")
            print(f"The were so many part: {len(fields)}")
            continue
        # Keep the first column (query id) and the third column (document id).
        list_qrels.append([fields[0], fields[2]])

# The ids are kept as strings here (no integer conversion).
df_qrels = pd.DataFrame(list_qrels, columns=['query_id', 'doc_id'])

In [38]:
# Embedder used for scoring the FiRA tuples: same word embedder as for
# training, but constructed with train=False.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
batch_embedder_eval = BatchWordEmbedder(word_embedder, device, train=False)

In [40]:
# Lazily read the tuples named by config['validation_data'], with queries and
# documents capped at 512 tokens each, and index them with the vocabulary.
# NOTE(review): the TK model is built with max_len=128 while this reader lets
# through up to 512 tokens -- confirm the model copes with the longer inputs.
_tuple_reader_eval = IrLabeledTupleDatasetReader(lazy=True, max_doc_length=512, max_query_length=512)
_tuple_reader_eval = _tuple_reader_eval.read(config['validation_data'])
_tuple_reader_eval.index_with(vocab)
# Use the notebook-wide BATCH_SIZE (the value also given to the TK model
# config) instead of repeating the literal 64.
loader_test = PyTorchDataLoader(_tuple_reader_eval, batch_size=BATCH_SIZE)

In [41]:
def evaluate_model(model, df_qrels, loader, batch_embedder, device):
    """Score every (query, document) pair from ``loader`` and rank them per query.

    Batches that raise during embedding, scoring or score extraction are
    reported and skipped as a whole, so a failing batch never leaves ids
    without matching scores.

    Args:
        model: Callable scoring model; it is switched to eval mode and called
            with (query embedding, document embedding, query mask, document mask).
        df_qrels: Accepted for interface compatibility; not used here.
        loader: Iterable of batches; each batch provides 'query_id' and 'doc_id'.
        batch_embedder: Callable returning six values per batch, of which the
            1st, 2nd, 4th and 5th are forwarded to the model.
        device: Accepted for interface compatibility; not used here.

    Returns:
        DataFrame with columns query_id, doc_id, score and rank, sorted by
        query_id and then rank. Rank 1 is the highest score within a query;
        ties are broken by order of appearance.
    """
    model.eval()
    query_ids = []
    doc_ids = []
    preds = []
    skipped_batches = 0
    with torch.no_grad():
        for idx, batch in enumerate(loader):
            try:
                query_emb, doc_pos_emb, _, query_pad_mask, document_pad_mask_pos, _ = batch_embedder(batch)
                pred = model(query_emb, doc_pos_emb, query_pad_mask, document_pad_mask_pos)
                # Materialise everything for this batch BEFORE touching the
                # result lists, so a late failure cannot desynchronise them.
                batch_query_ids = list(batch['query_id'])
                batch_doc_ids = list(batch['doc_id'])
                batch_scores = pred.cpu().numpy().flatten()
            except Exception as e:
                skipped_batches += 1
                print(f"Error processing batch {idx}: {e}")
                print("Skipping batch with query_id and doc_id:")
                print("query_id:", batch['query_id'])
                print("doc_id:", batch['doc_id'])
                continue
            # The batch succeeded end to end: commit ids and scores together.
            query_ids.extend(batch_query_ids)
            doc_ids.extend(batch_doc_ids)
            preds.extend(batch_scores)
    print("Finished!")
    if skipped_batches:
        # Skipped batches are missing from the ranking and affect the metrics.
        print(f"Warning: {skipped_batches} batch(es) were skipped; the ranking is incomplete.")
    df_eval = pd.DataFrame({
        'query_id': query_ids,
        'doc_id': doc_ids,
        'score': preds
    })

    # Assign ranks by score within each query_id group, best score first.
    df_eval['rank'] = df_eval.groupby('query_id')['score'].rank(ascending=False, method='first').astype(int)

    # Sort by query_id and rank.
    df_eval = df_eval.sort_values(by=['query_id', 'rank'])

    return df_eval




In [42]:
# Score and rank the FiRA tuples served by loader_test with the loaded TK model.
df_eval = evaluate_model(
    model=model_tk,
    df_qrels=df_qrels,
    loader=loader_test,
    batch_embedder=batch_embedder_eval,
    device=device,
)

reading instances: 0it [00:00, ?it/s]

Finished!


In [None]:
# Where the FiRA ranking is written, and which qrels file it is scored against.
path_result= os.path.join(base_path, "data/results_part2/tk_fira_baseline_final.tsv")
path_baseline=config['qrels']

# to_csv does not create missing folders, so make sure the results directory exists.
os.makedirs(os.path.dirname(path_result), exist_ok=True)

# Write the ranking as a headerless, index-free TSV.
df_eval.to_csv(path_result, sep='\t', header=False, index=False)

In [44]:

from src.core_metrics import calculate_metrics_plain,load_ranking,load_qrels

# Load the FiRA ranking written above and the baseline qrels, then compute
# the metrics. The metrics call is the last expression so its result is displayed.
ranking_run = load_ranking(path_result)
qrels_judgements = load_qrels(path_baseline)
calculate_metrics_plain(ranking_run, qrels_judgements)

{'MRR@10': 0.9584031198686371,
 'Recall@10': 0.9514386291048853,
 'QueriesWithNoRelevant@10': 115,
 'QueriesWithRelevant@10': 4060,
 'AverageRankGoldLabel@10': 1.0970443349753694,
 'MedianRankGoldLabel@10': 1.0,
 'MRR@20': 0.9584031198686371,
 'Recall@20': 1.0000527797325827,
 'QueriesWithNoRelevant@20': 115,
 'QueriesWithRelevant@20': 4060,
 'AverageRankGoldLabel@20': 1.0970443349753694,
 'MedianRankGoldLabel@20': 1.0,
 'MRR@1000': 0.9584031198686371,
 'Recall@1000': 1.0000527797325827,
 'QueriesWithNoRelevant@1000': 115,
 'QueriesWithRelevant@1000': 4060,
 'AverageRankGoldLabel@1000': 1.0970443349753694,
 'MedianRankGoldLabel@1000': 1.0,
 'nDCG@3': 0.8754531745450254,
 'nDCG@5': 0.8804138103150848,
 'nDCG@10': 0.9045552653589506,
 'nDCG@20': 0.9177484694976151,
 'nDCG@1000': 0.9177484694976151,
 'QueriesRanked': 4060,
 'MAP@1000': 0.9500647311370559}

**FiRA: evaluation against our own judgements (labels from Part 1)**

In [45]:
# Score the same FiRA ranking against the aggregated judgements file stored
# under data/Part-1 instead of the baseline qrels.
path_baseline_own = os.path.join(
    base_path, "data/Part-1/fira-22.judgements-anonymized-aggregated_v1.tsv"
)
ranking_run = load_ranking(path_result)
qrels_own = load_qrels(path_baseline_own)
calculate_metrics_plain(ranking_run, qrels_own)

{'MRR@10': 0.9506052026916134,
 'Recall@10': 0.9521341793498367,
 'QueriesWithNoRelevant@10': 113,
 'QueriesWithRelevant@10': 4062,
 'AverageRankGoldLabel@10': 1.1164451009354996,
 'MedianRankGoldLabel@10': 1.0,
 'MRR@20': 0.9506052026916134,
 'Recall@20': 1.000052753745516,
 'QueriesWithNoRelevant@20': 113,
 'QueriesWithRelevant@20': 4062,
 'AverageRankGoldLabel@20': 1.1164451009354996,
 'MedianRankGoldLabel@20': 1.0,
 'MRR@1000': 0.9506052026916134,
 'Recall@1000': 1.000052753745516,
 'QueriesWithNoRelevant@1000': 113,
 'QueriesWithRelevant@1000': 4062,
 'AverageRankGoldLabel@1000': 1.1164451009354996,
 'MedianRankGoldLabel@1000': 1.0,
 'nDCG@3': 0.8660275945797691,
 'nDCG@5': 0.873464277441999,
 'nDCG@10': 0.8987960356282988,
 'nDCG@20': 0.9118463607058193,
 'nDCG@1000': 0.9118463607058193,
 'QueriesRanked': 4062,
 'MAP@1000': 0.9398758982965247}