In [1]:
import load_dataset
from sentence_transformers import SentenceTransformer
import pickle
import initial_ret
import cross_encoder
import pandas as pd
import evaluation
import torch
import timm
import time
from transformers import BertTokenizer, BertForSequenceClassification


In [2]:
collection, queries, qrels = load_dataset.load_subset()

In [3]:
doc_embeddings, query_embeddings = initial_ret.create_embeddings(collection, queries)

device:  cuda
Type of dataset:  <class 'pandas.core.frame.DataFrame'>


Batches: 100%|████████████████████████████████████████████████████████| 313/313 [00:22<00:00, 14.06it/s]
Batches: 100%|████████████████████████████████████████████████████████| 313/313 [00:04<00:00, 78.14it/s]


(10000, 768)
(10000, 768)


In [4]:
initial_retrieval_results = initial_ret.perform_initial_ret(doc_embeddings, query_embeddings, queries, collection)

# Save results from first retrival
df = pd.DataFrame([(k, v) for k, v in initial_retrieval_results.items()], columns=['query_id', 'doc_ids'])
df.to_csv('ranked_results.csv', index=False)

In [5]:
initial_retrieval_results

{152325: [(5024694, 0.7414789),
  (5289784, 0.7340858),
  (1566602, 0.72509325),
  (4186104, 0.7070747),
  (695587, 0.6838347)],
 579883: [(6927405, 0.60753155),
  (2205830, 0.6065566),
  (492197, 0.57629585),
  (2686269, 0.5691979),
  (190671, 0.5667361)],
 713391: [(3500683, 0.61799824),
  (781145, 0.5917505),
  (6852536, 0.58177704),
  (5977696, 0.5802349),
  (190671, 0.5698788)],
 139145: [(6927405, 0.7737646),
  (6516883, 0.6584729),
  (6130491, 0.64392644),
  (7648850, 0.59592474),
  (4060669, 0.5928971)],
 527096: [(5059361, 0.63224244),
  (387250, 0.61890256),
  (190671, 0.6003678),
  (3153983, 0.5977372),
  (5040444, 0.59760845)],
 785348: [(165198, 0.6224726),
  (2574616, 0.6097312),
  (4047753, 0.5842938),
  (2716834, 0.58294946),
  (436343, 0.58221173)],
 629435: [(6643514, 0.49005353),
  (5349743, 0.48860243),
  (5435914, 0.48469803),
  (665098, 0.45302832),
  (1968078, 0.44662422)],
 696229: [(5234894, 0.65438795),
  (2559833, 0.6388512),
  (781145, 0.61338884),
  (769642

In [13]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

def create_model():
    # Initialize the cross-encoder model
    cross_encoder_model_name = 'nboost/pt-bert-base-uncased-msmarco'
    cross_encoder = AutoModelForSequenceClassification.from_pretrained(cross_encoder_model_name)
    tokenizer = AutoTokenizer.from_pretrained(cross_encoder_model_name)

    # Check if CUDA is available and move the model to CUDA if it is
    if torch.cuda.is_available():
        cross_encoder = cross_encoder.to('cuda')
    else:
        print("CUDA not available.")

    return cross_encoder, tokenizer

In [14]:

cross_encoder, tokenizer = create_model()

In [16]:
re_ranked_results = {}
from tqdm import tqdm
import time

# Check if CUDA is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cross_encoder.to(device)

for qid, doc_data in tqdm(initial_retrieval_results.items(), desc="Processing"):
    query = queries[queries['qid'] == qid]['query'].iloc[0]
    scores = []

    for doc_id, _ in doc_data:  # Unpack the tuple to get the doc_id
        # Retrieve the document text using the doc_id
        if doc_id in collection['docid'].values:
            doc_text = collection[collection['docid'] == doc_id]['text'].iloc[0]

            # Prepare the input for the cross-encoder
            inputs = tokenizer.encode_plus(query, doc_text, return_tensors='pt', truncation=True, max_length=512)

            # Move inputs to the same device as the model
            inputs = {k: v.to(device) for k, v in inputs.items()}

            # Get the relevance score
            outputs = cross_encoder(**inputs)
            score = outputs.logits[0][1].item()  # Assuming binary classification (relevant, not relevant)

            # Store the score with the document ID
            scores.append((doc_id, score))
        else:
            print(f"Document ID {doc_id} not found in collection.")

    # Sort the documents for the query based on the scores
    scores.sort(key=lambda x: x[1], reverse=True)
    re_ranked_results[qid] = [doc_id for doc_id, _ in scores]


Processing: 100%|█████████████████████████████████████████████████| 10000/10000 [06:12<00:00, 26.84it/s]


In [17]:
re_ranked_results

{152325: [5024694, 695587, 4186104, 1566602, 5289784],
 579883: [190671, 2205830, 492197, 6927405, 2686269],
 713391: [6852536, 3500683, 781145, 190671, 5977696],
 139145: [6927405, 6516883, 4060669, 7648850, 6130491],
 527096: [387250, 3153983, 5059361, 190671, 5040444],
 785348: [436343, 2574616, 165198, 4047753, 2716834],
 629435: [5435914, 665098, 6643514, 1968078, 5349743],
 696229: [5234894, 2559833, 7529045, 781145, 769642],
 577747: [8795843, 781145, 5059361, 2205830, 190671],
 1025712: [6752581, 7570091, 338789, 454090, 5081554],
 704132: [409551, 2574616, 781145, 2686269, 6516883],
 30347: [6315475, 6814967, 6927405, 1509148, 7020298],
 50040: [1849292, 6612708, 4235562, 1065152, 6340328],
 788782: [5977696, 7529045, 5059361, 2559833, 781145],
 983227: [1263035, 138544, 2541391, 5232548, 8795843],
 693066: [2574616, 2686269, 6927405, 6790327, 6516883],
 1041343: [3354807, 5964484, 6008311, 5794669, 216942],
 324168: [4847364, 3919218, 2625521, 1889656, 5899628],
 708149: [268

In [18]:
evaluation.eval(qrels, re_ranked_results)

Average Precision: 0.06176470588235288, Average Recall: 0.3088235294117647, Average F1 Score: 0.1029411764705882


In [21]:
df = pd.DataFrame([(k, v) for k, v in re_ranked_results.items()], columns=['query_id', 'doc_ids'])
df.to_csv('re_ranked_results.csv', index=False)

In [22]:
doc_id_to_find = 5024694
document = collection[collection['docid'] == doc_id_to_find]

In [23]:
document

Unnamed: 0,docid,text
5024694,5024694,1 kaposi sarcoma is a disease in which maligna...


In [24]:
import importlib
importlib.reload(load_dataset) #refresh

<module 'load_dataset' from '/home/skenderadm/Desktop/monoduobert/AIR-WS-23-24-main/load_dataset.py'>

In [25]:
from transformers import AutoModelForSequenceClassification
# Model that can be re-used in experiments
from transformers import AutoModel, AutoTokenizer

model_name = "veneres/monobert-msmarco"  # Monobert MS MARCO
tokenizer_monobert = AutoTokenizer.from_pretrained(model_name)
model_monobert = AutoModelForSequenceClassification.from_pretrained(model_name)
model_monobert.to('cuda' if torch.cuda.is_available() else 'cpu') # check for CUDA


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [26]:
reranked_results = {}
# Start timing
start_time = time.time()

from tqdm import tqdm
import time 

for query_id, doc_indices in  tqdm(initial_retrieval_results.items(), desc="Processing"):

    query_text = queries.loc[queries['qid'] == query_id, 'query'].iloc[0]
    doc_texts = [collection.loc[collection['docid'] == doc_id[0], 'text'].iloc[0] for doc_id in doc_indices]

    # Check query anc calculate score
    scores = []
    for doc_text in doc_texts:
        inputs = tokenizer_monobert.encode_plus(query_text, doc_text, return_tensors="pt", truncation=True, max_length=512)
        inputs = {k: v.to(model_monobert.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model_monobert(**inputs)
            score = outputs.logits[0][1].item()  # TODO We find score here?
            scores.append(score)

    # Sort the documents by their new scores
    sorted_doc_indices = [x for _, x in sorted(zip(scores, doc_indices), reverse=True)]


    # Store the reranked results
    reranked_results[query_id] = sorted_doc_indices
    
    
# End timing
end_time = time.time()

# Calculate the total time taken
total_time = end_time - start_time
print("Total time needed : ", total_time)

Processing: 100%|█████████████████████████████████████████████████| 10000/10000 [05:06<00:00, 32.65it/s]

Total time:  306.28992438316345





In [27]:
from transformers import T5ForConditionalGeneration, T5Tokenizer


model_name_t5 = "castorini/duot5-base-msmarco" # Get T5 fine-tuned for MS MARCO

# Load the tokenizer and the model
tokenizer_t5 = T5Tokenizer.from_pretrained(model_name_t5)
model_t5 = T5ForConditionalGeneration.from_pretrained(model_name_t5)


model_t5.to('cuda' if torch.cuda.is_available() else 'cpu') # Check if there is CUDA

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [28]:
# Start timing for T5 re-ranking
start_time_t5 = time.time()

# Dictionary to store the final re-re-ranked results
final_reranked_results = {}

for query_id, doc_indices in tqdm(reranked_results.items(), desc="T5 Re-Ranking"):
    
    query_text = queries.loc[queries['qid'] == query_id, 'query'].iloc[0]

    # Scoring each document with respect to the query using T5
    t5_scores = []
    for doc_id_tuple in doc_indices:
        # Assuming the actual doc_id is the first element of the tuple
        doc_id = doc_id_tuple[0]  
        doc_text = collection.loc[collection['docid'] == doc_id, 'text'].iloc[0]
        input_text = f"Query: {query_text} Document: {doc_text}"
        input_ids = tokenizer_t5.encode(input_text, return_tensors="pt", truncation=True, max_length=512)

        # Move input to the same device as the model
        input_ids = input_ids.to(model_t5.device)

        with torch.no_grad():
            outputs = model_t5.generate(input_ids)
            # TODO
            t5_score = ... # Convert outputs to a score TODO
            t5_scores.append(t5_score)

    # Then sort the documents by their new T5 scores
    final_sorted_doc_indices = [x for _, x in sorted(zip(t5_scores, doc_indices), reverse=True)]

    # Store the final re-re-ranked results
    final_reranked_results[query_id] = final_sorted_doc_indices

# End timing
end_time_t5 = time.time()

# Calculate the total time taken for T5 re-ranking
total_time_t5 = end_time_t5 - start_time_t5
print("Total time for T5 re-ranking: ", total_time_t5)


T5 Re-Ranking: 100%|██████████████████████████████████████████████| 10000/10000 [21:35<00:00,  7.72it/s]

Total time for T5 re-ranking:  1295.3561301231384





In [54]:
importlib.reload(evaluation)

<module 'evaluation' from '/home/skenderadm/Desktop/monoduobert/AIR-WS-23-24-main/evaluation.py'>

In [69]:
transformed_data_first_rereanking = {query_id: [doc_id for doc_id, _ in doc_list] for query_id, doc_list in reranked_results.items()}


evaluation.calculate_mrr(transformed_data_first_rereanking, qrels)

Mean Reciprocal Rank (MRR): 0.30392156862745096


In [70]:
evaluation.eval(qrels, transformed_data_first_rereanking)

Average Precision: 0.29901960784313725, Average Recall: 0.29901960784313725, Average F1 Score: 0.29901960784313725


In [None]:
transformed_data_final = {query_id: [doc_id for doc_id, _ in doc_list] for query_id, doc_list in final_reranked_results.items()}
evaluation.calculate_mrr(transformed_data_final, qrels)
evaluation.eval(qrels, transformed_data_final)