In [None]:
!pip install pygaggle==0.0.2
!pip install transformers==4.17.0
!pip install jsonlines

In [None]:
import os
import json
import pickle
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
from torch.utils.data import Dataset
import jsonlines
import argparse
import transformers
import pygaggle
from pygaggle.rerank.transformer import MonoT5
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    TrainerCallback,
)

In [None]:
#python file to fine tune the mono T5 Model
%%writefile fine_tune.py
import os
import json
import pickle
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
from torch.utils.data import Dataset
import jsonlines
import argparse
import transformers
import pygaggle
from pygaggle.rerank.transformer import MonoT5
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    TrainerCallback,
)


class MonoT5Dataset(Dataset):
  """Map-style dataset that renders stored samples as monoT5 prompts.

  Each stored sample is indexed positionally: element 0 is the query,
  element 1 the document text and element 2 the target label.
  """

  def __init__(self, data):
    # The container is kept by reference; it only needs len() and indexing.
    self.data = data

  def __len__(self):
    """Return how many samples the dataset holds."""
    return len(self.data)

  def __getitem__(self, idx):
    """Return the prompt text and the label of the sample at position `idx`."""
    record = self.data[idx]
    query, document, label = record[0], record[1], record[2]
    prompt = f'Query: {query} Document: {document} Relevant:'
    return dict(text=prompt, labels=label)

def _load_training_samples(triples_path, max_rows=6_400_001):
  """Read the labelled CSV and build (query, document, label) training tuples.

  The CSV must provide the columns `Title`, `contents` and `rel`. A row whose
  `rel` value is greater than 1 is labelled 'true'; every other row is
  labelled 'false'. At most `max_rows` rows are used, which reproduces the cap
  of the original row loop (it stopped once the row number exceeded 6.4e5 * 10).
  """
  frame = pd.read_csv(triples_path).head(max_rows)
  # Column-wise iteration avoids building one Series per row as iterrows() does.
  return [
      (title, contents, 'true' if rel > 1 else 'false')
      for title, contents, rel in zip(frame['Title'], frame['contents'], frame['rel'])
  ]


def main():
  """Fine-tune a monoT5 sequence-to-sequence reranker from the command line.

  Command-line arguments:
    --base_model                   model name or path to start from.
    --triples_path                 CSV file with `Title`, `contents`, `rel` columns (required).
    --output_model_path            directory for checkpoints and the final model (required).
    --save_every_n_steps           checkpoint every N steps; 0 saves once per epoch.
    --logging_steps                logging interval in steps.
    --per_device_train_batch_size  batch size per device.
    --gradient_accumulation_steps  number of accumulated batches per optimizer step.
    --learning_rate                learning rate.
    --epochs                       number of training epochs.

  Side effects: trains the model, then writes the model and the trainer state
  under `--output_model_path`.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument("--base_model", default='castorini/monot5-base-msmarco', type=str, required=False,
                      help="Base model to fine tune.")
  parser.add_argument("--triples_path", default=None, type=str, required=True,
                      help="Triples.tsv path")
  parser.add_argument("--output_model_path", default=None, type=str, required=True,
                      help="Path for trained model and checkpoints.")
  parser.add_argument("--save_every_n_steps", default=0, type=int, required=False,
                      help="Save every N steps. (recommended 10000)")
  parser.add_argument("--logging_steps", default=100, type=int, required=False,
                      help="Logging steps parameter.")
  parser.add_argument("--per_device_train_batch_size", default=8, type=int, required=False,
                      help="Per device batch size parameter.")
  parser.add_argument("--gradient_accumulation_steps", default=16, type=int, required=False,
                      help="Gradient accumulation parameter.")
  parser.add_argument("--learning_rate", default=3e-4, type=float, required=False,
                      help="Learning rate parameter.")
  parser.add_argument("--epochs", default=10, type=int, required=False,
                      help="Number of epochs to train")
  args = parser.parse_args()

  # Fall back to the CPU so the collate function does not fail on machines without CUDA.
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  torch.manual_seed(123)

  model = AutoModelForSeq2SeqLM.from_pretrained(args.base_model)
  # NOTE(review): the tokenizer is always the castorini/monot5-base-msmarco one,
  # independent of --base_model — confirm this is intended for other base models.
  tokenizer = AutoTokenizer.from_pretrained('castorini/monot5-base-msmarco')

  train_samples = _load_training_samples(args.triples_path)

  def smart_batching_collate_text_only(batch):
    """Tokenize a list of dataset items into padded model inputs plus label ids."""
    prompts = [example['text'] for example in batch]
    targets = [example['labels'] for example in batch]
    tokenized = tokenizer(prompts, padding=True, truncation='longest_first', return_tensors='pt', max_length=512)
    # Labels are tokenized without padding, so every label in the batch must
    # encode to the same number of ids — presumably true for 'true'/'false'; TODO confirm.
    tokenized['labels'] = tokenizer(targets, return_tensors='pt')['input_ids']
    for name in tokenized:
      tokenized[name] = tokenized[name].to(device)

    return tokenized

  dataset_train = MonoT5Dataset(train_samples)

  # Checkpoint every N steps when requested, otherwise once per epoch.
  if args.save_every_n_steps:
    steps = args.save_every_n_steps
    strategy = 'steps'
  else:
    steps = 1
    strategy = 'epoch'

  train_args = Seq2SeqTrainingArguments(
      output_dir=args.output_model_path,
      do_train=True,
      save_strategy=strategy,
      save_steps=steps,
      logging_steps=args.logging_steps,
      per_device_train_batch_size=args.per_device_train_batch_size,
      gradient_accumulation_steps=args.gradient_accumulation_steps,
      learning_rate=args.learning_rate,
      weight_decay=5e-5,
      # Honour --epochs; this was previously hard-coded to 1, so the flag had no effect.
      num_train_epochs=args.epochs,
      warmup_steps=1000,
      adafactor=True,
      seed=1,
      disable_tqdm=False,
      load_best_model_at_end=False,
      predict_with_generate=True,
      # The collate function already places tensors on `device`, so pinning is disabled.
      dataloader_pin_memory=False,
  )

  trainer = Seq2SeqTrainer(
      model=model,
      args=train_args,
      train_dataset=dataset_train,
      tokenizer=tokenizer,
      data_collator=smart_batching_collate_text_only,
  )

  trainer.train()

  trainer.save_model(args.output_model_path)
  trainer.save_state()

# Run the fine-tuning entry point only when this file is executed as a script.
if __name__ == "__main__":
  main()

Overwriting fine_tune.py


In [None]:
#running the fine tuning
!python3 fine_tune.py --triples_path "/content/drive/MyDrive/Touche/2020/merged_new_rel_2020.csv" --output_model_path "/content/drive/MyDrive/Touche/fine_tune"

Downloading: 100% 1.80k/1.80k [00:00<00:00, 1.70MB/s]
Downloading: 100% 850M/850M [00:23<00:00, 37.5MB/s]
Downloading: 100% 1.84k/1.84k [00:00<00:00, 2.91MB/s]
Downloading: 100% 773k/773k [00:00<00:00, 42.8MB/s]
Downloading: 100% 1.74k/1.74k [00:00<00:00, 2.89MB/s]
***** Running training *****
  Num examples = 1633
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 16
  Total optimization steps = 12
100% 12/12 [05:15<00:00, 29.78s/it]Saving model checkpoint to /content/drive/MyDrive/Touche/fine_tune/checkpoint-12
Configuration saved in /content/drive/MyDrive/Touche/fine_tune/checkpoint-12/config.json
Model weights saved in /content/drive/MyDrive/Touche/fine_tune/checkpoint-12/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/Touche/fine_tune/checkpoint-12/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/Touche/fine_tune/checkpoi

In [None]:
# Load the fine-tuned model from the output directory used by the fine-tuning
# run and wrap it in a pygaggle MonoT5 reranker.
from transformers import T5ForConditionalGeneration
from pygaggle.rerank.base import Query, Text
from pygaggle.rerank.transformer import MonoT5, DuoT5
model = T5ForConditionalGeneration.from_pretrained('/content/drive/MyDrive/Touche/fine_tune')
reranker = MonoT5(model=model)

In [None]:
# Load the fine-tuned model through AutoModelForSeq2SeqLM and build the reranker.
# NOTE(review): this rebinds `model` and `reranker` from the same directory as the
# preceding cell — presumably only one of the two cells is needed; confirm.
from pygaggle.rerank.base import Query, Text
from pygaggle.rerank.transformer import MonoT5, DuoT5
model = AutoModelForSeq2SeqLM.from_pretrained("/content/drive/MyDrive/Touche/fine_tune")
reranker =  MonoT5(model)

In [None]:
# Load the candidate documents to rerank (presumably first-stage BM25 results,
# judging by the file name) and preview the first rows.
relevant_bm25=pd.read_csv("/content/drive/MyDrive/Touche/2020/relevant_bm25_2020.csv")
relevant_bm25.head()

Unnamed: 0.1,Unnamed: 0,title_id,title,doc_id,score,content
0,0,1,\nWhat is the difference between sex and love?\n,clueweb12-0818wb-26-13074,7.1866,toofly nyc » hip hop culture toofly nyc clien...
1,1,1,\nWhat is the difference between sex and love?\n,clueweb12-0010wb-33-27298,7.1624,relational quantum mechanics stanford encyclo...
2,2,1,\nWhat is the difference between sex and love?\n,clueweb12-0103wb-41-14265,6.9118,lecture series science software engineering...
3,3,1,\nWhat is the difference between sex and love?\n,clueweb12-1013wb-51-17839,6.4538,ethical buyers guide toy cars trains planes s...
4,4,1,\nWhat is the difference between sex and love?\n,clueweb12-1400tw-39-23968,6.2936,video video cbr tv movie trailersclips tv tra...


In [None]:
# Rerank the candidate documents of every query and keep up to the 1000
# best-ranked results per query as {"doc_id", "Score", "title"} records.
new=[]
for title in tqdm(relevant_bm25["title"].unique()):
  passages = relevant_bm25.loc[relevant_bm25["title"]==title]
  texts = [Text(p[1].content, {'docid': p[1].doc_id}, 0) for p in passages.iterrows()]
  query = Query(title)
  reranked = reranker.rerank(query, texts)
  # Bound the loop by the number of results: indexing a fixed 0..999 range
  # raises IndexError for any query with fewer than 1000 candidates.
  for i in range(min(1000, len(reranked))):
    new.append({"doc_id":reranked[i].metadata["docid"], "Score":reranked[i].score, "title":title})

In [None]:
#retreiving 10 documents it takes 20 seconds so retrieving 1000 documents per query may take 30 mins
%%time
new=[]
title=relevant_bm25["title"][0]
passages= relevant_bm25.loc[relevant_bm25["title"]==title]
texts = [ Text(p[1].content, {'docid': p[1].doc_id}, 0) for p in passages.iterrows()]
query = Query(title)
texts=texts[0:10]
reranked = reranker.rerank(query, texts)
for i in range(0, 10):
  new.append({"doc_id":reranked[i].metadata["docid"], "Score":reranked[i].score, "title":title})

CPU times: user 20.1 s, sys: 195 ms, total: 20.3 s
Wall time: 22.4 s


# The input dataset for fine-tuning

In [None]:
# Load the 2020 relevance file, cast `qid` to string and drop the
# "Unnamed: 0" column (presumably a saved index from an earlier to_csv), then preview it.
new_rel_2020=pd.read_csv("/content/drive/MyDrive/Touche/2020/new_rel_2020.csv")
new_rel_2020["qid"] = new_rel_2020["qid"].astype(str)
new_rel_2020=new_rel_2020.drop("Unnamed: 0", axis=1)
new_rel_2020.head()

In [None]:
# Load the 2020 topics table; its `Number` column is used later as the join key for `qid`.
topics_2020=pd.read_csv("/content/drive/MyDrive/Touche/2020/Topics.csv")

In [None]:
# Load the 2020 documents table; its `old_id` column is used later as the join key for `doc`.
docs_2020=pd.read_csv("/content/drive/MyDrive/Touche/2020/docs_2020.csv")

In [None]:
# Attach the document columns to each relevance row (new_rel_2020.doc == docs_2020.old_id),
# then drop the `no`, `Unnamed: 0` and `doc` columns.
merged_dataset=new_rel_2020.join(docs_2020.set_index('old_id'),on="doc").drop(["no","Unnamed: 0","doc"], axis=1)

In [None]:
# Attach the topic columns to each row (merged_dataset.qid == topics_2020.Number), then drop `qid`.
# NOTE(review): `qid` was cast to str earlier — confirm `Number` has a matching dtype for the join.
merged_dataset=merged_dataset.join(topics_2020.set_index("Number"),on="qid").drop("qid",axis=1)

In [None]:
# Persist the merged training table. index=False keeps the positional index out
# of the file, so re-reading it does not add a spurious "Unnamed: 0" column.
merged_dataset.to_csv("/content/drive/MyDrive/Touche/2020/merged_new_rel_2020.csv", index=False)

In [None]:
# Reload the merged dataset from the CSV file; this is the same path that is
# passed to fine_tune.py as --triples_path.
merged_dataset=pd.read_csv("/content/drive/MyDrive/Touche/2020/merged_new_rel_2020.csv")