In [None]:
# Mount Google Drive inside the Colab VM at /content/drive so later cells can
# read and write project files there. force_remount=True is passed so the call
# also goes through when a mount is already present.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
import os
# Project directory on the mounted Drive; later cells build their paths from it.
root_dir = "/content/drive/My Drive/BS17B025_DDP/GPL/labelled_data"
# Make root_dir the working directory so relative paths resolve against it.
os.chdir(root_dir)
# Shell escape: print the working directory to confirm the chdir.
!pwd

In [None]:
# Create the working folders `_corpus0` ... `_corpus99` under root_dir.
# exist_ok=True keeps the cell re-runnable: without it os.makedirs raises
# FileExistsError as soon as the cell is executed a second time.
folders = [f'_corpus{i}' for i in range(100)]
for folder in folders:
    path = os.path.join(root_dir, folder)
    os.makedirs(path, exist_ok=True)

In [None]:
!pip install datasets -q
!pip install huggingface_hub -q

In [None]:
# Interactive Hugging Face Hub login; the dataset load below passes
# use_auth_token=True, so a login is needed first.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
from datasets import load_dataset
# Open the 'train' split of the PubMed raw-text dataset in streaming mode, so
# records are iterated rather than materialised up front.
# use_auth_token=True presumably picks up the token saved by notebook_login() — confirm.
pubmed = load_dataset('ddp-iitm/pubmed_raw_text_v3', use_auth_token=True, streaming=True, split='train')

In [None]:
import re

# Characters that are KEPT: letters, digits, comma, space, period, parentheses.
# Everything else is stripped. re.I lets the a-z range cover upper case too.
pattern = r"[^a-z, 0-9.()]"
re_pattern = re.compile(pattern, re.I)
# Any run of whitespace (spaces, tabs, newlines, ...).
_whitespace_re = re.compile(r"\s+")

def cleanup(example,idx):
    """Normalise one raw record into the corpus schema ``{text, _id, title}``.

    Whitespace is normalised BEFORE disallowed characters are stripped.
    Previously the regex deleted tabs and newlines first, which made the
    later tab/newline replacements dead code and glued together words that
    were only separated by a line break. Space runs are then collapsed fully
    (the old single ``replace("  ", " ")`` pass left runs of 3+ spaces behind).

    Args:
        example: mapping with a ``'text'`` string.
        idx: position of the record in the dataset; stored as the string ``_id``.

    Returns:
        dict with keys ``'text'`` (cleaned), ``'_id'`` (``str(idx)``) and
        ``'title'`` (always the empty string).
    """
    text = _whitespace_re.sub(" ", example['text'])
    text = re_pattern.sub("", text)
    # Stripping characters can leave adjacent spaces behind ("a - b" -> "a  b").
    text = _whitespace_re.sub(" ", text).strip()

    modified_example = {}
    modified_example['text'] = text
    modified_example['_id'] = str(idx)
    modified_example['title'] = ''

    return modified_example

In [None]:
# Apply cleanup to every record; with_indices=True supplies the second (idx)
# argument that cleanup stores as '_id'.
# NOTE(review): this rebinds `pubmed`, so re-running the cell maps the already-mapped stream again.
pubmed = pubmed.map(cleanup, with_indices=True)

In [None]:
# Display the dataset's info/metadata object as the cell output.
pubmed.info

In [None]:
# Working subset: the first 3000 records of the cleaned stream.
pm = pubmed.take(3000)

In [None]:
# Pull the first cleaned record for manual inspection.
p = next(iter(pubmed))

## Writing corpus

In [None]:
import json
from tqdm import tqdm

# Folder that holds the corpus shards (corpus<N>.jsonl) written below.
# The absolute path is built first and exactly that directory is created, so
# the cell no longer relies on the current working directory being root_dir;
# exist_ok=True makes it safe to re-run.
data_folder = os.path.join(root_dir, 'corpus_data')
os.makedirs(data_folder, exist_ok=True)

In [None]:
# Display the resolved corpus output directory.
data_folder

In [None]:
import time
# Quick check that tqdm renders in this environment: ten steps of 10 with a
# short pause between them. The context manager closes the bar when the loop
# finishes, so no open progress bar is left behind.
with tqdm(total=100) as pbar:
    for i in range(10):
        pbar.update(10)
        time.sleep(.1)

In [None]:
# Rebind pm to a 10-record slice (replaces the 3000-record slice bound earlier).
pm = pubmed.take(10)

In [None]:
def load_batches(generator, batch_size, total=2340483):
    """Stream ``generator`` to disk in shards of ``batch_size`` examples.

    Examples are buffered and handed to ``write_to_jsonl(buffer, shard_index)``
    each time the buffer reaches ``batch_size``; a final partial buffer is
    written as the last shard.

    Args:
        generator: iterable of JSON-serialisable examples.
        batch_size: number of examples per shard.
        total: expected number of examples, used only for the progress bar
            (defaults to the value previously hard-coded here).
    """
    buf = []
    shard_idx = 0  # was `id`, which shadowed the builtin
    with tqdm(total=total) as pbar:
        for example in generator:
            buf.append(example)
            if len(buf) == batch_size:
                write_to_jsonl(buf, shard_idx)
                # Advance the bar BEFORE clearing the buffer; updating after
                # the reset always added 0 and the bar never moved.
                pbar.update(len(buf))
                buf = []
                shard_idx += 1
        # Only flush a remainder that exists, so an input whose length is a
        # multiple of batch_size does not produce a trailing empty shard.
        if buf:
            write_to_jsonl(buf, shard_idx)
            pbar.update(len(buf))

def write_to_jsonl(batch, id):
    """Write ``batch`` as JSON Lines to ``corpus<id>.jsonl`` inside ``data_folder``.

    Each example becomes one JSON document on its own line; an existing file
    of the same name is overwritten. A confirmation line is printed afterwards.
    """
    shard_name = f'corpus{id}.jsonl'
    target_path = os.path.join(data_folder, shard_name)
    with open(target_path, 'w', encoding='utf-8') as sink:
        for record in batch:
            sink.write(json.dumps(record))
            sink.write('\n')
    print(f'\n Done writing {shard_name}...')

In [None]:
# Write the whole cleaned stream to disk, 10000 records per corpus<N>.jsonl shard.
load_batches(pubmed, 10000)

In [None]:
# Paths of shards corpus0.jsonl ... corpus99.jsonl inside data_folder.
# NOTE(review): assumes these 100 shards exist on disk — confirm against the actual shard count.
jsonl_files = [os.path.join(data_folder,f'corpus{i}.jsonl') for i in range(100)]

In [None]:
outfile = os.path.join(data_folder, 'corpus1M-1.jsonl')

# Concatenate the shard files into one JSON Lines corpus.
# - iterates `jsonl_files` (the name actually defined above; `jsonl_Files`
#   raised NameError)
# - copies line by line instead of reading each shard fully into memory
# - writes no extra separator: every shard already ends with '\n', so the old
#   additional newline produced blank lines, which are not valid JSONL records
with open(outfile, 'w', encoding='utf-8') as newfile:
    for shard_path in jsonl_files:
        with open(shard_path, encoding='utf-8') as infile:
            for line in infile:
                if not line.strip():
                    continue  # skip stray blank lines
                # Guard against a shard whose last line lacks a newline.
                newfile.write(line if line.endswith('\n') else line + '\n')

In [None]:
import subprocess
from ast import literal_eval

def run(command):
    """Run ``command`` through the shell and print its stdout, stripped.

    Only stdout is captured (stderr is left attached to the parent process);
    the output is decoded as UTF-8. Returns None.
    """
    completed = subprocess.run(command, shell=True, stdout=subprocess.PIPE)
    text = completed.stdout.decode('utf-8')
    print(text.strip())

# Print a short hardware / OS summary of the machine running the notebook.
# Each entry is (heading, shell commands whose output is printed under it).
system_probes = [
    ('# CPU', [
        'cat /proc/cpuinfo | egrep -m 1 "^model name"',
        'cat /proc/cpuinfo | egrep -m 1 "^cpu MHz"',
        'cat /proc/cpuinfo | egrep -m 1 "^cpu cores"',
    ]),
    ('# RAM', ['cat /proc/meminfo | egrep "^MemTotal"']),
    ('# GPU', ['lspci | grep VGA']),
    ('# OS', ['uname -a']),
]

for heading, shell_commands in system_probes:
    print(heading)
    for shell_command in shell_commands:
        run(shell_command)


## Query generation

In [None]:
!pip install datasets -q
!pip install huggingface_hub -q
!pip install transformers -q
!pip install sentencepiece -q
!pip install psutil -q

In [None]:
import json
import time
from tqdm import tqdm

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [None]:
# Checkpoint name of the query-generation (doc2query T5) model and its tokenizer.
model_ckpt ='doc2query/msmarco-t5-base-v1'

# Load the tokenizer and the seq2seq model from that checkpoint into the
# notebook-level names `tokenizer` and `model`.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)

In [None]:
# Shell escape: show the GPU attached to this runtime, if any.
!nvidia-smi

In [None]:
# 10-record slice of the cleaned stream used for the experiments below.
pm = pubmed.take(10)

In [None]:
# Smoke test: wrap the streamed slice in a tqdm bar (total given explicitly as
# 10) and print a marker once per record to confirm iteration works.
pbar = tqdm(iter(pm), total=10)

for e in pbar:
    print('1')

In [None]:
import os
import json
import datasets
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from tqdm import tqdm
import torch, logging

logger = logging.getLogger(__name__)


class QGenModel:
    """Generate synthetic queries for passages with a seq2seq model and save them as JSONL.

    Pipeline used by :meth:`generate`:

    1. :meth:`gen_query_embeddings` samples ``ques_per_passage`` token
       sequences per passage (batched ``map`` step).
    2. :meth:`decode_queries` turns the token sequences into text and assigns
       query ids (batched ``map`` step).
    3. :meth:`save_in_batches` / :meth:`write_to_jsonl` write the result in
       ``queries<N>.jsonl`` shards.

    Both map steps return one list entry per passage so their output lines up
    with the remaining columns of the batch.
    """

    def __init__(
            self,
            model_path: str,
            gen_prefix: str = "",
            use_fast: bool = True,
            device: str = None
        ):
        """Load tokenizer and model from ``model_path`` and move the model to ``device``.

        Args:
            model_path: checkpoint name or path for ``from_pretrained``.
            gen_prefix: stored on the instance; not applied to the input text
                anywhere in this class.
            use_fast: forwarded to ``AutoTokenizer.from_pretrained``.
            device: torch device string; defaults to 'cuda' when available,
                otherwise 'cpu'.
        """
        logger.info("Loading the tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=use_fast)

        logger.info("Loading the model...")
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

        self.gen_prefix = gen_prefix
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        logger.info("Use pytorch device: {}".format(self.device))
        self.model = self.model.to(self.device)

        # Generation defaults, so gen_query_embeddings() can be used directly
        # (e.g. passed to dataset.map) before generate() has been called.
        # generate() overrides them with its arguments.
        self.top_p = 0.95
        self.top_k = 25
        self.max_length = 64
        self.ques_per_passage = 3
        self.query_prefix = "QGen"

    def gen_query_embeddings(self,examples):
        """Sample query token sequences for a batch of passages.

        Args:
            examples: mapping whose ``'text'`` entry is a list of passage
                strings (a single string is treated as a batch of one).

        Returns:
            ``{"embeddings": [...]}`` with one entry per passage; each entry
            is a list of ``ques_per_passage`` token-id lists.
        """
        encodings = self.tokenizer(
            examples['text'],
            padding=True,
            truncation=True,
            max_length = 512,
            return_tensors='pt'
        )
        input_ids = encodings['input_ids'].to(self.device)

        with torch.no_grad():
            outs = self.model.generate(
                input_ids=input_ids,
                # Inputs are padded, so the mask is needed for padding
                # positions to be ignored.
                attention_mask=encodings['attention_mask'].to(self.device),
                do_sample=True,
                max_length=self.max_length,
                top_k=self.top_k,
                top_p=self.top_p,
                num_return_sequences=self.ques_per_passage
            )

        # `outs` has one row per (passage, sample) pair, with the samples of a
        # passage adjacent; regroup to passages x samples x tokens so there is
        # exactly one list entry per input passage.
        num_passages = input_ids.shape[0]
        grouped = outs.cpu().reshape(num_passages, self.ques_per_passage, -1)

        return {"embeddings": grouped.tolist()}


    def generate(
            self,
            corpus: "datasets.IterableDataset",
            num_examples:int,
            output_dir: str,
            top_p: float = 0.95,
            top_k: int = 25,
            max_length: int = 64,
            ques_per_passage: int = 3,
            prefix: str = "QGen",
            batch_size: int = 32,
            save: bool = True,
            save_after: int = 10000
        ):
        """Generate queries for every passage in ``corpus``.

        Args:
            corpus: dataset with ``'_id'``, ``'text'`` and ``'title'`` columns
                supporting batched ``map``.
            num_examples: number of passages in ``corpus``; used as the
                progress-bar total when saving.
            output_dir: directory for the ``queries<N>.jsonl`` shards
                (created if missing).
            top_p, top_k, max_length: sampling parameters passed to the model.
            ques_per_passage: number of queries sampled per passage.
            prefix: string prepended to every generated query id.
            batch_size: passages per generation / decoding batch.
            save: when False, return the mapped dataset instead of writing it.
            save_after: number of passages per output shard.

        Returns:
            The mapped dataset when ``save`` is False, otherwise None.
        """
        self.num_examples = num_examples
        self.output_dir = output_dir
        # No trailing commas here: `x = value,` stores a 1-tuple, not the number.
        self.top_p = top_p
        self.top_k = top_k
        self.max_length = max_length
        self.ques_per_passage = ques_per_passage
        self.query_prefix = prefix

        os.makedirs(self.output_dir, exist_ok = True)

        logger.info("Starting to Generate {} Questions Per Passage using top-p (nucleus) sampling...".format(ques_per_passage))
        logger.info("Params: top_p = {}".format(top_p))
        logger.info("Params: top_k = {}".format(top_k))
        logger.info("Params: max_length = {}".format(max_length))
        logger.info("Params: ques_per_passage = {}".format(ques_per_passage))

        queries = corpus.map(self.gen_query_embeddings, batched=True, batch_size = batch_size, remove_columns=['text', 'title'])

        # Decoding the queries
        queries = queries.map(self.decode_queries, batched=True, batch_size = batch_size, remove_columns=['embeddings'])

        if not save:
            return queries

        self.save_in_batches(queries, save_after)


    def decode_queries(self,examples):
        """Decode sampled token ids to text and assign query ids (batched).

        Args:
            examples: batch with ``'_id'`` (list of passage ids convertible to
                int) and ``'embeddings'`` (output of
                :meth:`gen_query_embeddings`).

        Returns:
            ``{"queries": [...]}`` with one entry per passage; each entry is a
            list of ``{"_id", "text"}`` dicts. Query ids are
            ``<prefix><int(passage_id) * ques_per_passage + k>``.
        """
        queries = []
        for passage_id, token_ids in zip(examples['_id'], examples['embeddings']):
            decoded = self.tokenizer.batch_decode(
                token_ids,
                skip_special_tokens = True
            )
            idx_start = int(passage_id)*self.ques_per_passage
            queries.append([
                {"_id": f'{self.query_prefix}{idx_start + offset}', "text": query}
                for offset, query in enumerate(decoded)
            ])

        return {"queries":queries}

    def save_in_batches(
        self,
        queries,
        save_after: int
        ):
        """Iterate ``queries`` and write a shard every ``save_after`` passages."""
        buffer = []
        shard_num = 0
        # One item per passage flows through here, so the bar counts passages.
        pbar = tqdm(iter(queries), total = self.num_examples)

        for example in pbar:
            buffer.append(example)
            if len(buffer) == save_after:
                self.write_to_jsonl(buffer, shard_num)
                buffer = []
                shard_num += 1
        if len(buffer) != 0:
            self.write_to_jsonl(buffer,shard_num)

    def write_to_jsonl(self, buffer, shard_num):
        """Write the queries of every example in ``buffer`` to ``queries<shard_num>.jsonl``.

        Each query dict is written as one JSON document per line; an existing
        file of the same name is overwritten.
        """
        queries_file = f'queries{shard_num}.jsonl'
        queries_file_path = os.path.join(self.output_dir, queries_file)

        num_queries = sum(len(example['queries']) for example in buffer)
        logger.info(f"Saving {num_queries} Generated Queries to {queries_file}...")

        with open(queries_file_path, 'w', encoding='utf-8') as fOut:
            for example in buffer:
                queries = example['queries']

                for line in queries:
                    json.dump(line,fOut)
                    fOut.write('\n')

        # A Logger object is not callable; use an explicit level method.
        logger.info(f'Done writing {queries_file}...')


In [None]:
# Instantiate the generator and lazily map its generation step over the small
# slice `pm` in batches of 32 (scratch experiment).
m = QGenModel('doc2query/msmarco-t5-base-v1')
p = pm.map(m.gen_query_embeddings, batched=True, batch_size=32)

In [None]:
# First record of the slice, for inspection.
pas = next(iter(pm))

In [None]:
# Inspect the record. Only the last expression of a cell is displayed, so the
# result of pas.keys() is not shown.
pas.keys()
len(pas['_id'])

In [None]:
# Run the generation step directly on the single record.
pa = m.gen_query_embeddings(pas)

In [None]:
# Size of the value stored under the 'embeddings' key of the result.
len(pa['embeddings'])

In [None]:
# Pull the first element of the lazily mapped dataset, which triggers the map function.
next(iter(p))

In [None]:
import os
import argparse


def qgen(
    corpus,
    num_examples,
    output_dir,
    generator_name_or_path="doc2query/msmarco-t5-base-v1",
    ques_per_passage=3,
    bsz=32,
    qgen_prefix="QGen",
    save = True,
    save_after = 100000
):
    """Load a query-generation model and generate queries for ``corpus``.

    Args:
        corpus: dataset of passages handed to ``QGenModel.generate``.
        num_examples: number of passages in ``corpus``.
        output_dir: directory the generated query shards are written to.
        generator_name_or_path: checkpoint for the generation model.
        ques_per_passage: queries generated per passage; higher values tend
            to produce more duplicates.
        bsz: generation batch size.
        qgen_prefix: prefix for generated query ids, separating synthetic
            queries from original ones.
        save: forwarded to ``generate``; when False the generated dataset is
            returned instead of written.
        save_after: forwarded to ``generate`` as the shard size.

    Returns:
        Whatever ``QGenModel.generate`` returns (the queries when ``save`` is
        False, otherwise None).

    Raises:
        RuntimeError: with an actionable message on CUDA out-of-memory; any
            other RuntimeError from generation is re-raised unchanged.
    """
    #### question-generation model loading
    generator = QGenModel(generator_name_or_path)

    #### Sampling parameters are left at QGenModel.generate's defaults.
    #### See https://huggingface.co/blog/how-to-generate
    try:
        return generator.generate(
            corpus,
            num_examples = num_examples,
            output_dir=output_dir,
            ques_per_passage=ques_per_passage,
            prefix=qgen_prefix,
            batch_size=bsz,
            save = save,
            save_after = save_after
        )
    except RuntimeError as e:
        if "CUDA out of memory" in str(e):
            raise RuntimeError(
                f"CUDA out of memory during query generation "
                f"(queries_per_passage: {ques_per_passage}, batch_size_generation: {bsz}). "
                f"Please try smaller `queries_per_passage` and/or `batch_size_generation`."
            ) from e
        # Anything else must propagate: falling out of this handler silently
        # swallowed the error and made a failed run look successful.
        raise

In [None]:
# Output directory for the generated query shards, located under root_dir; displayed for confirmation.
query_folder = os.path.join(root_dir,'queries_data')
query_folder

In [None]:
# Generate queries for the slice `pm` and write them under query_folder.
# NOTE(review): the most recent assignment above is pm = pubmed.take(10), while
# num_examples=3000 matches the earlier take(3000) — confirm which slice is intended.
qgen(pm,num_examples=3000,output_dir=query_folder)

In [None]:
if __name__ == "__main__":
    # Script entry point. The previous call `qgen(args.data_path, args.output_dir)`
    # bound output_dir to `num_examples` and omitted the required `output_dir`,
    # so it failed with a TypeError; it also passed a path string where qgen
    # expects a dataset.
    from datasets import load_dataset

    parser = argparse.ArgumentParser()
    parser.add_argument("--data_path", required=True,
                        help="JSON Lines corpus file with '_id', 'text' and 'title' fields")
    parser.add_argument("--output_dir", required=True,
                        help="directory the generated query shards are written to")
    parser.add_argument("--num_examples", type=int, required=True,
                        help="number of passages in the corpus (progress-bar total)")
    args = parser.parse_args()

    # Assumes --data_path points at a JSON Lines corpus such as the
    # corpus*.jsonl shards written earlier in this notebook — confirm.
    corpus = load_dataset("json", data_files=args.data_path, split="train", streaming=True)
    qgen(corpus, num_examples=args.num_examples, output_dir=args.output_dir)
