In [1]:
import ir_datasets
import pandas as pd
import re 
from transformers import AutoTokenizer
import hashlib
import json
import pickle

##### Load the Dataset

In [2]:
# Identifiers of the corpus and of the tokenizer checkpoint used by this notebook.
DATASET_ID = 'cord19/fulltext/trec-covid'
TOKENIZER_CHECKPOINT = "dmis-lab/biobert-v1.1"

dataset = ir_datasets.load(DATASET_ID)
# NOTE(review): no visible cell reads `docs_iter`; preprocess_dataset calls
# dataset.docs_iter() itself. Kept in case another cell relies on it.
docs_iter = dataset.docs_iter()
bio_tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

##### Text Cleaning

In [3]:
def combine_doc_text(doc):
    """Concatenate a document's title, abstract and body into one string.

    Args:
        doc: Object exposing ``title``, ``abstract`` and ``body`` attributes.
            ``body`` may be a single object with a ``text`` attribute, a list
            or tuple of such section objects, or empty/``None``.

    Returns:
        The non-empty parts joined by single spaces and stripped of
        surrounding whitespace; ``""`` when the document has no text at all.
    """
    title_str = doc.title or ""
    abstract_str = doc.abstract or ""

    body = doc.body
    if not body:
        body_str = ""
    elif hasattr(body, "text"):
        # A single section object. Checked before the sequence case because a
        # named-tuple section is itself a tuple.
        body_str = body.text or ""
    elif isinstance(body, (list, tuple)):
        # Section sequences may be tuples as well as lists; accepting only
        # ``list`` silently discarded the entire body of tuple-backed docs.
        section_texts = (getattr(section, "text", None) for section in body)
        body_str = " ".join(text for text in section_texts if text)
    else:
        body_str = ""

    # Join only the parts that exist so a missing field leaves no double space.
    parts = [part for part in (title_str, abstract_str, body_str) if part]
    return " ".join(parts).strip()

def clean_text(text):
    """Normalize raw document text.

    Applies, in order: removal of bracketed numeric markers of one to three
    digits (e.g. ``[12]``), removal of tokens starting with ``http``,
    collapsing of whitespace runs to a single space, lowercasing, and
    trimming of surrounding whitespace.

    Args:
        text: The raw text to normalize.

    Returns:
        The normalized text.
    """
    # (pattern, replacement) pairs, applied sequentially.
    substitutions = (
        (r'\[[0-9]{1,3}\]', ''),
        (r'http\S+', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.lower().strip()

##### Chunk Long Documents

In [4]:
def chunk_for_biobert(text: str, tokenizer, max_length=512):
    """Split ``text`` into consecutive, separately encoded chunks.

    The text is tokenized once, the token stream is cut into non-overlapping
    windows of ``max_length - 2`` tokens (two positions are reserved for the
    special tokens added to each chunk), and every window is encoded on its
    own.

    Args:
        text: The text to encode.
        tokenizer: A Hugging Face tokenizer providing ``tokenize``,
            ``convert_tokens_to_ids`` and ``prepare_for_model``.
        max_length: Maximum length of each encoded chunk, special tokens
            included. Must be at least 3.

    Returns:
        A list with one encoding per window, in document order; an empty
        list when ``text`` yields no tokens.

    Raises:
        ValueError: If ``max_length`` is smaller than 3.
    """
    window = max_length - 2
    if window < 1:
        # A non-positive window would never advance through the tokens.
        raise ValueError(
            "max_length must be at least 3 to fit special tokens and content"
        )

    tokens = tokenizer.tokenize(text)
    chunks = []
    for start in range(0, len(tokens), window):
        # The window already holds sub-word tokens, so map them straight to
        # ids. Feeding them back through encode_plus with
        # is_split_into_words=True treats every piece as a new word and
        # tokenizes it a second time, which corrupts continuation pieces and
        # can push real content past the truncation limit.
        chunk_ids = tokenizer.convert_tokens_to_ids(tokens[start:start + window])
        encoded = tokenizer.prepare_for_model(
            chunk_ids,
            add_special_tokens=True,
            truncation=True,
            max_length=max_length,
            return_tensors='pt',
            # Keep the leading batch dimension that encode_plus output had.
            prepend_batch_axis=True,
        )
        chunks.append(encoded)
    return chunks

##### Check Duplicates

In [5]:
def deduplicate_text(text_hash_set, text):
    """Report whether ``text`` is new, recording its hash if so.

    Args:
        text_hash_set: Mutable set of MD5 hex digests of texts seen so far;
            updated in place when ``text`` has not been seen.
        text: The text to check.

    Returns:
        ``True`` if the text's digest was not yet in ``text_hash_set`` (it is
        added), ``False`` if it was already present.
    """
    digest = hashlib.md5(text.encode('utf-8')).hexdigest()
    is_new = digest not in text_hash_set
    if is_new:
        text_hash_set.add(digest)
    return is_new

##### Main Preprocessing Pipeline

In [6]:
def preprocess_dataset(dataset, tokenizer, apply_dedup=False):
    """Turn every document of ``dataset`` into a cleaned, chunked record.

    For each document yielded by ``dataset.docs_iter()`` the text fields are
    combined, cleaned and encoded in chunks. Documents whose combined text is
    empty are skipped. With ``apply_dedup`` enabled, documents whose cleaned
    text was already seen earlier in the iteration are skipped too.

    Args:
        dataset: Object providing ``docs_iter()``; each yielded document must
            expose ``doc_id`` plus the attributes read by combine_doc_text.
        tokenizer: Tokenizer forwarded to chunk_for_biobert.
        apply_dedup: Whether to drop documents with repeated cleaned text.

    Returns:
        A list of dicts with keys ``'doc_id'``, ``'cleaned_text'`` and
        ``'biobert_encoded_chunks'``, in iteration order.
    """
    seen_hashes = set()
    output = []

    for document in dataset.docs_iter():
        combined = combine_doc_text(document)
        if combined:
            normalized = clean_text(combined)

            # The hash set is only consulted (and updated) when dedup is on.
            is_duplicate = apply_dedup and not deduplicate_text(seen_hashes, normalized)
            if not is_duplicate:
                encoded_chunks = chunk_for_biobert(normalized, tokenizer, max_length=512)
                output.append({
                    'doc_id': document.doc_id,
                    'cleaned_text': normalized,
                    'biobert_encoded_chunks': encoded_chunks
                })

    return output

In [7]:
# Run the pipeline over the loaded corpus, skipping documents whose cleaned
# text repeats an earlier one.
processed_records = preprocess_dataset(
    dataset=dataset,
    tokenizer=bio_tokenizer,
    apply_dedup=True,
)

Token indices sequence length is longer than the specified maximum sequence length for this model (651 > 512). Running this sequence through the model will result in indexing errors


##### Dataset Storage

In [9]:
# Lightweight export: only the id and cleaned text of each record, as JSON.
records_for_classical_ir = [
    {field: rec[field] for field in ("doc_id", "cleaned_text")}
    for rec in processed_records
]
with open("trec_covid_preprocessed_minimal.json", "w", encoding="utf-8") as json_file:
    json.dump(records_for_classical_ir, json_file, ensure_ascii=False)

# Full export: the complete records, including the encoded chunks, pickled.
with open("trec_covid_preprocessed_full.pkl", "wb") as pickle_file:
    pickle.dump(processed_records, pickle_file)
