In [None]:
import ir_datasets
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import fasttext
from concurrent.futures import ThreadPoolExecutor
import re
import json
import pickle
import nltk

##### Initialize Preprocessing

In [30]:
# Shared, module-level resources used by the cleaning functions below.
ps = PorterStemmer()

# The NLTK stopword list is a separately downloaded corpus. On an environment
# where it has never been fetched, stopwords.words() raises LookupError, which
# would break a fresh "Restart & Run All". Download it once and retry.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

# Model used by is_english() to label the language of a text. It is loaded
# from a relative path, so the file has to be reachable from the working
# directory the kernel runs in.
language_model = fasttext.load_model('lid.176.ftz')

# Tokens: runs of word characters and apostrophes delimited by word boundaries.
word_pattern = re.compile(r"\b[\w']+\b")
# Any single non-word character.
# NOTE(review): not referenced by the functions visible in this part of the notebook.
bm25_clean = re.compile(r'[^\w]')
# Everything except word characters, hyphen, apostrophe and period.
biobert_clean = re.compile(r'[^\w\-\'\.]')
# One or more consecutive whitespace characters.
multi_space = re.compile(r'\s+')

##### Cleaning

In [32]:
def regex_tokenize(text):
    """Lower-case ``text`` and return all ``word_pattern`` matches, in order.

    Returns:
        list[str]: the matched tokens; empty when nothing matches.
    """
    lowered = text.lower()
    return word_pattern.findall(lowered)

def process_bm25(text):
    """Produce the BM25 form of ``text``: filtered, stemmed tokens joined by spaces.

    Tokens come from ``regex_tokenize``. A token is dropped when it is a stop
    word or has two characters or fewer; the test is applied to the raw token,
    before stemming. Surviving tokens are stemmed with ``ps``.
    """
    stems = []
    for token in regex_tokenize(text):
        if token in stop_words or len(token) <= 2:
            continue
        stems.append(ps.stem(token))
    return ' '.join(stems)

def process_biobert(text):
    """Produce the BioBERT form of ``text``: lower-cased, lightly scrubbed prose.

    Every character matched by ``biobert_clean`` (anything other than word
    characters, '-', "'" and '.') becomes a space; runs of spaces are then
    collapsed and the ends trimmed.
    """
    lowered = text.lower()
    scrubbed = biobert_clean.sub(' ', lowered)
    # After the substitution the only whitespace left is the ASCII space (all
    # whitespace characters are matched by biobert_clean), so split/join
    # collapses runs and trims both ends in one step.
    return ' '.join(scrubbed.split())

def is_english(text):
    """Return True when ``language_model``'s top label for ``text`` is English.

    Only the first 500 characters of ``text`` are classified.

    Args:
        text: the string to classify; ``None`` and empty / whitespace-only
            strings are reported as not English.

    Returns:
        bool: True if the top prediction is ``'__label__en'``, else False.
        Any error raised by the model is treated as "not English".
    """
    if not text:
        return False

    # fastText's predict() processes one line at a time and raises ValueError
    # when the input contains '\n'. That error used to be swallowed by a bare
    # except, so every multi-line abstract or section was silently reported as
    # non-English. Collapse all whitespace (newlines included) before predicting.
    sample = ' '.join(text[:500].split())
    if not sample:
        return False

    try:
        labels, _ = language_model.predict(sample, k=1)
    except Exception:
        # Still best-effort, but no longer traps KeyboardInterrupt / SystemExit
        # the way a bare ``except:`` does.
        return False

    return bool(labels) and labels[0] == '__label__en'

##### Main Preprocessing Pipeline

In [33]:
def process_document(doc):
    """Clean one document into its BM25 and BioBERT text representations.

    Args:
        doc: a document record exposing ``doc_id``, ``title``, ``abstract`` and
            ``body`` (a sliceable sequence of sections, each with a ``text``
            attribute), as yielded by the dataset iterator in ``main``.

    Returns:
        dict | None: ``None`` when the language sample is not classified as
        English by ``is_english``. Otherwise a dict with keys ``'doc_id'``,
        ``'original_title'``, ``'bm25'`` and ``'biobert'``; the last two are
        dicts with ``'title'``, ``'abstract'``, ``'body'`` and ``'combined'``
        (the three fields joined, whitespace-normalised).
    """
    title = doc.title or ''
    abstract = doc.abstract or ''
    body_texts = [section.text for section in doc.body]

    # Language sample: the abstract plus the first two body sections. Parts are
    # joined with a space; previously the abstract was concatenated directly
    # onto the first section, fusing its last word with the section's first.
    sample_text = ' '.join(part for part in [abstract, *body_texts[:2]] if part)
    if not sample_text:
        # A record with neither abstract nor body used to produce an empty
        # sample and was therefore always discarded as "not English"; judge
        # such records by their title instead.
        sample_text = title
    if not is_english(sample_text):
        return None

    processed = {
        'doc_id': doc.doc_id,
        'original_title': title.strip(),
    }

    # Same field layout for both models; only the cleaning function differs.
    for model, clean in (('bm25', process_bm25), ('biobert', process_biobert)):
        fields = {
            'title': clean(title),
            'abstract': clean(abstract) if abstract else '',
            'body': ' '.join(clean(text) for text in body_texts),
        }
        combined = ' '.join([fields['title'], fields['abstract'], fields['body']])
        # Empty fields leave doubled / leading / trailing spaces behind.
        fields['combined'] = multi_space.sub(' ', combined).strip()
        processed[model] = fields

    return processed

def main():
    """Run the preprocessing pipeline over the whole corpus.

    Loads the ``'cord19/fulltext/trec-covid'`` dataset through ``ir_datasets``,
    passes every document to ``process_document`` and writes each non-empty
    result as one JSON object per line to ``preprocessed_cord19.jsonl``
    (the file is overwritten). Documents for which ``process_document``
    returns a falsy value are skipped.
    """
    corpus = ir_datasets.load('cord19/fulltext/trec-covid')

    with open('preprocessed_cord19.jsonl', 'w') as sink:
        # map() is lazy, so documents are still processed and written one at a time.
        for record in map(process_document, corpus.docs_iter()):
            if not record:
                continue
            sink.write(json.dumps(record) + '\n')


# Runs the pipeline when this code is executed as the main module, which
# includes running this cell in a notebook kernel.
if __name__ == '__main__':
    main()

In [None]:
# Sanity check: print the first 11 records (i = 0..10) of the JSONL file that
# main() writes, one parsed record per output line.
# NOTE(review): each record is printed in full, including the cleaned body
# text for both models, so this cell's output is very long.
with open("preprocessed_cord19.jsonl", "r") as f:
    for i, line in enumerate(f):
        if i > 10: break 
        obj = json.loads(line)
        print(f"[{i}] {obj}")

[0] {'doc_id': 'ug7v899j', 'original_title': 'Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia', 'bm25': {'title': 'clinic featur cultur proven mycoplasma pneumonia infect king abdulaziz univers hospit jeddah saudi arabia', 'abstract': 'object retrospect chart review describ epidemiolog clinic featur patient cultur proven mycoplasma pneumonia infect king abdulaziz univers hospit jeddah saudi arabia method patient posit pneumonia cultur respiratori specimen januari 1997 decemb 1998 identifi microbiolog record chart patient review result patient identifi requir admiss infect commun acquir infect affect age group common infant pre school children occur year round common fall spring three quarter patient comorbid twenti four isol associ pneumonia upper respiratori tract infect bronchiol cough fever malais common symptom crepit wheez common sign patient pneumonia crepit bronchial breath immunocompromis patient l