In [1]:
from rank_bm25 import BM25Okapi
import json
import pandas as pd

# Build the BM25 corpus and a parallel metadata list from the preprocessed
# CORD-19 dump. bm25_docs[i] and doc_metadata[i] are appended together, so any
# index produced by the BM25 ranker can be used directly to look up metadata.
bm25_docs = []
doc_metadata = []
# Explicit UTF-8 so decoding does not depend on the platform's default encoding.
with open('preprocessed_cord19.jsonl', encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline at EOF); json.loads rejects them.
        if not line.strip():
            continue
        doc = json.loads(line)
        # The BM25 text is tokenized on whitespace; queries must be split the same way.
        bm25_docs.append(doc['bm25']['combined'].split())
        doc_metadata.append({
            'doc_id': doc['doc_id'],
            'biobert_text': doc['biobert']['combined'],
            'original_title': doc['original_title']
        })

# Fail with a clear message instead of an opaque error from inside the ranker.
if not bm25_docs:
    raise ValueError("No documents were loaded from 'preprocessed_cord19.jsonl'")

bm25 = BM25Okapi(bm25_docs)

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from sklearn.preprocessing import normalize

# Choose the compute backend in priority order: Apple MPS, then CUDA, then CPU.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device.upper()}")

# Tokenizer and encoder are loaded from the same BioBERT checkpoint; the
# encoder is then moved onto the selected device.
tokenizer = AutoTokenizer.from_pretrained("monologg/biobert_v1.1_pubmed")
model = AutoModel.from_pretrained("monologg/biobert_v1.1_pubmed")
model = model.to(device)

def get_embeddings(texts, batch_size=8):
    """Encode texts into L2-normalised first-token embeddings.

    Each text is tokenized (truncated to 512 tokens), passed through the
    module-level ``model`` without gradient tracking, and represented by the
    final-layer hidden state at position 0 (the [CLS] slot for BERT-style
    tokenizers).

    Args:
        texts: A string or an iterable of strings. A single string is treated
            as one text rather than being sliced into character chunks.
        batch_size: Number of texts per forward pass; must be positive.

    Returns:
        A 2-D numpy array with one unit-norm row per input text. For empty
        input the result has shape ``(0, model.config.hidden_size)``.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    # Slicing a bare string would yield character chunks, not texts.
    if isinstance(texts, str):
        texts = [texts]
    # Materialise so tuples, arrays and generators slice into plain lists of str.
    texts = list(texts)

    # np.concatenate rejects an empty list, so return an empty matrix directly.
    if not texts:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        inputs = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        ).to(model.device)

        with torch.no_grad():
            outputs = model(**inputs)

        # Keep only the first-token vector and move it off the accelerator.
        embeddings.append(outputs.last_hidden_state[:,0,:].cpu().numpy())

    # Unit-length rows make a dot product between two embeddings a cosine similarity.
    return normalize(np.concatenate(embeddings), axis=1)

def _min_max_scale(values):
    """Linearly rescale a 1-D array to [0, 1]; a constant array maps to zeros."""
    values = np.asarray(values, dtype=np.float64)
    low = values.min()
    span = values.max() - low
    if span == 0:
        return np.zeros_like(values)
    return (values - low) / span


def hybrid_search(query, top_k=5, first_stage_n=100, bm25_weight=0.7, biobert_weight=0.3):
    """Retrieve candidates with BM25, then re-rank them with BioBERT similarity.

    Stage 1 scores every document with the module-level ``bm25`` index and
    keeps the ``first_stage_n`` best. Stage 2 embeds the query and the
    candidates' ``biobert_text`` with ``get_embeddings`` and computes their
    cosine similarity. Both score lists are min-max scaled over the candidate
    set before being mixed, so the weights express the intended balance
    instead of being dominated by whichever score has the larger raw range.

    Args:
        query: Free-text query. It is lower-cased and split on whitespace for
            BM25, matching how the indexed documents were tokenized.
        top_k: Number of results to return.
        first_stage_n: Number of BM25 candidates passed to the re-ranker.
        bm25_weight: Weight of the scaled BM25 score in the final score.
        biobert_weight: Weight of the scaled cosine similarity.

    Returns:
        A 1-D numpy array of at most ``top_k`` positions into ``doc_metadata``,
        best match first. Empty when ``top_k`` or ``first_stage_n`` is not
        positive or when the index holds no documents.
    """
    tokenized_query = query.lower().split()
    doc_scores = bm25.get_scores(tokenized_query)

    # Clamp explicitly: a slice of [-0:] would select the whole corpus, not nothing.
    n_candidates = min(max(int(first_stage_n), 0), len(doc_scores))
    if n_candidates == 0 or top_k <= 0:
        return np.array([], dtype=np.intp)

    top_indices = np.argsort(doc_scores)[::-1][:n_candidates]

    biobert_texts = [doc_metadata[idx]['biobert_text'] for idx in top_indices]

    # Encode the query through the same helper as the documents so both sides
    # are unit-normalised and the dot product below is a true cosine similarity.
    query_embed = get_embeddings([query])
    doc_embeds = get_embeddings(biobert_texts)
    biobert_scores = np.dot(query_embed, doc_embeds.T).flatten()

    # Raw BM25 scores are unbounded while cosine similarity lies in [-1, 1];
    # bring both onto [0, 1] before applying the weights.
    combined_scores = (
        bm25_weight * _min_max_scale(doc_scores[top_indices])
        + biobert_weight * _min_max_scale(biobert_scores)
    )
    final_indices = top_indices[np.argsort(combined_scores)[::-1][:top_k]]

    return final_indices

tokenizer_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [9]:
# Interactive terminal loop
# Relies only on `bm25` and `doc_metadata` from the loading cell, so it runs on
# a fresh kernel. Both were filled in lockstep, so position i in the BM25 score
# array refers to doc_metadata[i].
import numpy as np  # imported once here rather than on every loop iteration

print(" BM25 Search Tool (type 'exit' to quit)\n")

while True:
    # Strip surrounding whitespace so " exit " still quits.
    query = input("Enter your search query: ").strip()
    if query.lower() == "exit":
        print("Exiting...")
        break
    if not query:
        # Nothing to search for; prompt again instead of scoring an empty query.
        continue

    # Whitespace tokenization, matching how the indexed documents were split.
    tokenized_query = query.lower().split()
    scores = bm25.get_scores(tokenized_query)

    # Get top 5 results
    top_n = np.argsort(scores)[::-1][:5]

    print("\n Top 5 Relevant Documents:\n")
    for idx in top_n:
        meta = doc_metadata[idx]
        print(f"Doc ID: {meta['doc_id']}")
        print(f"Score: {scores[idx]:.4f}")
        # 'biobert_text' is the only document text kept in doc_metadata.
        print(f"Snippet: {meta['biobert_text'][:300]}...\n")
    print("––––––––––––––––––––––––––––––––––––––\n")

 BM25 Search Tool (type 'exit' to quit)



Enter your search query:  effectiveness of face masks against covid



📄 Top 5 Relevant Documents:

Doc ID: tcijnphu
Score: 28.7860
Snippet: effectiveness of cloth masks for protection against severe acute respiratory syndrome coronavirus 2 cloth masks have been used in healthcare and community settings to protect the wearer from respiratory infections. the use of cloth masks during the coronavirus disease (covid-19) pandemic is under de...

Doc ID: xtraspw2
Score: 28.7860
Snippet: effectiveness of cloth masks for protection against severe acute respiratory syndrome coronavirus 2. cloth masks have been used in healthcare and community settings to protect the wearer from respiratory infections. the use of cloth masks during the coronavirus disease (covid-19) pandemic is under d...

Doc ID: ycduncjb
Score: 28.3707
Snippet: disrupting the transmission of influenza a: face masks and ultraviolet light as control measures. in the event of an influenza pandemic, where effective vaccine and antiviral drugs may be lacking, disrupting environmental transmission of

Enter your search query:  symptoms of covid



📄 Top 5 Relevant Documents:

Doc ID: u54kja4g
Score: 19.5744
Snippet: determinants of self-reported symptoms and testing for covid-19 in canada using a nationally representative survey in april 2020, a nationally representative sample of 4, 240 canadians age 18 years and older were polled about covid experience in march, early in the epidemic. we examined determinants...

Doc ID: bzs8qag4
Score: 18.1416
Snippet: informed consent for emergency obstetric care during covid-19 pandemic informed consent process has become a challenging issue before surgery for any emergency obstetric care during this covid pandemic. there is an increased risk of morbidity if there is a need of intensive care unit postoperatively...

Doc ID: avm9lzjq
Score: 17.6554
Snippet: atypical covid -19 presentation in patient undergoing staged taaa repair this report outlines a case of atypical presentation of covid 19 viral infection. a 65-year old male was planned for a two staged repair of a crawford type 3 thorac

Enter your search query:  exit


Exiting...
