In [None]:
!pip install biopython tqdm

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/3.3 MB[0m [31m41.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m48.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [None]:
from Bio import Entrez
from tqdm import tqdm
import time
import json

# Set your email here (required by NCBI)
# NOTE(review): this is a personal address hard-coded in the notebook; consider
# loading it from configuration before sharing or committing the file.
Entrez.email = "Shajahansayyed001@gmail.com"

def fetch_pubmed_articles(query, max_results=50):
    """Search PubMed for ``query`` and return title/abstract records.

    Args:
        query: Search term passed to ``Entrez.esearch``.
        max_results: Maximum number of PubMed IDs to request (``retmax``).

    Returns:
        A list of dicts with plain-string ``"pmid"``, ``"title"`` and
        ``"abstract"`` keys. ``"abstract"`` is the space-joined text of every
        abstract section, or ``""`` when the record has no abstract.

    Notes:
        A failure while fetching or parsing one PMID is printed and that PMID
        is skipped, so the result may be shorter than the number of IDs found.
    """
    # Search PubMed
    handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even if parsing the response fails.
        handle.close()

    id_list = record["IdList"]
    print(f"Found {len(id_list)} articles.")

    articles = []

    # Fetch details one PMID at a time so a single bad record cannot abort the run.
    for pmid in tqdm(id_list):
        try:
            handle = Entrez.efetch(db="pubmed", id=pmid, rettype="abstract", retmode="xml")
            try:
                records = Entrez.read(handle)
            finally:
                handle.close()

            for article in records["PubmedArticle"]:
                citation = article["MedlineCitation"]["Article"]
                title = str(citation["ArticleTitle"])

                # AbstractText is a list of sections (e.g. BACKGROUND, METHODS,
                # RESULTS). Keep all of them instead of only the first, and
                # tolerate a missing or empty list.
                sections = citation.get("Abstract", {}).get("AbstractText", [])
                if isinstance(sections, str):
                    sections = [sections]
                abstract = " ".join(str(section) for section in sections)

                articles.append({
                    "pmid": str(pmid),
                    "title": title,
                    "abstract": abstract,
                })
        except Exception as e:
            print(f"Error fetching {pmid}: {e}")
        finally:
            # Throttle after every request, including failed ones, so repeated
            # errors do not turn into a burst of back-to-back calls.
            time.sleep(0.5)

    return articles

# Save to file
def save_articles(articles, filename="pubmed_results.json"):
    """Write ``articles`` to ``filename`` as JSON indented by two spaces.

    The file is opened in text write mode, so any existing content is replaced.
    """
    encoder = json.JSONEncoder(indent=2)
    with open(filename, "w") as out_file:
        # Stream the encoded pieces straight to disk.
        out_file.writelines(encoder.iterencode(articles))

# Example usage
if __name__ == "__main__":
    # Free-text symptom list used as the PubMed search term.
    query = "weight loss night sweats persistent cough"
    # Fetch up to 50 matching articles, then write them to save_articles' default file.
    results = fetch_pubmed_articles(query, max_results=50)
    save_articles(results)
    # This count can be lower than the "Found N" figure: PMIDs whose fetch or
    # parse raised are reported and skipped by fetch_pubmed_articles.
    print(f"Saved {len(results)} articles.")

Found 50 articles.


100%|██████████| 50/50 [00:44<00:00,  1.13it/s]

Saved 49 articles.





In [None]:
!pip install sentence-transformers faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_6

In [None]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-1.0.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi==0.115.9 (from chromadb)
  Downloading fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.23.0-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.21.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentele

In [None]:
import json

def load_articles(filename="/content/pubmed_results.json"):
    """Return the parsed JSON content of ``filename``."""
    with open(filename, "r") as source:
        return json.load(source)

# Read the article list from load_articles' default JSON path into the
# notebook-level `articles` variable used by the following cells.
articles = load_articles()


In [None]:
def chunk_articles(articles):
    """Turn article records into embedding inputs plus parallel metadata.

    Each article with a non-blank ``"abstract"`` contributes one chunk (the
    abstract text, unchanged) and one metadata dict holding its ``"pmid"``
    and ``"title"``. Articles whose abstract is empty or whitespace-only are
    dropped.

    Returns:
        ``(chunks, metadata)`` as two lists of equal length and matching order.
    """
    with_abstract = [entry for entry in articles if entry["abstract"].strip()]
    chunks = [entry["abstract"] for entry in with_abstract]
    metadata = [
        {"pmid": entry["pmid"], "title": entry["title"]}
        for entry in with_abstract
    ]
    return chunks, metadata

# Build the embedding inputs (abstract texts) and their parallel metadata;
# position i in `chunks` and position i in `metadata` describe the same article.
chunks, metadata = chunk_articles(articles)
print(f"Total chunks: {len(chunks)}")

Total chunks: 49


In [None]:
from sentence_transformers import SentenceTransformer

import os

# Take the Hugging Face token from the environment instead of embedding it in
# the notebook. An unset or empty variable becomes None, i.e. anonymous access,
# which is enough for a public model.
hf_token = os.environ.get("HF_TOKEN") or None

model = SentenceTransformer("pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb", token=hf_token)
# Encode every abstract chunk; the encoder is given `chunks` in order, so the
# embeddings line up with `chunks` and `metadata` by position.
embeddings = model.encode(chunks, show_progress_bar=True)


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/691 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/412 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
import faiss
import numpy as np
import pickle

# FAISS expects a C-contiguous float32 matrix of shape (n_vectors, dimension);
# enforce that explicitly rather than relying on whatever the encoder returned.
embedding_matrix = np.ascontiguousarray(embeddings, dtype=np.float32)
if embedding_matrix.ndim != 2 or embedding_matrix.shape[0] == 0:
    raise ValueError(
        f"Expected a non-empty 2-D embedding matrix, got shape {embedding_matrix.shape}"
    )
# Search results are positions in the index; they are only meaningful if they
# line up one-to-one with the metadata list saved next to it.
if embedding_matrix.shape[0] != len(metadata):
    raise ValueError(
        f"{embedding_matrix.shape[0]} embeddings but {len(metadata)} metadata entries"
    )

# Create FAISS index (exact L2 search over all vectors)
dimension = embedding_matrix.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embedding_matrix)

# Save index + metadata
# Note: the chunk texts themselves are not persisted here, only the vectors
# and the pmid/title metadata.
faiss.write_index(index, "pubmed_faiss.index")
with open("pubmed_metadata.pkl", "wb") as f:
    pickle.dump(metadata, f)

print("Vector index and metadata saved!")


Vector index and metadata saved!


In [None]:
import faiss
import pickle
from sentence_transformers import SentenceTransformer

# Load index
index = faiss.read_index("pubmed_faiss.index")

# Load metadata
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a pickle that this notebook wrote itself.
# NOTE(review): only `index` and `metadata` are restored here. The search cell
# also reads `chunks` and `model`, which must still be in memory from earlier
# cells for it to work.
with open("pubmed_metadata.pkl", "rb") as f:
    metadata = pickle.load(f)

In [None]:
def search_similar_chunks(query, k=5):
    """Return up to ``k`` stored chunks most similar to ``query``.

    Relies on the notebook-level ``model`` (encoder), ``index`` (vector
    index), ``metadata`` and ``chunks`` (parallel lists ordered like the
    index rows).

    Args:
        query: Text to embed and search for.
        k: Number of nearest neighbours to request from the index.

    Returns:
        A list of ``{"title": ..., "text": ...}`` dicts in the order the index
        returned them. It may hold fewer than ``k`` items.
    """
    query_embedding = model.encode([query])
    _distances, indices = index.search(query_embedding, k)

    results = []
    for raw_idx in indices[0]:
        idx = int(raw_idx)
        # FAISS pads the result with -1 when it has fewer than k neighbours.
        # A negative value would otherwise index from the END of the lists and
        # return an unrelated document, so reject anything outside [0, len).
        if not 0 <= idx < len(metadata):
            continue
        results.append({
            "title": metadata[idx]["title"],
            "text": chunks[idx],
        })
    return results

In [None]:
# Clinical question used as the retrieval query.
query = "What are the possible diagnoses for a patient with unexplained weight loss, night sweats, and persistent cough?"
# Retrieve the closest stored abstracts (search_similar_chunks defaults to k=5).
docs = search_similar_chunks(query)

# Show each hit's title and the first 300 characters of its text.
for i, doc in enumerate(docs):
    print(f"\n--- Document {i+1} ---")
    print(f"Title: {doc['title']}")
    print(f"Text: {doc['text'][:300]}...")



--- Document 1 ---
Title: Intractable Cough Associated With Renal Cell Carcinoma.
Text: Renal cell carcinoma (RCC) is known to cause abdominal pain, hematuria, flank pain, fevers, night sweats, and weight loss, but its association with paraneoplastic syndromes such as intractable cough is rare. Here, we present the case of an 86-year-old female who presented with a persistent dry cough...

--- Document 2 ---
Title: <i>Streptococcus mutans</i> endocarditis resulting in severe aortic and mitral valve dysfunction and congestive heart failure.
Text: A patient in his 40s with no known cardiac history presented to the emergency department with midsternal chest pain worse on inspiration for the past 1 week. He also complains of recent weight loss, dry cough and night sweats during this time. He describes significant dental pain as well. Electrocar...

--- Document 3 ---
Title: A 70-Year-Old Man With Cough and Recurrent Respiratory Infections.
Text: A 70-year-old man was referred for evaluati

In [None]:
!pip install huggingface_hub



In [None]:
from huggingface_hub import InferenceClient

import os

# Take the Hugging Face token from the environment instead of embedding it in
# the notebook; an unset or empty variable becomes None so the client falls
# back to its default credential lookup.
client = InferenceClient(
    "mistralai/Mistral-7B-Instruct-v0.1",  # Change this to llama, deepseek, etc.
    token=os.environ.get("HF_TOKEN") or None,
)
# Question sent to the LLM by the generation cell below.
query = "What are the possible diagnoses for a patient who is 8 years old with unexplained weight loss, night sweats, and persistent cough?"
def generate_answer(query, docs):
    """Ask the hosted LLM for differential diagnoses based on ``docs``.

    Args:
        query: The clinical question inserted into the prompt.
        docs: Iterable of dicts; each dict's ``"text"`` value becomes one
            passage of the prompt's context section.

    Returns:
        Whatever the notebook-level ``client.text_generation`` call returns
        for the assembled prompt (up to 500 new tokens, temperature 0.5).
    """
    # Passages are separated from each other by one blank line.
    context = "\n\n".join(doc["text"] for doc in docs)

    prompt = f"""You are a medical assistant.
                Use the following medical literature to suggest possible diagnoses.

                Question:
                {query}

                Context:
                {context}

                Based on the above, provide:
                1. Top 3 differential diagnoses
                2. Supporting evidence from retrieved documents
                3. Suggested next diagnostic steps

            """

    return client.text_generation(prompt, max_new_tokens=500, temperature=0.5)

# Try it
# Retrieve context for the question actually being asked. The `docs` left over
# from the earlier search cell were retrieved for a different query string, so
# reusing them would pair this question with stale context.
docs = search_similar_chunks(query)
response = generate_answer(query, docs)
print("\n📋 Final Answer:\n", response)



📋 Final Answer:
 1. Top 3 differential diagnoses
            a. Renal cell carcinoma (RCC)
            b. Paraneoplastic syndromes
            c. Cardiac disease (valvular regurgitation, aortic dissection)

            2. Supporting evidence from retrieved documents
            a. A patient who is 8 years old with unexplained weight loss, night sweats, and persistent cough, and who has been diagnosed with RCC.
            b. A patient who is 70 years old with recurrent respiratory infections requiring antibiotics and chronic cough, and who has been diagnosed with alpha-1 antitrypsin deficiency.
            c. A patient who is 54 years old with subacute onset of chest pain, shortness of breath, productive cough with haemoptysis, and night sweats, and who has been diagnosed with acute COPD exacerbation.

            3. Suggested next diagnostic steps
            a. For the patient with RCC, further imaging and biopsy may be necessary to assess the extent of the disease and to determine 