1.Dataset Familiarization and Preprocessing

In [1]:
pip install pandas numpy faiss-cpu rank-bm25 sentence-transformers

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m51.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Downloading sentence_transformers-3.4.1-py3-none-any.whl (275 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.9/275.9 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rank-bm25, faiss-cpu, sentence-transformers
Successfully installed faiss-cpu-1.10.0 rank-bm25-0.2.2 sentence-transformers-3.4.1


In [2]:
pip install opencv-python

Collecting opencv-python
  Downloading opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Downloading opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (63.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.0/63.0 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: opencv-python
Successfully installed opencv-python-4.11.0.86


In [None]:
import pandas as pd
import json
import re
import torch
from sentence_transformers import SentenceTransformer

# Load Data
diagnosis_df = pd.read_csv("diagnosis_data.csv")
medical_df = pd.read_csv("medical_dataset.csv")

# Handle Missing Values
diagnosis_df.fillna("{}", inplace=True)  # Replace NaNs with empty dictionaries
medical_df.fillna("", inplace=True)  # Replace NaNs with empty strings

# Parse JSON-like fields in diagnosis_data.csv
diagnosis_df['diagnostic'] = diagnosis_df['diagnostic'].apply(lambda x: json.loads(x) if isinstance(x, str) else {})
diagnosis_df['knowledge'] = diagnosis_df['knowledge'].apply(lambda x: json.loads(x) if isinstance(x, str) else {})

# Combine Relevant Text Fields in medical_dataset.csv
medical_df['combined_text'] = medical_df[['input1', 'input2', 'input3', 'input4', 'input5', 'input6']].agg(' '.join, axis=1)
medical_df.drop(columns=['input1', 'input2', 'input3', 'input4', 'input5', 'input6'], inplace=True)

# Text Cleaning & Tokenization Function
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    return text.strip()

# Apply Tokenization
medical_df['processed_text'] = medical_df['combined_text'].apply(preprocess_text)

# Load Sentence Transformer Model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate Embeddings
medical_df['embedding'] = medical_df['processed_text'].apply(lambda x: model.encode(x, convert_to_tensor=True))

# Save Preprocessed Data (Optional)
medical_df.to_pickle("preprocessed_medical_data.pkl")  # Save as a binary file for efficiency
diagnosis_df.to_pickle("preprocessed_diagnosis_data.pkl")

print("Preprocessing completed successfully!")


Preprocessing completed successfully!


2.Designing the Retrieval Component


In [None]:
import pandas as pd

# Load Data
medical_df = pd.read_pickle("preprocessed_medical_data.pkl")
diagnosis_df = pd.read_pickle("preprocessed_diagnosis_data.pkl")

# Print Available Columns
print("🔹 Medical Dataset Columns:", medical_df.columns.tolist())
print("🔹 Diagnosis Dataset Columns:", diagnosis_df.columns.tolist())


🔹 Medical Dataset Columns: ['diagnosis', 'combined_text', 'processed_text', 'embedding']
🔹 Diagnosis Dataset Columns: ['diagnosis_label', 'diagnostic', 'knowledge']


In [3]:
import pandas as pd
import faiss
import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer

# ✅ Load Data
medical_df = pd.read_pickle("preprocessed_medical_data.pkl")
diagnosis_df = pd.read_pickle("preprocessed_diagnosis_data.pkl")

# ✅ Rename 'diagnosis_label' to 'diagnosis' in diagnosis_df
diagnosis_df.rename(columns={'diagnosis_label': 'diagnosis'}, inplace=True)

# ✅ Merge on 'diagnosis'
merged_df = medical_df.merge(diagnosis_df, on="diagnosis", how="left")

# ✅ Fill Missing Values
merged_df.fillna("", inplace=True)

# ✅ Combine Relevant Information for Retrieval
merged_df['combined_text'] = merged_df[['diagnosis', 'diagnostic', 'knowledge']].astype(str).agg(' '.join, axis=1)

# ✅ Tokenize for BM25
bm25_corpus = [text.split() for text in merged_df['combined_text']]
bm25 = BM25Okapi(bm25_corpus)

# ✅ Load Dense Embedding Model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# ✅ Convert Text Data to Embeddings
embeddings = np.array([embedding_model.encode(text, convert_to_tensor=False) for text in merged_df['combined_text']])

# ✅ FAISS Index for Efficient Dense Retrieval
d = embeddings.shape[1]  # Embedding dimension
faiss_index = faiss.IndexFlatL2(d)
faiss_index.add(embeddings)

# 🔹 **Hybrid Retrieval Function**
def retrieve_documents(query, top_n=5):
    query_tokens = query.lower().split()  # Tokenized query for BM25
    query_embedding = embedding_model.encode(query, convert_to_tensor=False).reshape(1, -1)

    # ✅ BM25 Retrieval
    bm25_scores = bm25.get_scores(query_tokens)
    bm25_top_n = np.argsort(bm25_scores)[::-1][:top_n]

    # ✅ FAISS Dense Retrieval
    _, faiss_top_n = faiss_index.search(query_embedding, top_n)

    # ✅ Combine Results
    retrieved_docs = set(bm25_top_n) | set(faiss_top_n[0])

    # ✅ Extracting Information from Retrieved Docs
    retrieved_data = merged_df.iloc[list(retrieved_docs)]

    return retrieved_data[['diagnosis', 'diagnostic', 'knowledge']]  # Using available columns

# 🔹 Example Query
query = "Diabetic patient with foot pain and numbness"
retrieved_results = retrieve_documents(query)

print("🔹 Retrieved Results:")
print(retrieved_results)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

🔹 Retrieved Results:
        diagnosis diagnostic knowledge
0    Hypertension                     
261      Diabetes                     
262      Diabetes                     
263      Diabetes                     
264      Diabetes                     
265      Diabetes                     
507        Asthma                     
508        Asthma                     
509        Asthma                     
510        Asthma                     


3. Integrating the Generative Model

In [5]:
!pip install sacremoses


Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/897.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.6/897.5 kB[0m [31m4.4 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m890.9/897.5 kB[0m [31m13.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [6]:
# ✅ Import Necessary Libraries
import pandas as pd
import faiss
import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer

# ✅ Load Medical LLM (BioGPT-Large)
model_name = "microsoft/BioGPT-Large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
generator = AutoModelForCausalLM.from_pretrained(model_name)

# ✅ Load Preprocessed Data
medical_df = pd.read_pickle("preprocessed_medical_data.pkl")
diagnosis_df = pd.read_pickle("preprocessed_diagnosis_data.pkl")

# ✅ Combine Available Information
medical_df['combined_text'] = medical_df[['diagnosis', 'combined_text']].astype(str).agg(' '.join, axis=1)

# ✅ Tokenize for BM25
bm25_corpus = [text.split() for text in medical_df['combined_text']]
bm25 = BM25Okapi(bm25_corpus)

# ✅ Load Dense Embedding Model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# ✅ Convert Text Data to Embeddings
embeddings = np.array([embedding_model.encode(text, convert_to_tensor=False) for text in medical_df['combined_text']])

# ✅ FAISS Index for Efficient Dense Retrieval
d = embeddings.shape[1]  # Embedding dimension
faiss_index = faiss.IndexFlatL2(d)
faiss_index.add(embeddings)

# ✅ Hybrid Retrieval Function
def retrieve_documents(query, top_n=5):
    query_tokens = query.lower().split()  # Tokenized query for BM25
    query_embedding = embedding_model.encode(query, convert_to_tensor=False).reshape(1, -1)

    # ✅ BM25 Retrieval
    bm25_scores = bm25.get_scores(query_tokens)
    bm25_top_n = np.argsort(bm25_scores)[::-1][:top_n]

    # ✅ FAISS Dense Retrieval
    _, faiss_top_n = faiss_index.search(query_embedding, top_n)

    # ✅ Combine Results
    retrieved_docs = set(bm25_top_n) | set(faiss_top_n[0])

    # ✅ Extracting Information from Retrieved Docs
    retrieved_data = medical_df.iloc[list(retrieved_docs)]

    return retrieved_data[['diagnosis', 'combined_text']]  # Using available columns

# ✅ Generate Structured Medical Report
def generate_medical_summary(user_query, retrieved_docs):
    prompt = f"""
    You are a medical AI assistant providing structured reports based on retrieved medical records.
    Given the following information, generate a structured summary.

    **User Query:** {user_query}

    **Retrieved Medical Records:**
    {retrieved_docs.to_string(index=False)}

    **Structured Report:**
    - **Diagnosis:** (Extract from retrieved records)
    - **Symptoms:** (Extract from combined_text)
    - **Medical Details:** (Extract relevant knowledge)
    - **Treatment & Cure:** (Infer based on medical details)
    - **Physical Examination Findings:** (If available, extract from records)

    Generate a professional and well-structured report based on the retrieved information.
    """

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
    output = generator.generate(**inputs, max_new_tokens=300, do_sample=True, temperature=0.7)

    return tokenizer.decode(output[0], skip_special_tokens=True)

# ✅ Example Query
query = "Patient experiencing persistent headaches and dizziness"

# ✅ Retrieve Relevant Records
retrieved_results = retrieve_documents(query)

# ✅ Generate Structured Summary
summary = generate_medical_summary(query, retrieved_results)

# ✅ Display Results
print("🔹 **Generated Medical Report:**")
print(summary)


config.json:   0%|          | 0.00/658 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/6.29G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.28G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

🔹 **Generated Medical Report:**
You are a medical AI assistant providing structured reports based on retrieved medical records. Given the following information, generate a structured summary. * * User Query: * * Patient experiencing persistent headaches and dizziness * * Retrieved Medical Records: * * diagnosis combined _ text Stroke Stroke dizzyness\ n He is a pleasant male with no significant past medical history. On evening, the patient became dizzy and the room was spinning when he arose from sitting. he described a right frontal headache located primarily behind his eye. He didn't think much of it and went to bed. On the following morning, the patient ate breakfast and felt fairly normal. Shortly thereafter, he again felt dizzy (room spinning), became nauseated and vomited. He laid down on the floor to try and relax. Because the patient didn't feel better, he called his son to take him to hospital. Since that time, he complains of an intermittent headache located in the posterior 