## Knowledge Graphs (diagnostic_kg) → JSON files containing diagnostic pathways and medical knowledge.


In [1]:
import json
import os
kg_path = "/kaggle/input/medical-dataset/diagnostic_kg/Diagnosis_flowchart"

# Maps each knowledge-graph file name to the flattened text of its "knowledge"
# section: one "<step> - <key>: <value>" line per nested entry, or
# "<step>: <details>" when the entry is not a dict.
knowledge_graphs = {}

# os.listdir() returns entries in arbitrary order; sorting keeps the dict
# (and everything derived from it) in a reproducible order between runs.
for file in sorted(os.listdir(kg_path)):
    if not file.endswith(".json"):
        continue
    with open(os.path.join(kg_path, file), "r", encoding="utf-8") as f:
        data = json.load(f)

    # Only the "knowledge" section is converted to text. The "diagnostic"
    # section used to be read here but was never used, so it is not loaded.
    knowledge_info = data.get("knowledge", {})
    lines = []
    for step, details in knowledge_info.items():
        if isinstance(details, dict):
            lines.extend(f"{step} - {key}: {value}\n" for key, value in details.items())
        else:
            lines.append(f"{step}: {details}\n")
    knowledge_graphs[file] = "".join(lines)

## Annotated Clinical Notes (samples) → JSON files with real patient records and step-by-step diagnoses.

In [2]:
import os
import json

# Path to clinical notes
sample_path = "/kaggle/input/medical-dataset/samples/Finished"

# clinical_notes[disease_category][subcategory] -> list of note texts, where
# each note is the JSON record rendered as one "key: value" line per
# top-level field.
clinical_notes = {}

# Every directory listing is sorted: os.listdir() order is arbitrary, and the
# position of a note inside its list is later used to build its source id.
for disease_category in sorted(os.listdir(sample_path)):
    disease_path = os.path.join(sample_path, disease_category)
    if not os.path.isdir(disease_path):
        continue  # only folders are disease categories

    clinical_notes[disease_category] = {}

    for subcategory in sorted(os.listdir(disease_path)):
        subcategory_path = os.path.join(disease_path, subcategory)
        if not os.path.isdir(subcategory_path):
            continue  # only folders are subcategories

        notes = []
        for file in sorted(os.listdir(subcategory_path)):
            if not file.endswith(".json"):
                continue
            file_path = os.path.join(subcategory_path, file)
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            notes.append("".join(f"{key}: {value}\n" for key, value in data.items()))

        clinical_notes[disease_category][subcategory] = notes

# Now 'clinical_notes' contains structured data organized by disease category and subcategory


## Clean and Preprocess the Text

In [3]:
import re

def clean_text(text):
    """
    Normalize a piece of text before indexing.

    Steps, in order:
    - collapse every run of consecutive newlines into a single newline
    - drop every character that is not a word character, whitespace,
      or one of ``.``, ``;``, ``,``
    - trim leading and trailing whitespace

    Returns the cleaned string.
    """
    collapsed = re.sub(r'\n+', '\n', text)
    filtered = re.sub(r'[^\w\s.;,]', '', collapsed)
    return filtered.strip()

def clean_nested_data(data):
    """
    Return a copy of a nested structure with every string value cleaned.

    Dicts and lists are rebuilt recursively (dict keys are kept as they are,
    only values are visited); strings are passed through ``clean_text``;
    any other value is returned unchanged.
    """
    if isinstance(data, dict):
        cleaned = {}
        for key, value in data.items():
            cleaned[key] = clean_nested_data(value)
        return cleaned

    if isinstance(data, list):
        return list(map(clean_nested_data, data))

    if isinstance(data, str):
        return clean_text(data)

    # Numbers, None, tuples, ... are left untouched
    return data

# Normalize all loaded text before it is wrapped into documents.
# knowledge_graphs is a flat {file name: text} mapping, so clean_text is applied
# per value; clinical_notes is nested (category -> subcategory -> list of notes),
# so it goes through the recursive helper.
knowledge_graphs = {name: clean_text(text) for name, text in knowledge_graphs.items()}
clinical_notes = clean_nested_data(clinical_notes)


## Graph to Document (Format the Data for Retrieval)

In [4]:
from langchain.docstore.document import Document

# Prepare documents for retrieval: knowledge graphs first, one Document per file
documents = [
    Document(page_content=content, metadata={"source": file, "type": "knowledge_graph"})
    for file, content in knowledge_graphs.items()
]

# Then the clinical notes: one Document per note, tagged with its category,
# subcategory and 1-based position inside the subcategory list
for disease_category, subcategories in clinical_notes.items():
    for subcategory, notes in subcategories.items():
        documents.extend(
            Document(
                page_content=note,
                metadata={
                    "source": f"{disease_category}/{subcategory}/note_{position}",
                    "type": "clinical_note",
                    "disease_category": disease_category,
                    "subcategory": subcategory,
                },
            )
            for position, note in enumerate(notes, start=1)
        )


## Storing the Data in a Vector Database

In [5]:
!pip install faiss-cpu langchain sentence-transformers

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting async-timeout<5.0.0,>=4.0.0 (from langchain)
  Downloading async_timeout-4.0.3-py3-none-any.whl.metadata (4.2 kB)
Downloading faiss_cpu-1.10.0-cp310-cp310-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading async_timeout-4.0.3-py3-none-any.whl (5.7 kB)
Installing collected packages: async-timeout, faiss-cpu
  Attempting uninstall: async-timeout
    Found existing installation: async-timeout 5.0.1
    Uninstalling async-timeout-5.0.1:
      Successfully uninstalled async-timeout-5.0.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.12.

In [None]:
!pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.20-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-core<1.0.0,>=0.3.45 (from langchain-community)
  Downloading langchain_core-0.3.50-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain<1.0.0,>=0.3.21 (from langchain-community)
  Downloading langchain-0.3.22-py3-none-any.whl.metadata (7.8 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.7 (from langchain<1.0.0,>=0.3.21->langchain-community)
  Downloading langchain_text_splitters-0.3.7-py3-none-any.whl.metadata (1.9 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading

In [None]:
!pip install --upgrade huggingface_hub
!pip install --upgrade sentence-transformers
!pip install --upgrade bitsandbytes
!pip install --upgrade transformers accelerate
!pip install --upgrade transformers bitsandbytes accelerate sentence-transformers


In [None]:
# NOTE(review): these four packages are also upgraded by the preceding cell, so
# this cell looks redundant — confirm before removing.
!pip install --upgrade transformers bitsandbytes accelerate sentence-transformers

In [None]:
# from langchain.embeddings import HuggingFaceEmbeddings

# # Load sentence transformer model
# embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [None]:
import torch
from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig
from langchain_community.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer

# Verify CUDA and GPU. get_device_name(0) raises when no CUDA device is
# present, so it is only queried when CUDA is actually available.
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")
if cuda_available:
    print(f"GPU Device: {torch.cuda.get_device_name(0)}")
else:
    print("GPU Device: none detected")

# Load the model manually
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

if cuda_available:
    # Configure 4-bit quantization (NF4, double quantization, fp16 compute)
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True
    )
    model = AutoModel.from_pretrained(model_name, quantization_config=quantization_config, device_map="auto")
else:
    # Without a GPU the 4-bit path is skipped and the plain weights are loaded.
    quantization_config = None
    model = AutoModel.from_pretrained(model_name)

# Wrap it into SentenceTransformer
embedding_model = SentenceTransformer(model_name)
embedding_model._first_module().auto_model = model  # Inject the manually loaded model

# Initialize embeddings.
# NOTE(review): this wrapper is constructed from the model name only; the
# `embedding_model` object prepared above is not passed to it — confirm whether
# the quantized model was meant to be used here.
embedding = HuggingFaceEmbeddings(model_name=model_name)

print("Model loaded successfully!")


### Convert Documents into Embeddings

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document

# Initialize the embedding model: all-MiniLM-L6-v2 sentence-transformer (384-dim model)
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Convert dictionaries to LangChain Documents: knowledge graphs first ...
documents = [
    Document(page_content=content, metadata={"source": file, "type": "knowledge_graph"})
    for file, content in knowledge_graphs.items()
]

# ... then every clinical note, tagged with its disease and subcategory
documents += [
    Document(page_content=note, metadata={"source": disease, "subcategory": subcategory, "type": "clinical_note"})
    for disease, subcategories in clinical_notes.items()
    for subcategory, notes in subcategories.items()
    for note in notes
]

# Embed all documents, build the FAISS index and persist it to ./faiss_index
faiss_index = FAISS.from_documents(documents, embedding_model)
faiss_index.save_local("faiss_index")


In [None]:
# Reload the index saved under "faiss_index" by the previous cell.
# allow_dangerous_deserialization=True is acceptable here only because the
# files were written by this notebook; do not load an index from an untrusted
# source with this flag.
vectorstore = FAISS.load_local(
    "faiss_index",
    embedding_model,
    allow_dangerous_deserialization=True,
)

## Implementing top-k similarity search to get the results

## Load the FAISS Vector Store

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: an access token must never be stored in the notebook. A token that
# was previously hard-coded in this cell has to be treated as leaked and revoked
# in the Hugging Face account settings.
# The token is read from the HF_TOKEN environment variable; if it is not set,
# the user is prompted for it without echoing the input.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_token)


In [None]:
!pip install --upgrade langchain


In [None]:
from langchain.chains import RetrievalQA
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import pipeline

In [None]:
# Instantiate the embedding model, load the saved FAISS index from disk and
# expose it as a similarity retriever returning the 5 closest documents.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)
vectorstore = FAISS.load_local(
    "faiss_index",
    embedding_model,
    allow_dangerous_deserialization=True,
)
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5},
)

In [None]:
# import os
# os.environ['TRANSFORMERS_CACHE'] = '/path/to/cache/directory'
# os.environ['BITSANDBYTES_CACHING'] = '1'

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import bitsandbytes as bnb

# Verify CUDA is available
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")
print(f"CUDA version: {torch.version.cuda}")
print(f"BitsAndBytes version: {bnb.__version__}")

model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# trust_remote_code is deliberately NOT passed below: it allows code shipped in
# the model repository to run locally, and the Mistral architecture is
# implemented in transformers itself, so it is not needed.
if cuda_available:
    # GPU configuration with 4-bit quantization
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        quantization_config=quantization_config,
    )
else:
    # No GPU: load the unquantized float32 weights (no quantization is
    # attempted in this branch).
    print("No GPU detected. Loading model with basic configuration...")
    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            torch_dtype=torch.float32,
        )
    except Exception as e:
        # Report the failure with a hint, then re-raise so the cell still fails loudly
        print(f"Error loading model: {str(e)}")
        print("Consider using a smaller model or enabling GPU runtime")
        raise

# Load pipeline. The model object was already placed on its device(s) by
# from_pretrained(device_map="auto"), so no device_map is passed again here.
qa_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

In [None]:
def answer_clinical_query(query):
    """
    Answer a clinical question with retrieval-augmented generation.

    The documents returned by the module-level ``retriever`` for ``query`` are
    joined into a context block, which is placed in front of the question in
    the prompt sent to the module-level ``qa_pipeline``.

    Args:
        query: The question text.

    Returns:
        The generated answer as a string, with surrounding whitespace removed.
        The prompt (retrieved context and question) is not echoed back.
    """
    docs = retriever.invoke(query)
    context = "\n".join(doc.page_content for doc in docs)
    prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"

    # max_new_tokens bounds only the generated continuation, independent of
    # prompt length. return_full_text=False makes the pipeline return just the
    # continuation; by default it would prepend the entire prompt, burying the
    # answer under the retrieved context.
    response = qa_pipeline(
        prompt,
        max_new_tokens=256,
        num_return_sequences=1,
        return_full_text=False,
    )

    return response[0]['generated_text'].strip()


In [None]:
# Example query: run one question through retrieval + generation and print the result
query = "what is CT scan"
response = answer_clinical_query(query)
print(response)

In [None]:
# from langchain.embeddings import HuggingFaceEmbeddings
# from langchain.vectorstores import FAISS
# from langchain.docstore.document import Document
# from transformers import pipeline

# # Initialize the embedding model
# embedding_model = HuggingFaceEmbeddings(model_name="bert-base-uncased")

# # Convert dictionaries to LangChain Documents
# documents = []

# # Knowledge Graphs
# for file, content in knowledge_graphs.items():
#     documents.append(Document(page_content=content, metadata={"source": file, "type": "knowledge_graph"}))

# # Clinical Notes
# for disease, subcategories in clinical_notes.items():
#     for subcategory, notes in subcategories.items():
#         for note in notes:
#             documents.append(Document(page_content=note, metadata={"source": disease, "subcategory": subcategory, "type": "clinical_note"}))

# # Initialize the FAISS Vector Store (make sure the embedding model is the same as the one used for retrieval)
# faiss_index = FAISS.from_documents(documents, embedding_model)

# # Retrieve the retriever
# retriever = faiss_index.as_retriever(search_type="similarity", search_kwargs={"k": 5})

# # Initialize the QA pipeline
# qa_pipeline = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.2", device=0)

# # Function to answer clinical query
# def answer_clinical_query(query):
#     # Retrieve relevant documents
#     docs = retriever.get_relevant_documents(query)
    
#     # Combine retrieved knowledge
#     context = "\n".join([doc.page_content for doc in docs])
    
#     # Generate an answer using BERT-based model
#     prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
#     response = qa_pipeline(prompt, max_length=256, num_return_sequences=1)
    
#     return response[0]['generated_text']

# # Test query
# query = "What are the symptoms of heart failure?"
# response = answer_clinical_query(query)
# print(response)
