# Medical Data Ingestion Notebook

This notebook demonstrates how to load and preprocess medical data from various sources for the RAG system.

## Overview
- Load Kaggle Disease Dataset
- Process symptoms and diseases
- Create vector embeddings
- Store in vector database
- Test data retrieval


In [None]:
# Import required libraries
import sys
import os
sys.path.append('../')

import pandas as pd
import numpy as np
import json
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings

# Set up plotting: apply matplotlib's 'seaborn-v0_8' style sheet, then make
# "husl" the active seaborn colour palette for the figures in this notebook.
# NOTE(review): the palette is set after the style sheet; presumably the order
# matters if the style sheet also defines a colour cycle - confirm before reordering.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")



## 1. Load Kaggle Disease Dataset


In [None]:
# Locations of the three CSV files that make up the Kaggle Disease Dataset
dataset_path = Path("../data/raw/Disease Dataset/dataset.csv")
descriptions_path = Path("../data/raw/Disease Dataset/symptom_Description.csv")
precautions_path = Path("../data/raw/Disease Dataset/symptom_precaution.csv")


def _read_disease_csv(csv_path):
    """Read one dataset CSV and normalise its 'Disease' join key.

    Leading/trailing whitespace is stripped from the 'Disease' column so that
    names differing only in padding still match when the tables are merged.
    Missing values in the column are left as missing.
    """
    frame = pd.read_csv(csv_path)
    frame['Disease'] = frame['Disease'].str.strip()
    return frame


print("Loading Kaggle Disease Dataset...")

# Load main dataset
df = _read_disease_csv(dataset_path)
print(f"Main dataset loaded: {len(df)} records")

# Load descriptions
descriptions_df = _read_disease_csv(descriptions_path)
print(f"Descriptions loaded: {len(descriptions_df)} diseases")

# Load precautions
precautions_df = _read_disease_csv(precautions_path)
print(f"Precautions loaded: {len(precautions_df)} diseases")

# Report diseases that have no counterpart in a lookup table; with a left merge
# these rows would otherwise end up with silent NaNs in the merged columns.
known_diseases = set(df['Disease'].dropna())
for label, lookup in (("description", descriptions_df), ("precaution", precautions_df)):
    unmatched = sorted(known_diseases - set(lookup['Disease'].dropna()))
    if unmatched:
        print(f"Warning: no {label} entry for: {unmatched}")

# Merge datasets. The lookup tables are de-duplicated on the join key first so
# the left merges cannot multiply rows of the main dataset.
df_merged = df.merge(
    descriptions_df.drop_duplicates(subset='Disease'), on='Disease', how='left'
)
df_merged = df_merged.merge(
    precautions_df.drop_duplicates(subset='Disease'), on='Disease', how='left'
)

print(f"Final merged dataset: {len(df_merged)} records with {len(df_merged.columns)} columns")
df_merged.head()


## 2. Data Analysis and Visualization


In [None]:
# Summarise the merged dataset: size, disease/symptom vocabulary, class balance
print("=== Dataset Analysis ===")
print(f"Total records: {len(df_merged)}")
print(f"Unique diseases: {df_merged['Disease'].nunique()}")

# Symptom columns are named Symptom_1 .. Symptom_17; only use the ones that
# are actually present so a narrower extract does not raise KeyError.
symptom_columns = [
    f'Symptom_{i}' for i in range(1, 18) if f'Symptom_{i}' in df_merged.columns
]

# Get all symptoms. Values are whitespace-stripped before de-duplication so
# that the same symptom written with different padding is counted only once.
all_symptoms = set()
for col in symptom_columns:
    cleaned = df_merged[col].dropna().astype(str).str.strip()
    all_symptoms.update(cleaned[cleaned != ''].unique())

print(f"Unique symptoms: {len(all_symptoms)}")

# Disease distribution (number of records per disease, most frequent first)
disease_counts = df_merged['Disease'].value_counts()
print("\nTop 10 diseases by frequency:")
print(disease_counts.head(10))


## 4. Store in Vector Database


In [None]:
# Persist the disease documents and their embeddings in a ChromaDB collection.
#
# NOTE: this cell consumes `disease_texts` and `disease_embeddings`, which are
# built in the "Create Vector Embeddings" cell. That cell currently appears
# further down in the notebook, so it must be run before this one.

# Initialize ChromaDB
client = chromadb.PersistentClient(path="../data/chroma_db")
collection = client.get_or_create_collection(
    name="medical_knowledge",
    metadata={"description": "Medical disease and symptom knowledge base"}
)

print("ChromaDB collection created/loaded")

# Build ids / metadata / embeddings aligned with disease_texts
n_docs = len(disease_texts)
ids = [f"disease_{i}" for i in range(n_docs)]
metadatas = [{"disease": name} for name in df_merged['Disease'].iloc[:n_docs]]
# Plain Python float lists are accepted regardless of how the vectors were produced
embedding_lists = [np.asarray(vector).tolist() for vector in disease_embeddings]

# Upsert (rather than add) so re-running the notebook against the persistent
# store refreshes existing ids instead of failing or leaving stale documents.
# Writes are chunked to stay well below the client's per-call batch limit.
BATCH_SIZE = 1000
for start in range(0, n_docs, BATCH_SIZE):
    stop = start + BATCH_SIZE
    collection.upsert(
        documents=disease_texts[start:stop],
        embeddings=embedding_lists[start:stop],
        ids=ids[start:stop],
        metadatas=metadatas[start:stop],
    )

print(f"Added {len(disease_texts)} documents to vector database")


## 5. Test Data Retrieval


In [None]:
# Test query: Find diseases with fever and cough
#
# NOTE: relies on `model` (sentence transformer) and `collection` (ChromaDB)
# created in the embedding and vector-database cells; run those first.
query = "I have fever and cough, what disease could this be?"
query_embedding = model.encode(query)

results = collection.query(
    query_embeddings=[query_embedding.tolist()],
    n_results=5
)

print("Query:", query)
print("\nTop 5 matching diseases:")
matches = zip(results['documents'][0], results['metadatas'][0], results['distances'][0])
for rank, (doc, metadata, distance) in enumerate(matches, start=1):
    # The collection is created without an explicit distance metric, so the
    # returned value is Chroma's default distance, and `1 - distance` is not a
    # valid similarity score. Report the raw distance instead.
    print(f"{rank}. {metadata['disease']} (distance: {distance:.3f}, lower is closer)")
    print(f"   {doc[:100]}...")
    print()


In [None]:
# Bar chart of the 15 most frequent diseases.
# Uses `disease_counts` computed in the dataset-analysis cell.
fig, ax = plt.subplots(figsize=(12, 8))
top_diseases = disease_counts.head(15)
top_diseases.plot.bar(ax=ax)
ax.set_title('Top 15 Diseases by Frequency')
ax.set_xlabel('Disease')
ax.set_ylabel('Number of Records')
# Slant the disease names so long labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()


## 3. Create Vector Embeddings


In [None]:
# Build one text document per dataset row and embed all documents.
# Produces `model`, `disease_texts` and `disease_embeddings`, which the
# vector-database and retrieval cells depend on.

# Initialize sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')
print("Sentence transformer model loaded")

# Symptom columns are named Symptom_1 .. Symptom_17; skip any that are absent
symptom_columns = [
    f'Symptom_{i}' for i in range(1, 18) if f'Symptom_{i}' in df_merged.columns
]

# Create the text representation of every row
disease_texts = []
for _, row in df_merged.iterrows():
    symptoms = []
    for col in symptom_columns:
        value = row[col]
        if pd.notna(value):
            # Store the stripped symptom so padded values do not leak stray
            # whitespace into the document text.
            symptom = str(value).strip()
            if symptom:
                symptoms.append(symptom)

    disease_text = f"Disease: {row['Disease']}. Symptoms: {', '.join(symptoms)}"
    # .get() without a default returns None when the column is absent, which
    # pd.notna treats as missing, so the description is simply skipped.
    description = row.get('Description')
    if pd.notna(description):
        disease_text += f" Description: {description}"

    disease_texts.append(disease_text)

# Encode all documents in one batched call instead of one call per row;
# list() keeps the result as a list of 1-D vectors, one per document.
if disease_texts:
    disease_embeddings = list(model.encode(disease_texts, show_progress_bar=True))
else:
    disease_embeddings = []

print(f"Created {len(disease_embeddings)} embeddings")
if disease_embeddings:
    print(f"Embedding dimension: {len(disease_embeddings[0])}")
