# Kaggle Disease Dataset Integration

This notebook demonstrates how to load, process, and integrate the Kaggle disease-symptom dataset into the Medical RAG system. It expects the dataset's CSV files to have already been downloaded to `../data/raw/Disease Dataset/`.

## Overview
- Load Kaggle Disease Dataset
- Process symptoms and diseases
- Create Q&A pairs for training
- Generate knowledge base
- Test dataset integration


In [None]:
# Import required libraries
import sys
import os
sys.path.append('../')

import pandas as pd
import numpy as np
import json
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from collections import Counter

# Set up plotting
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*';
# older releases only know the legacy 'seaborn' name. Pick whichever one
# this installation provides so the cell runs on both.
plot_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(plot_style)
sns.set_palette("husl")


## 1. Load and Process Dataset


In [None]:
# Load the Kaggle dataset
dataset_path = Path("../data/raw/Disease Dataset/dataset.csv")
descriptions_path = Path("../data/raw/Disease Dataset/symptom_Description.csv")
precautions_path = Path("../data/raw/Disease Dataset/symptom_precaution.csv")

print("Loading Kaggle Disease Dataset...")

# Load main dataset
df = pd.read_csv(dataset_path)
print(f"Main dataset loaded: {len(df)} records")

# Load descriptions
descriptions_df = pd.read_csv(descriptions_path)
print(f"Descriptions loaded: {len(descriptions_df)} diseases")

# Load precautions
precautions_df = pd.read_csv(precautions_path)
print(f"Precautions loaded: {len(precautions_df)} diseases")


def _normalize_disease_key(frame):
    """Return a copy of ``frame`` with surrounding whitespace removed from 'Disease'.

    Non-string values (e.g. NaN) are passed through unchanged.
    """
    return frame.assign(
        Disease=frame['Disease'].map(
            lambda name: name.strip() if isinstance(name, str) else name
        )
    )


# The three files are joined on the 'Disease' text, so a stray leading or
# trailing space in any of them would silently leave a disease without its
# description/precautions. Normalise the key on copies; the raw frames
# loaded above stay untouched.
disease_records = _normalize_disease_key(df)

# Keep one row per disease in each lookup table: duplicate keys on the right
# side of a left merge would multiply the records of the main dataset.
descriptions_lookup = _normalize_disease_key(descriptions_df).drop_duplicates(subset='Disease')
precautions_lookup = _normalize_disease_key(precautions_df).drop_duplicates(subset='Disease')

# Surface diseases that have no counterpart in a lookup table instead of
# letting the left merge fill them with NaN unnoticed.
for lookup_label, lookup in (
    ("description", descriptions_lookup),
    ("precaution", precautions_lookup),
):
    unmatched = sorted(
        set(disease_records['Disease'].dropna()) - set(lookup['Disease'].dropna()),
        key=str,
    )
    if unmatched:
        print(f"Warning: no {lookup_label} entry for {len(unmatched)} disease(s): {unmatched}")

# Merge datasets
df_merged = (
    disease_records
    .merge(descriptions_lookup, on='Disease', how='left')
    .merge(precautions_lookup, on='Disease', how='left')
)

# A left merge against de-duplicated lookups must keep the record count.
assert len(df_merged) == len(df), "merge changed the number of records"

print(f"Final merged dataset: {len(df_merged)} records with {len(df_merged.columns)} columns")
df_merged.head()


## 2. Create Q&A Pairs


In [None]:
# Create Q&A pairs for training.
#
# Every row of df_merged that has a disease name and at least one symptom
# produces four differently phrased questions. All four share one answer:
# the disease name plus, when available, its description and precautions.

# Only look at symptom/precaution columns that are actually present, so a
# dataset with fewer columns does not raise a KeyError.
symptom_columns = [
    col for col in (f'Symptom_{i}' for i in range(1, 18))
    if col in df_merged.columns
]
precaution_columns = [
    col for col in (f'Precaution_{i}' for i in range(1, 5))
    if col in df_merged.columns
]


def _clean_text(value):
    """Return ``value`` as a stripped string, or ``None`` if it is missing or blank."""
    if value is None or (not isinstance(value, str) and pd.isna(value)):
        return None
    text = str(value).strip()
    return text or None


qa_pairs = []

for _, row in df_merged.iterrows():
    # 'Disease' is required: a missing column should fail loudly here.
    disease = _clean_text(row['Disease'])
    if disease is None:
        # A row without a disease name has no answer to offer.
        continue

    # Extract symptoms; store the stripped text so stray whitespace from the
    # CSV does not leak into the generated questions.
    symptoms = [
        text
        for text in (_clean_text(row[col]) for col in symptom_columns)
        if text is not None
    ]
    if not symptoms:
        continue

    # Create different types of questions
    second_symptom = symptoms[1] if len(symptoms) > 1 else 'other symptoms'
    questions = [
        f"What disease causes {', '.join(symptoms[:3])}?",
        f"I have {', '.join(symptoms[:2])}, what could be wrong?",
        f"What condition is associated with {symptoms[0]} and {second_symptom}?",
        f"Help me identify the disease with symptoms: {', '.join(symptoms[:4])}"
    ]

    # Create comprehensive answer
    answer_parts = [f"The disease is {disease}."]

    # row.get(...) yields None when the column is absent, which _clean_text
    # treats as missing (a '' default would look like a present value).
    description = _clean_text(row.get('Description'))
    if description is not None:
        answer_parts.append(f"Description: {description}")

    # Collect every available precaution rather than requiring the first
    # one to be present.
    precautions = [
        text
        for text in (_clean_text(row[col]) for col in precaution_columns)
        if text is not None
    ]
    if precautions:
        answer_parts.append(f"Precautions: {', '.join(precautions)}")

    answer = " ".join(answer_parts)
    context = f"Disease: {disease}, Symptoms: {', '.join(symptoms)}"

    # Add Q&A pairs
    for question in questions:
        qa_pairs.append({
            'question': question,
            'answer': answer,
            'disease': disease,
            # Copy so the four records do not share one mutable list.
            'symptoms': list(symptoms),
            'context': context
        })

print(f"Created {len(qa_pairs)} Q&A pairs")
if qa_pairs:
    print("Sample Q&A pair:")
    print(f"Q: {qa_pairs[0]['question']}")
    print(f"A: {qa_pairs[0]['answer'][:100]}...")
else:
    print("No Q&A pairs were created; check that the dataset has symptom columns and disease names.")


## 3. Train-Test Split and Save Data


In [None]:
# Split data into train and test sets, stratified by disease so both sets
# keep the same disease proportions.
# NOTE(review): pairs are split individually, so differently phrased
# questions with the same answer can land on both sides of the split --
# confirm this is acceptable for evaluation.
disease_labels = [pair['disease'] for pair in qa_pairs]
train_data, test_data = train_test_split(
    qa_pairs,
    test_size=0.2,
    random_state=42,
    stratify=disease_labels,
)

print(f"Training data: {len(train_data)} samples")
print(f"Test data: {len(test_data)} samples")

# Create output directory
output_dir = Path("../data/processed")
output_dir.mkdir(parents=True, exist_ok=True)

# Save processed data: (file name, records, label used in the summary)
exports = (
    ("qa_pairs.json", qa_pairs, "total"),
    ("train_qa_pairs.json", train_data, "training"),
    ("test_qa_pairs.json", test_data, "test"),
)

for file_name, records, _ in exports:
    with open(output_dir / file_name, "w") as f:
        json.dump(records, f, indent=2)

print("Processed data saved to data/processed/")
for file_name, records, label in exports:
    print(f"- {file_name}: {len(records)} {label} pairs")
