# NeuroSmriti - Alzheimer's Dataset Generation

This notebook generates a large-scale synthetic Alzheimer's disease dataset (420,000 records) for training AI models.

## Data Sources Referenced:
- ADNI (Alzheimer's Disease Neuroimaging Initiative)
- OASIS (Open Access Series of Imaging Studies)
- NACC (National Alzheimer's Coordinating Center)
- UK Biobank
- WHO Dementia Guidelines

In [None]:
# Install required packages
!pip install pandas numpy matplotlib seaborn scikit-learn tqdm

In [None]:
import json
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from typing import Dict, List, Any, Tuple
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's and the standard library's generators with one shared value
# so that repeated runs of the notebook start from the same random state.
SEED = 42
for seed_fn in (np.random.seed, random.seed):
    seed_fn(SEED)

print("Libraries loaded successfully!")

## Clinical Reference Data

These ranges and distributions are based on clinical research from the referenced studies.

In [None]:
# Clinical Dementia Rating (CDR) global score -> human-readable stage label
CDR_STAGES = {
    0: "Normal",
    0.5: "Very Mild Dementia (MCI)",
    1: "Mild Dementia",
    2: "Moderate Dementia",
    3: "Severe Dementia",
}

# (low, high) cognitive test score range for each disease stage
SCORE_RANGES = {
    "mmse": {
        "normal": (27, 30),
        "mci": (24, 27),
        "mild": (20, 24),
        "moderate": (10, 20),
        "severe": (0, 10),
    },
    "moca": {
        "normal": (26, 30),
        "mci": (22, 26),
        "mild": (17, 22),
        "moderate": (10, 17),
        "severe": (0, 10),
    },
}

# APOE genotypes: sampling frequency ("freq") and risk factor ("risk"),
# built from (genotype, freq, risk) rows
APOE_DATA = {
    genotype: {"freq": freq, "risk": risk}
    for genotype, freq, risk in (
        ("e2/e2", 0.01, 0.6),
        ("e2/e3", 0.11, 0.6),
        ("e2/e4", 0.02, 2.6),
        ("e3/e3", 0.60, 1.0),
        ("e3/e4", 0.21, 3.2),
        ("e4/e4", 0.05, 14.9),
    )
}

# Brain region volumes (mm³) as (low, high) per diagnostic group
BRAIN_VOLUMES = {
    "hippocampus": {
        "normal": (3200, 3800),
        "mci": (2800, 3200),
        "ad": (2200, 2800),
    },
    "entorhinal": {
        "normal": (1800, 2200),
        "mci": (1500, 1800),
        "ad": (1000, 1500),
    },
    "total_brain": {
        "normal": (1_100_000, 1_300_000),
        "mci": (1_000_000, 1_100_000),
        "ad": (850_000, 1_000_000),
    },
}

# CSF biomarker concentrations (pg/mL) as (low, high) per diagnostic group
CSF_MARKERS = {
    "abeta42": {
        "normal": (500, 1200),
        "mci": (350, 500),
        "ad": (150, 350),
    },
    "total_tau": {
        "normal": (100, 300),
        "mci": (300, 500),
        "ad": (500, 1200),
    },
    "ptau181": {
        "normal": (15, 40),
        "mci": (40, 70),
        "ad": (70, 150),
    },
}

print("Clinical reference data loaded!")

In [None]:
# First-name, last-name and ethnicity pools used to fill in patient demographics
MALE_NAMES = [
    "James", "John", "Robert", "Michael", "William",
    "David", "Richard", "Joseph", "Thomas", "Charles",
    "Christopher", "Daniel", "Matthew", "Anthony", "Mark",
    "Donald", "Steven", "Paul", "Andrew", "Joshua",
]

FEMALE_NAMES = [
    "Mary", "Patricia", "Jennifer", "Linda", "Barbara",
    "Elizabeth", "Susan", "Jessica", "Sarah", "Karen",
    "Lisa", "Nancy", "Betty", "Margaret", "Sandra",
    "Ashley", "Kimberly", "Emily", "Donna", "Michelle",
]

LAST_NAMES = [
    "Smith", "Johnson", "Williams", "Brown", "Jones",
    "Garcia", "Miller", "Davis", "Rodriguez", "Martinez",
    "Hernandez", "Lopez", "Wilson", "Anderson", "Thomas",
    "Taylor", "Moore", "Jackson", "Martin", "Lee",
]

# Ethnicity label -> sampling weight
ETHNICITIES = {
    "White/Caucasian": 0.58,
    "Black/African American": 0.13,
    "Hispanic/Latino": 0.18,
    "Asian": 0.06,
    "Other": 0.05,
}

n_male, n_female, n_last = len(MALE_NAMES), len(FEMALE_NAMES), len(LAST_NAMES)
print(f"Loaded {n_male} male names, {n_female} female names, {n_last} last names")

## Patient Data Generator Class

In [None]:
class AlzheimersDataGenerator:
    """Generate synthetic Alzheimer's patient records.

    All random draws go through the module-level ``random`` generator, which
    ``__init__`` seeds. A freshly constructed generator therefore produces the
    same sequence of patients for the same seed, provided nothing else
    consumes the global ``random`` state in between.

    The per-stage lookup tables are class attributes so that they are built
    once instead of on every ``generate_patient`` call.
    """

    # Disease stages in order of severity, and the weight with which each is
    # sampled when the caller does not request a specific stage.
    _STAGES = ("normal", "mci", "mild", "moderate", "severe")
    _STAGE_WEIGHTS = (0.30, 0.25, 0.20, 0.15, 0.10)

    # The imaging / CSF / PET tables only distinguish normal, mci and "ad";
    # the three dementia stages all read the "ad" entry.
    _BIOMARKER_KEY = {
        "normal": "normal",
        "mci": "mci",
        "mild": "ad",
        "moderate": "ad",
        "severe": "ad",
    }

    # Per-stage demographic and history parameters.
    _AGE_RANGES = {"normal": (55, 78), "mci": (60, 82), "mild": (65, 88), "moderate": (70, 92), "severe": (75, 98)}
    _FEMALE_PROB = {"normal": 0.50, "mci": 0.53, "mild": 0.56, "moderate": 0.58, "severe": 0.60}
    _CDR_BY_STAGE = {"normal": 0, "mci": 0.5, "mild": 1, "moderate": 2, "severe": 3}
    _EDUCATION_MEAN = {"normal": 14, "mci": 13, "mild": 12, "moderate": 11, "severe": 10}
    _EDUCATION_SD = 3
    _FAMILY_HISTORY_PROB = {"normal": 0.15, "mci": 0.25, "mild": 0.35, "moderate": 0.40, "severe": 0.45}

    # PET SUVR ranges, keyed like the biomarker tables (normal / mci / ad).
    _AMYLOID_SUVR = {"normal": (0.8, 1.1), "mci": (1.1, 1.4), "ad": (1.4, 2.2)}
    _TAU_SUVR = {"normal": (0.9, 1.1), "mci": (1.1, 1.5), "ad": (1.5, 2.5)}

    # Cut-offs applied to the generated CSF values to derive the boolean flags.
    _ABETA42_POSITIVE_BELOW = 500
    _PTAU181_POSITIVE_ABOVE = 40

    def __init__(self, seed=42):
        """Seed the global ``random`` and ``numpy.random`` generators with *seed*."""
        random.seed(seed)
        np.random.seed(seed)

    def _rand_range(self, r: Tuple[float, float]) -> float:
        """Return a uniformly distributed float between ``r[0]`` and ``r[1]``."""
        low, high = r[0], r[1]
        return random.uniform(low, high)

    def _weighted_choice(self, options: Dict, key: str = "freq") -> str:
        """Pick one key of *options* at random, weighted.

        Args:
            options: Either ``{item: weight}`` or ``{item: {key: weight, ...}}``.
                Which of the two shapes is in use is decided by looking at the
                first value only.
            key: Name of the weight field when the values are dicts.

        Returns:
            The selected key of *options*.
        """
        items = list(options.keys())
        first_value = list(options.values())[0]
        if isinstance(first_value, dict):
            weights = [options[item][key] for item in items]
        else:
            weights = list(options.values())
        return random.choices(items, weights=weights)[0]

    def generate_patient(self, patient_id: int, stage: str = None) -> Dict:
        """Generate a complete patient record.

        Args:
            patient_id: Integer embedded in the record's ``patient_id`` string
                as ``NS-`` followed by the number zero-padded to 7 digits.
            stage: One of ``"normal"``, ``"mci"``, ``"mild"``, ``"moderate"``,
                ``"severe"``. When ``None`` a stage is sampled using the
                class-level stage weights.

        Returns:
            A dict of demographic, genetic, cognitive, imaging, biomarker and
            medical-history fields for one synthetic patient.

        Raises:
            KeyError: If *stage* is given but is not one of the known stages.
        """
        # Determine stage; reject unknown stages up front with a clear message
        # rather than failing on an incidental table lookup further down.
        if stage is None:
            stage = random.choices(self._STAGES, weights=self._STAGE_WEIGHTS)[0]
        elif stage not in self._BIOMARKER_KEY:
            raise KeyError(
                f"unknown stage {stage!r}; expected one of {list(self._STAGES)}"
            )

        stage_key = self._BIOMARKER_KEY[stage]

        # NOTE: the order of the random draws below (including those made
        # inline in the returned dict) determines the output for a given seed;
        # keep it stable when editing.

        # Demographics. int() truncates, so the upper end of each age range is
        # effectively exclusive.
        age = int(self._rand_range(self._AGE_RANGES[stage]))
        is_female = random.random() < self._FEMALE_PROB[stage]
        first_name = random.choice(FEMALE_NAMES if is_female else MALE_NAMES)
        last_name = random.choice(LAST_NAMES)

        # Genetics (drawn from the genotype frequencies, independent of stage)
        apoe = self._weighted_choice(APOE_DATA)

        # Cognitive scores
        mmse = round(self._rand_range(SCORE_RANGES["mmse"][stage]), 1)
        moca = round(self._rand_range(SCORE_RANGES["moca"][stage]), 1)
        cdr = self._CDR_BY_STAGE[stage]

        # Brain volumes
        hippocampus = round(self._rand_range(BRAIN_VOLUMES["hippocampus"][stage_key]))
        entorhinal = round(self._rand_range(BRAIN_VOLUMES["entorhinal"][stage_key]))
        total_brain = round(self._rand_range(BRAIN_VOLUMES["total_brain"][stage_key]))

        # CSF biomarkers
        abeta42 = round(self._rand_range(CSF_MARKERS["abeta42"][stage_key]), 1)
        total_tau = round(self._rand_range(CSF_MARKERS["total_tau"][stage_key]), 1)
        ptau181 = round(self._rand_range(CSF_MARKERS["ptau181"][stage_key]), 1)

        # PET imaging
        amyloid_suvr = round(self._rand_range(self._AMYLOID_SUVR[stage_key]), 2)
        tau_suvr = round(self._rand_range(self._TAU_SUVR[stage_key]), 2)

        # Medical history: years of education, Gaussian around a per-stage
        # mean, truncated to an int and clamped at zero.
        education = max(0, int(random.gauss(self._EDUCATION_MEAN[stage], self._EDUCATION_SD)))

        return {
            "patient_id": f"NS-{patient_id:07d}",
            "first_name": first_name,
            "last_name": last_name,
            "age": age,
            "gender": "Female" if is_female else "Male",
            "ethnicity": self._weighted_choice(ETHNICITIES),
            "education_years": education,
            "apoe_genotype": apoe,
            "has_apoe4": "e4" in apoe,
            "family_history_ad": random.random() < self._FAMILY_HISTORY_PROB[stage],
            "mmse_total": mmse,
            "moca_total": moca,
            "cdr_global": cdr,
            "diagnosis_stage": stage,
            "hippocampus_volume": hippocampus,
            "entorhinal_volume": entorhinal,
            "total_brain_volume": total_brain,
            "csf_abeta42": abeta42,
            "csf_total_tau": total_tau,
            "csf_ptau181": ptau181,
            "amyloid_pet_suvr": amyloid_suvr,
            "tau_pet_suvr": tau_suvr,
            "amyloid_positive": abeta42 < self._ABETA42_POSITIVE_BELOW,
            "tau_positive": ptau181 > self._PTAU181_POSITIVE_ABOVE,
            # Comorbidity probabilities rise linearly with age above 50.
            "hypertension": random.random() < 0.3 + 0.3 * (age - 50) / 50,
            "diabetes": random.random() < 0.15 + 0.15 * (age - 50) / 50,
            "depression": random.random() < 0.2,
            "smoking": random.choice(["Never", "Former", "Current"]),
            "physical_activity": random.choice(["Sedentary", "Light", "Moderate", "Active"])
        }

# Single shared generator instance, created with the default seed spelled out
generator = AlzheimersDataGenerator(seed=42)
print("Generator initialized!")

## Generate 420,000 Patient Records

In [None]:
# Configuration
TOTAL_PATIENTS = 420000  # 420K patients

print(f"Generating {TOTAL_PATIENTS:,} patient records...")
print("This may take a few minutes...\n")

start_time = datetime.now()

# One record per index; the index doubles as the numeric patient id, so ids
# run from 0 to TOTAL_PATIENTS - 1.
patients = [generator.generate_patient(i) for i in tqdm(range(TOTAL_PATIENTS))]

elapsed = (datetime.now() - start_time).total_seconds()
print(f"\nGenerated {len(patients):,} patients in {elapsed:.1f} seconds")

# A very small run (or a coarse system clock) can measure 0 seconds; skip the
# division in that case instead of raising ZeroDivisionError.
if elapsed > 0:
    print(f"Rate: {len(patients)/elapsed:.0f} patients/second")
else:
    print("Rate: n/a (elapsed time too small to measure)")

In [None]:
# Build the DataFrame from the list of patient dicts and show its layout
df = pd.DataFrame(patients)
column_names = df.columns.tolist()
print(f"DataFrame shape: {df.shape}")
print(f"\nColumns: {column_names}")
df.head()

## Dataset Statistics and Visualization

In [None]:
# Number of records, used as the denominator for every percentage below
n_records = len(df)

# Records per disease stage
stage_counts = df['diagnosis_stage'].value_counts()
print("Stage Distribution:")
for stage_name, n_stage in stage_counts.items():
    stage_pct = n_stage / n_records * 100
    print(f"  {stage_name.capitalize()}: {n_stage:,} ({stage_pct:.1f}%)")

# Records per gender
gender_counts = df['gender'].value_counts()
print("\nGender Distribution:")
for gender_name, n_gender in gender_counts.items():
    gender_pct = n_gender / n_records * 100
    print(f"  {gender_name}: {n_gender:,} ({gender_pct:.1f}%)")

In [None]:
# Visualizations
import os

# savefig does not create missing directories, and this cell runs before the
# "Save Dataset" cell that creates ../data, so make sure it exists here.
os.makedirs('../data', exist_ok=True)

fig, axes = plt.subplots(2, 3, figsize=(15, 10))

stage_order = ['normal', 'mci', 'mild', 'moderate', 'severe']
colors = ['#2ecc71', '#f1c40f', '#e67e22', '#e74c3c', '#9b59b6']

# The box plots group by stage. With plain strings the groups come out in
# alphabetical order (mci, mild, moderate, normal, severe); an ordered
# categorical makes them follow stage_order like the bar chart does.
# Only the needed columns are copied so that df itself is left untouched.
boxplot_df = df[['diagnosis_stage', 'mmse_total', 'hippocampus_volume']].copy()
boxplot_df['diagnosis_stage'] = pd.Categorical(
    boxplot_df['diagnosis_stage'], categories=stage_order, ordered=True
)
box_positions = range(len(stage_order))

# Stage distribution
ax1 = axes[0, 0]
df['diagnosis_stage'].value_counts().reindex(stage_order).plot(kind='bar', ax=ax1, color=colors)
ax1.set_title('Disease Stage Distribution')
ax1.set_xlabel('Stage')
ax1.set_ylabel('Count')
ax1.tick_params(axis='x', rotation=45)

# Age distribution by stage
ax2 = axes[0, 1]
for stage in stage_order:
    subset = df[df['diagnosis_stage'] == stage]['age']
    ax2.hist(subset, alpha=0.5, label=stage, bins=20)
ax2.set_title('Age Distribution by Stage')
ax2.set_xlabel('Age')
ax2.set_ylabel('Count')
ax2.legend()

# MMSE by stage
ax3 = axes[0, 2]
boxplot_df.boxplot(column='mmse_total', by='diagnosis_stage', ax=ax3, positions=box_positions)
ax3.set_title('MMSE Scores by Stage')
ax3.set_xlabel('Stage')
ax3.set_ylabel('MMSE Score')
# A grouped boxplot sets a figure-level title; clear it.
plt.suptitle('')

# Gender distribution
ax4 = axes[1, 0]
df['gender'].value_counts().plot(kind='pie', ax=ax4, autopct='%1.1f%%', colors=['#3498db', '#e91e63'])
ax4.set_title('Gender Distribution')
ax4.set_ylabel('')

# APOE genotype distribution
ax5 = axes[1, 1]
df['apoe_genotype'].value_counts().plot(kind='bar', ax=ax5, color='#9b59b6')
ax5.set_title('APOE Genotype Distribution')
ax5.set_xlabel('Genotype')
ax5.set_ylabel('Count')
ax5.tick_params(axis='x', rotation=45)

# Hippocampus volume by stage
ax6 = axes[1, 2]
boxplot_df.boxplot(column='hippocampus_volume', by='diagnosis_stage', ax=ax6, positions=box_positions)
ax6.set_title('Hippocampus Volume by Stage')
ax6.set_xlabel('Stage')
ax6.set_ylabel('Volume (mm³)')
# The second grouped boxplot sets the figure-level title again; clear it again.
plt.suptitle('')

plt.tight_layout()
plt.savefig('../data/dataset_statistics.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Correlation heatmap for numeric features
import os

# savefig does not create missing directories; make sure the output folder
# exists so this cell also works before the "Save Dataset" cell has run.
os.makedirs('../data', exist_ok=True)

numeric_cols = ['age', 'education_years', 'mmse_total', 'moca_total', 'cdr_global',
                'hippocampus_volume', 'csf_abeta42', 'csf_ptau181', 'amyloid_pet_suvr', 'tau_pet_suvr']

plt.figure(figsize=(12, 10))
# Pairwise correlation (DataFrame.corr defaults) between the selected columns
correlation = df[numeric_cols].corr()
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0, fmt='.2f')
plt.title('Feature Correlation Heatmap')
plt.tight_layout()
plt.savefig('../data/correlation_heatmap.png', dpi=150, bbox_inches='tight')
plt.show()

## Save Dataset

In [None]:
import os

# Output directory is created on demand
os.makedirs('../data', exist_ok=True)

BYTES_PER_MB = 1024 * 1024

# --- CSV export (one row per patient, no index column) ---
csv_path = '../data/alzheimers_420k_dataset.csv'
df.to_csv(csv_path, index=False)
csv_size = os.path.getsize(csv_path) / BYTES_PER_MB
print(f"Saved CSV: {csv_path} ({csv_size:.1f} MB)")

# --- JSON export: descriptive metadata plus the raw patient dicts ---
feature_names = list(df.columns)
stage_distribution = df['diagnosis_stage'].value_counts().to_dict()
gender_distribution = df['gender'].value_counts().to_dict()

metadata = {
    "dataset_name": "NeuroSmriti Alzheimer's Training Dataset",
    "version": "2.0.0",
    "generated_at": datetime.now().isoformat(),
    "total_records": len(df),
    "features": feature_names,
    "stage_distribution": stage_distribution,
    "gender_distribution": gender_distribution,
}

json_output = {"metadata": metadata, "data": patients}

json_path = '../data/alzheimers_420k_dataset.json'
with open(json_path, 'w') as out_file:
    json.dump(json_output, out_file)

json_size = os.path.getsize(json_path) / BYTES_PER_MB
print(f"Saved JSON: {json_path} ({json_size:.1f} MB)")

n_rows, n_features = len(df), len(df.columns)
print(f"\nTotal records: {n_rows:,}")
print(f"Total features: {n_features}")

In [None]:
# Print a banner, then show descriptive statistics for the dataset
banner = "=" * 60
print("\n" + banner)
print("DATASET SUMMARY")
print(banner)
df.describe()