In [None]:
import csv
import numpy as np
# import pandas as pd # Not strictly needed for this CSV version
from datetime import datetime, timedelta, date
import random
import os
from tqdm import tqdm # Progress bar library (install: pip install tqdm)
import time # To time the execution

# --- Configuration ---
NUM_USERS = 30       # Keep small for sample generation
START_YEAR = 2022
END_YEAR = 2024
OUTPUT_FILE = f'synthetic_health_data_{START_YEAR}-{END_YEAR}_varied_hw_bmi.csv' # New filename

# --- Reproducibility ---
# The generator draws from both the stdlib `random` module and NumPy's global RNG.
# Seeding both here makes a fresh-kernel "Run All" regenerate the same dataset.
# Set RANDOM_SEED = None to seed from OS entropy instead (a different dataset per run).
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# --- Date Calculation ---
START_DATE = date(START_YEAR, 1, 1)
END_DATE = date(END_YEAR, 12, 31)
ROWS_PER_USER = (END_DATE - START_DATE).days + 1  # inclusive day count, one row per day

# --- Base Parameters ---
INITIAL_AGE_RANGE = (18, 85)  # inclusive bounds passed to random.randint
GENDER_DISTRIBUTION = ['Male', 'Female', 'Other']
GENDER_PROBS = [0.48, 0.48, 0.04]  # weights aligned with GENDER_DISTRIBUTION

# --- User Category Probabilities for H/W/BMI ---
PROB_NO_HW_DATA = 0.15      # 15% users have no H/W/BMI data
PROB_ANNUAL_WEIGHT_UPDATE = 0.25 # 25% users update weight annually
# Remaining (1 - 0.15 - 0.25 = 0.60) will have static H/W/BMI data

# --- Height Parameters (Static per User, if recorded) ---
# (mean, std) in inches
HEIGHT_BASE_MALE_IN = (69.0, 3.5)
HEIGHT_BASE_FEMALE_IN = (64.0, 3.0)
HEIGHT_CLAMP_IN = (55, 80)  # (min, max)

# --- Weight Parameters (Based on Age/Gender Averages, if recorded) ---
WEIGHT_STD_DEV_KG = 12.0
# Keys are inclusive (min_age, max_age) brackets. The generation loop below reads
# index 0 of each value for 'Female' and index 1 for 'Male'.
AVG_WEIGHT_KG = {
    (20, 29): (74.9, 85.5),
    (30, 39): (79.3, 94.3),
    (40, 49): (80.8, 93.9),
    (50, 59): (78.7, 91.9),
    (60, 69): (78.2, 91.3),
    (70, 79): (74.7, 87.7),
    (80, 99): (67.9, 80.5),
}
WEIGHT_CLAMP_KG = (40, 180)  # (min, max)
BMI_CLAMP = (15, 50)         # (min, max)
# For annual updates:
ANNUAL_WEIGHT_CHANGE_MEAN_KG = 0.1 # Small average change
ANNUAL_WEIGHT_CHANGE_STD_KG = 1.0 # Allow variation, including loss

# --- Physiological Parameters (Adjusted by Age Group) ---
# (These remain the same as the previous version)
# Tuples are (mean, std).
BP_SYSTOLIC_BASE = (120, 15)
BP_DIASTOLIC_BASE = (80, 10)
HEART_RATE_BASE_YOUNG = (72, 8)
SPO2_BASE_YOUNG = (97.8, 0.8)
ECG_METRIC_BASE = (70, 12)
BODY_TEMP_F_BASE = (98.2, 0.6)
SLEEP_DURATION_BASE_YOUNG = (7.7, 1.0)
SLEEP_QUALITY_BASE_YOUNG = (7.5, 1.3)
ACTIVITY_LEVEL_BASE_MEAN_YOUNG = 6500
ACTIVITY_LEVEL_BASE_STD = 1800
BASE_CALORIES_MEAN_F_YOUNG = 1600
BASE_CALORIES_MEAN_M_YOUNG = 1800
BASE_CALORIES_STD = 150

# Age-group adjustments relative to the 'Young' baselines.
# Activity and calories are multiplicative factors; the others are additive offsets.
ADJ_FACTOR_ACTIVITY_MID = 0.90
ADJ_FACTOR_SLEEP_DUR_MID = -0.3
ADJ_FACTOR_SLEEP_QUAL_MID = -0.4
ADJ_FACTOR_HR_MID = -2
ADJ_FACTOR_SPO2_MID = -0.1
ADJ_FACTOR_CALORIES_MID = 0.95

ADJ_FACTOR_ACTIVITY_SENIOR = 0.70
ADJ_FACTOR_SLEEP_DUR_SENIOR = -0.7
ADJ_FACTOR_SLEEP_QUAL_SENIOR = -1.0
ADJ_FACTOR_HR_SENIOR = -5
ADJ_FACTOR_SPO2_SENIOR = -0.4
ADJ_FACTOR_CALORIES_SENIOR = 0.85

# Daily variation/noise
BP_NOISE = (5, 3)  # (systolic std, diastolic std)
HR_NOISE = 5
SPO2_NOISE = 0.5
ECG_NOISE = 6
TEMP_F_NOISE = 0.4
SLEEP_DUR_NOISE = 0.8
SLEEP_QUAL_NOISE = 1.0
STEPS_NOISE_FACTOR = 0.4
CALORIES_BASE_NOISE = 100
CALORIES_PER_STEP = (0.04, 0.05)  # (min, max) of the per-step uniform draw

# Clamping ranges
BP_CLAMP = (85, 190, 55, 125)  # (systolic min, systolic max, diastolic min, diastolic max)
HR_CLAMP = (38, 170)
SPO2_CLAMP = (88.0, 100.0)
ECG_CLAMP = (38, 170)
TEMP_F_CLAMP = (95.0, 101.5)
STEPS_CLAMP = (0, 40000)
CALORIES_CLAMP = (800, 5000)
SLEEP_DUR_CLAMP = (2.0, 12.0)
SLEEP_QUAL_CLAMP = (1.0, 10.0)

# --- Outlier Configuration ---
OUTLIER_PROBABILITY_VITALS = 0.0015 # For non-H/W/BMI metrics
HW_BMI_OUTLIER_PROBABILITY = 0.0003 # Lower probability for H/W/BMI outliers

# Outlier Ranges for Vitals
OUTLIER_LOW_CALORIES_RANGE = (700, 1000)
OUTLIER_HIGH_CALORIES_RANGE = (4500, 6500)
OUTLIER_HR_RANGE = (170, 210)
OUTLIER_LOW_SPO2_RANGE = (80.0, 87.0)
OUTLIER_HIGH_STEPS_RANGE = (40000, 60000)
OUTLIER_HIGH_TEMP_F_RANGE = (101.5, 104.0)
OUTLIER_LOW_TEMP_F_RANGE = (93.0, 95.0)
# BP outlier tuples: (systolic min, systolic max, diastolic min, diastolic max)
OUTLIER_WIDE_BP_RANGE = (185, 220, 110, 130)
OUTLIER_LOW_BP_RANGE = (70, 85, 40, 55)

# Outlier Ranges for H/W/BMI (applied only if user HAS data)
OUTLIER_HIGH_HEIGHT_IN = (81, 88)
OUTLIER_LOW_HEIGHT_IN = (48, 54)
OUTLIER_HIGH_WEIGHT_KG = (181, 250)
OUTLIER_LOW_WEIGHT_KG = (30, 39)
# BMI outliers less common, maybe focus on H/W outliers causing BMI shifts

# --- Data Generation ---

print(f"Starting data generation for {NUM_USERS} users...")
print(f"Data Period: {START_DATE.strftime('%Y-%m-%d')} to {END_DATE.strftime('%Y-%m-%d')}")
print(f"Output file: {OUTPUT_FILE}")
print(f"H/W/BMI Logic: {PROB_NO_HW_DATA*100:.0f}% None, {PROB_ANNUAL_WEIGHT_UPDATE*100:.0f}% Annual Update, {(1-PROB_NO_HW_DATA-PROB_ANNUAL_WEIGHT_UPDATE)*100:.0f}% Static")


def _average_weights_for_age(age):
    """Return the AVG_WEIGHT_KG entry whose bracket contains `age`.

    Ages outside every bracket (e.g. 18-19, which INITIAL_AGE_RANGE allows but the
    table does not cover) snap to the bracket with the nearest bound instead of
    silently losing the gender-specific mean. Returns None only if the table is empty.
    """
    for (low, high), weights in AVG_WEIGHT_KG.items():
        if low <= age <= high:
            return weights
    if not AVG_WEIGHT_KG:
        return None
    nearest = min(AVG_WEIGHT_KG, key=lambda bounds: min(abs(age - bounds[0]), abs(age - bounds[1])))
    return AVG_WEIGHT_KG[nearest]


# Generate static user data first, determining H/W/BMI category
user_static_data = {}
print("Generating static user info (determining H/W/BMI category)...")
for i in tqdm(range(NUM_USERS), desc="User Info"):
    user_id = f"user_{i:04d}"
    initial_age = random.randint(*INITIAL_AGE_RANGE)
    birth_year = START_DATE.year - initial_age
    gender = random.choices(GENDER_DISTRIBUTION, weights=GENDER_PROBS, k=1)[0]

    # --- Determine User Category for H/W/BMI ---
    # One uniform draw split into three bands: 'None' | 'Annual' | 'Static'.
    rand_hw = random.random()
    if rand_hw < PROB_NO_HW_DATA:
        update_type = 'None'
    elif rand_hw < PROB_NO_HW_DATA + PROB_ANNUAL_WEIGHT_UPDATE:
        update_type = 'Annual'
    else:
        update_type = 'Static'

    # Defaults for users without H/W/BMI data.
    height_in = None
    initial_weight_kg = None
    static_bmi = None
    annual_weight_change_kg = 0.0

    # --- Generate H/W/BMI only if user is NOT 'None' category ---
    if update_type != 'None':
        # Static height: gender-specific (mean, std); 'Other' uses the midpoint of both.
        if gender == 'Male':
            height_mean, height_std = HEIGHT_BASE_MALE_IN
        elif gender == 'Other':
            height_mean = (HEIGHT_BASE_MALE_IN[0] + HEIGHT_BASE_FEMALE_IN[0]) / 2
            height_std = (HEIGHT_BASE_MALE_IN[1] + HEIGHT_BASE_FEMALE_IN[1]) / 2
        else: # Female
            height_mean, height_std = HEIGHT_BASE_FEMALE_IN
        height_in = np.clip(np.random.normal(height_mean, height_std), HEIGHT_CLAMP_IN[0], HEIGHT_CLAMP_IN[1])
        height_m = height_in * 0.0254

        # Initial/static weight: mean taken from the age bracket (index 0 = Female, 1 = Male).
        target_weight_mean_kg = 70  # only used if AVG_WEIGHT_KG is empty
        bracket_weights = _average_weights_for_age(initial_age)
        if bracket_weights is not None:
            if gender == 'Male':
                target_weight_mean_kg = bracket_weights[1]
            elif gender == 'Female':
                target_weight_mean_kg = bracket_weights[0]
            else:
                target_weight_mean_kg = (bracket_weights[0] + bracket_weights[1]) / 2
        initial_weight_kg = np.random.normal(target_weight_mean_kg, WEIGHT_STD_DEV_KG)
        initial_weight_kg = np.clip(initial_weight_kg, WEIGHT_CLAMP_KG[0], WEIGHT_CLAMP_KG[1])

        # Calculate Initial/Static BMI
        if height_m > 0:
            static_bmi = initial_weight_kg / (height_m ** 2)
            static_bmi = np.clip(static_bmi, BMI_CLAMP[0], BMI_CLAMP[1])
        else:
            static_bmi = 0 # Or None

        # If Annual Update, generate the per-year weight delta (fixed for this user)
        if update_type == 'Annual':
            annual_weight_change_kg = np.random.normal(ANNUAL_WEIGHT_CHANGE_MEAN_KG, ANNUAL_WEIGHT_CHANGE_STD_KG)

    # --- Determine Age Group and Adjust OTHER metrics (always needed) ---
    # 'Young' uses neutral adjustments (x1.0 / +0.0), so the shared formulas below
    # reproduce the unadjusted baselines for that group.
    if 18 <= initial_age <= 39:
        age_group = 'Young'
        activity_adj = 1.0
        sleep_dur_adj = 0.0
        sleep_qual_adj = 0.0
        hr_adj = 0.0
        spo2_adj = 0.0
        cal_adj = 1.0
    elif 40 <= initial_age <= 64:
        age_group = 'Middle'
        activity_adj = ADJ_FACTOR_ACTIVITY_MID
        sleep_dur_adj = ADJ_FACTOR_SLEEP_DUR_MID
        sleep_qual_adj = ADJ_FACTOR_SLEEP_QUAL_MID
        hr_adj = ADJ_FACTOR_HR_MID
        spo2_adj = ADJ_FACTOR_SPO2_MID
        cal_adj = ADJ_FACTOR_CALORIES_MID
    else: # 65+
        age_group = 'Senior'
        activity_adj = ADJ_FACTOR_ACTIVITY_SENIOR
        sleep_dur_adj = ADJ_FACTOR_SLEEP_DUR_SENIOR
        sleep_qual_adj = ADJ_FACTOR_SLEEP_QUAL_SENIOR
        hr_adj = ADJ_FACTOR_HR_SENIOR
        spo2_adj = ADJ_FACTOR_SPO2_SENIOR
        cal_adj = ADJ_FACTOR_CALORIES_SENIOR

    # Only 'Female' gets the female calorie baseline; 'Male' and 'Other' share the male one.
    young_calories_mean = BASE_CALORIES_MEAN_F_YOUNG if gender == 'Female' else BASE_CALORIES_MEAN_M_YOUNG
    activity_mean_base = ACTIVITY_LEVEL_BASE_MEAN_YOUNG * activity_adj
    cal_mean_base = young_calories_mean * cal_adj
    hr_base_mean = HEART_RATE_BASE_YOUNG[0] + hr_adj
    spo2_base_mean = SPO2_BASE_YOUNG[0] + spo2_adj
    sleep_dur_base_mean = SLEEP_DURATION_BASE_YOUNG[0] + sleep_dur_adj
    sleep_qual_base_mean = SLEEP_QUALITY_BASE_YOUNG[0] + sleep_qual_adj

    # Generate user-specific base values for fluctuating metrics.
    # max(...) puts a floor under each draw; temperature and ECG are left unfloored.
    user_activity_base = max(500, np.random.normal(activity_mean_base, ACTIVITY_LEVEL_BASE_STD * activity_adj))
    user_base_calories = max(1000, np.random.normal(cal_mean_base, BASE_CALORIES_STD * cal_adj))
    user_hr_base = max(45, np.random.normal(hr_base_mean, HEART_RATE_BASE_YOUNG[1]))
    user_spo2_base = max(90.0, np.random.normal(spo2_base_mean, SPO2_BASE_YOUNG[1]))
    user_sleep_dur_base = max(4.0, np.random.normal(sleep_dur_base_mean, SLEEP_DURATION_BASE_YOUNG[1]))
    user_sleep_qual_base = max(2.0, np.random.normal(sleep_qual_base_mean, SLEEP_QUALITY_BASE_YOUNG[1]))
    user_temp_f_base = np.random.normal(BODY_TEMP_F_BASE[0], BODY_TEMP_F_BASE[1])
    user_ecg_base = np.random.normal(ECG_METRIC_BASE[0], ECG_METRIC_BASE[1])

    # --- Store all data ---
    user_static_data[user_id] = {
        'birth_year': birth_year, 'gender': gender,
        'height_inches': height_in,       # Can be None
        'initial_weight_kg': initial_weight_kg,# Can be None, used for Static and Annual start
        'static_bmi': static_bmi,         # Can be None, used for Static
        'update_type': update_type,       # 'None', 'Static', 'Annual'
        'annual_weight_change_kg': annual_weight_change_kg, # Used for Annual
        'initial_age_group': age_group,
        'user_base_calories': user_base_calories, 'user_hr_base': user_hr_base,
        'user_spo2_base': user_spo2_base, 'user_temp_f_base': user_temp_f_base,
        'user_ecg_base': user_ecg_base, 'user_activity_base': user_activity_base,
        'user_sleep_dur_base': user_sleep_dur_base, 'user_sleep_qual_base': user_sleep_qual_base,
    }

# Open CSV file and write header
fieldnames = [
    'user_id', 'age', 'gender', 'datestamp', 'height_inches', 'weight_kg', 'bmi',
    'bp', 'heart_rate', 'spo2', 'ecg_avg', 'body_temp_f', 'step_count',
    'calories_burnt', 'sleep_duration_hr', 'sleep_quality_score'
]
total_rows_written = 0

with open(OUTPUT_FILE, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(fieldnames)

    start_time = time.time()

    # Loop through users and generate time series data row by row
    for user_id, static_info in tqdm(user_static_data.items(), desc="Writing User Data"):
        birth_year = static_info['birth_year']
        gender = static_info['gender']
        update_type = static_info['update_type']

        # Retrieve H/W/BMI info (might be None)
        height_in = static_info['height_inches']
        initial_weight = static_info['initial_weight_kg'] # Use initial weight as starting point
        static_bmi_val = static_info['static_bmi']
        annual_change = static_info['annual_weight_change_kg']
        height_m = height_in * 0.0254 if height_in is not None else 0

        # Running weight; only changes for 'Annual' users.
        current_weight_kg = initial_weight

        # Retrieve bases for fluctuating metrics
        base_calories = static_info['user_base_calories']
        hr_base = static_info['user_hr_base']
        spo2_base = static_info['user_spo2_base']
        temp_f_base = static_info['user_temp_f_base']
        ecg_base = static_info['user_ecg_base']
        activity_base = static_info['user_activity_base']
        sleep_dur_base = static_info['user_sleep_dur_base']
        sleep_qual_base = static_info['user_sleep_qual_base']

        # Minimal daily drift for HR/Temp base (one fixed rate per user)
        hr_drift_rate = np.random.normal(0, 0.0005)
        temp_f_drift_rate = np.random.normal(0, 0.0001)
        current_hr_base = hr_base
        current_temp_f_base = temp_f_base

        current_date = START_DATE

        for day_index in range(ROWS_PER_USER):
            current_age = current_date.year - birth_year

            # --- Handle H/W/BMI based on update_type ---
            # Values stay None (written as "") for users in the 'None' category.
            day_height = None
            day_weight = None
            day_bmi = None

            if update_type == 'Static':
                day_height = height_in
                day_weight = initial_weight
                day_bmi = static_bmi_val
            elif update_type == 'Annual':
                # Apply the annual change every Jan 1st except the very first row
                is_new_year = current_date.day == 1 and current_date.month == 1
                if day_index > 0 and is_new_year and current_weight_kg is not None:
                    current_weight_kg += annual_change
                    current_weight_kg = np.clip(current_weight_kg, WEIGHT_CLAMP_KG[0], WEIGHT_CLAMP_KG[1])

                # Recompute BMI from the running weight
                if current_weight_kg is not None and height_m > 0:
                    day_bmi = current_weight_kg / (height_m ** 2)
                    day_bmi = np.clip(day_bmi, BMI_CLAMP[0], BMI_CLAMP[1])

                day_height = height_in
                day_weight = current_weight_kg

            # --- Generate Daily Values for OTHER fluctuating metrics ---
            # NOTE: statement order fixes the order of RNG draws; keep it stable.
            bp_age_factor_sys = max(0, (current_age - 40) * 0.12)
            bp_age_factor_dia = max(0, (current_age - 40) * 0.07)
            sys_bp = np.random.normal(BP_SYSTOLIC_BASE[0] + bp_age_factor_sys, BP_NOISE[0])
            dia_bp = np.random.normal(BP_DIASTOLIC_BASE[0] + bp_age_factor_dia, BP_NOISE[1])
            sys_bp = np.clip(sys_bp, BP_CLAMP[0], BP_CLAMP[1])
            dia_bp = np.clip(dia_bp, BP_CLAMP[2], BP_CLAMP[3])
            if dia_bp >= sys_bp:
                dia_bp = sys_bp - random.uniform(10, 30)

            hr_val = np.random.normal(current_hr_base, HR_NOISE)
            hr_val = np.clip(hr_val, HR_CLAMP[0], HR_CLAMP[1])

            spo2_val = np.random.normal(spo2_base, SPO2_NOISE)
            spo2_val = np.clip(spo2_val, SPO2_CLAMP[0], SPO2_CLAMP[1])

            # ECG mean is shifted by half the gap between the drifted HR base and the young-adult HR mean
            ecg_val = np.random.normal(ecg_base + (current_hr_base - HEART_RATE_BASE_YOUNG[0])*0.5, ECG_NOISE)
            ecg_val = np.clip(ecg_val, ECG_CLAMP[0], ECG_CLAMP[1])

            temp_val = np.random.normal(current_temp_f_base, TEMP_F_NOISE)
            temp_val = np.clip(temp_val, TEMP_F_CLAMP[0], TEMP_F_CLAMP[1])

            sleep_h_val = np.random.normal(sleep_dur_base, SLEEP_DUR_NOISE)
            sleep_h_val = np.clip(sleep_h_val, SLEEP_DUR_CLAMP[0], SLEEP_DUR_CLAMP[1])

            # Sleep quality gets a small bonus inside 6.5-8.5 h and a small penalty outside
            sleep_q_val = np.random.normal(sleep_qual_base, SLEEP_QUAL_NOISE)
            if 6.5 < sleep_h_val < 8.5:
                sleep_q_val += random.uniform(0, 0.5)
            else:
                sleep_q_val -= random.uniform(0, 0.5)
            sleep_q_val = np.clip(sleep_q_val, SLEEP_QUAL_CLAMP[0], SLEEP_QUAL_CLAMP[1])

            # Poisson steps around the user's base, scaled by +/- STEPS_NOISE_FACTOR
            daily_activity_mean = activity_base * (1 + random.uniform(-STEPS_NOISE_FACTOR, STEPS_NOISE_FACTOR))
            steps_val = int(np.clip(np.random.poisson(daily_activity_mean), STEPS_CLAMP[0], STEPS_CLAMP[1]))

            daily_cal_age_factor = 1.0 - max(0, (current_age - 40) * 0.001)
            calories_val = (base_calories * daily_cal_age_factor) + np.random.normal(0, CALORIES_BASE_NOISE) + steps_val * random.uniform(CALORIES_PER_STEP[0], CALORIES_PER_STEP[1])
            calories_val = np.clip(calories_val, CALORIES_CLAMP[0], CALORIES_CLAMP[1])

            # --- Introduce Outliers ---
            # Vital Outliers: each metric is rolled independently
            if random.random() < OUTLIER_PROBABILITY_VITALS:
                if random.random() < 0.5:
                    calories_val = random.uniform(*OUTLIER_LOW_CALORIES_RANGE)
                else:
                    calories_val = random.uniform(*OUTLIER_HIGH_CALORIES_RANGE)
            if random.random() < OUTLIER_PROBABILITY_VITALS:
                hr_val = random.uniform(*OUTLIER_HR_RANGE)
            if random.random() < OUTLIER_PROBABILITY_VITALS:
                spo2_val = random.uniform(*OUTLIER_LOW_SPO2_RANGE)
            if random.random() < OUTLIER_PROBABILITY_VITALS:
                steps_val = int(random.uniform(*OUTLIER_HIGH_STEPS_RANGE))
            if random.random() < OUTLIER_PROBABILITY_VITALS:
                if random.random() < 0.8:
                    temp_val = random.uniform(*OUTLIER_HIGH_TEMP_F_RANGE)
                else:
                    temp_val = random.uniform(*OUTLIER_LOW_TEMP_F_RANGE)
            if random.random() < OUTLIER_PROBABILITY_VITALS:
                bp_outlier_range = OUTLIER_WIDE_BP_RANGE if random.random() < 0.7 else OUTLIER_LOW_BP_RANGE
                sys_bp = random.uniform(bp_outlier_range[0], bp_outlier_range[1])
                dia_bp = random.uniform(bp_outlier_range[2], bp_outlier_range[3])
                if dia_bp >= sys_bp:
                    dia_bp = sys_bp - random.uniform(5, 15)

            # H/W/BMI Outliers (only if user has data); affect this row only
            if update_type != 'None' and random.random() < HW_BMI_OUTLIER_PROBABILITY:
                outlier_choice = random.choice(['height', 'weight']) # Focus on H/W outliers
                if outlier_choice == 'height' and day_height is not None:
                    if random.random() < 0.5:
                        day_height = random.uniform(*OUTLIER_LOW_HEIGHT_IN)
                    else:
                        day_height = random.uniform(*OUTLIER_HIGH_HEIGHT_IN)
                    # Recalculate BMI if height outlier occurs and weight exists
                    if day_weight is not None:
                        outlier_height_m = day_height * 0.0254
                        if outlier_height_m > 0:
                            day_bmi = day_weight / (outlier_height_m ** 2)
                            day_bmi = np.clip(day_bmi, BMI_CLAMP[0], BMI_CLAMP[1])
                elif outlier_choice == 'weight' and day_weight is not None:
                    if random.random() < 0.5:
                        day_weight = random.uniform(*OUTLIER_LOW_WEIGHT_KG)
                    else:
                        day_weight = random.uniform(*OUTLIER_HIGH_WEIGHT_KG)
                    # Recalculate BMI if weight outlier occurs and height exists
                    if height_m > 0:
                        day_bmi = day_weight / (height_m ** 2)
                        day_bmi = np.clip(day_bmi, BMI_CLAMP[0], BMI_CLAMP[1])

            # Format final values (missing H/W/BMI become empty CSV fields)
            output_height = f"{day_height:.1f}" if day_height is not None else ""
            output_weight = f"{day_weight:.1f}" if day_weight is not None else ""
            output_bmi = f"{day_bmi:.1f}" if day_bmi is not None else ""
            bp_val = f"{int(round(sys_bp))}/{int(round(dia_bp))}"

            # --- Format data for CSV row ---
            row = [
                user_id,
                current_age,
                gender,
                current_date.strftime('%Y-%m-%d'),
                output_height, # Can be ""
                output_weight, # Can be ""
                output_bmi,    # Can be ""
                bp_val,
                f"{hr_val:.1f}",
                f"{spo2_val:.1f}",
                f"{ecg_val:.1f}",
                f"{temp_val:.1f}",
                steps_val,
                f"{calories_val:.0f}",
                f"{sleep_h_val:.1f}",
                f"{sleep_q_val:.1f}"
            ]
            writer.writerow(row)
            total_rows_written += 1

            # Increment date
            current_date += timedelta(days=1)

            # Apply slow drift to HR/Temp base, kept inside a band derived from the clamps
            current_hr_base += hr_drift_rate
            current_temp_f_base += temp_f_drift_rate
            current_hr_base = np.clip(current_hr_base, HR_CLAMP[0]*0.95, HR_CLAMP[1]*0.9)
            current_temp_f_base = np.clip(current_temp_f_base, TEMP_F_CLAMP[0]*1.01, TEMP_F_CLAMP[1]*0.99)

# The file is closed (and therefore fully flushed) here, so the size reported
# below reflects everything that was written.
end_time = time.time()
duration = end_time - start_time
print("\n--- Finished Data Generation ---")
print(f"Total rows written: {total_rows_written:,}")
print(f"Data saved to: {OUTPUT_FILE}")
print(f"Total time: {duration:.2f} seconds ({duration/60:.2f} minutes)")

# Final file size check
try:
    file_size_bytes = os.path.getsize(OUTPUT_FILE)
    file_size_mb = file_size_bytes / (1024**2)
    print(f"Final file size: {file_size_mb:.2f} MB")
except FileNotFoundError:
    print(f"Error: Output file {OUTPUT_FILE} not found.")
except OSError as e:
    print(f"Error checking file size: {e}")

Starting data generation for 30 users...
Data Period: 2022-01-01 to 2024-12-31
Output file: synthetic_health_data_2022-2024_varied_hw_bmi.csv
H/W/BMI Logic: 15% None, 25% Annual Update, 60% Static
Generating static user info (determining H/W/BMI category)...


User Info: 100%|██████████| 30/30 [00:00<00:00, 3399.04it/s]
Writing User Data: 100%|██████████| 30/30 [00:06<00:00,  4.78it/s]


--- Finished Data Generation ---
Total rows written: 32,880
Data saved to: synthetic_health_data_2022-2024_varied_hw_bmi.csv
Total time: 6.29 seconds (0.10 minutes)
Final file size: 2.80 MB





In [None]:
import pandas as pd
import numpy as np # Needed for NaN handling if replacing blanks later

# --- Configuration ---
# !!! IMPORTANT: Set this to the EXACT filename generated by the previous script !!!
FILE_TO_ANALYZE = 'synthetic_health_data_2022-2024_varied_hw_bmi.csv'

print(f"--- Analyzing File: {FILE_TO_ANALYZE} ---")

# Flipped to True only when every step below finishes, so the closing banner
# cannot claim success after a load or analysis failure.
analysis_succeeded = False

try:
    # --- Load the Dataset ---
    # Use na_values=[''] to interpret empty strings (like we used for missing H/W/BMI) as NaN
    # Parse the 'datestamp' column as dates directly
    print("Loading dataset into pandas DataFrame...")
    df = pd.read_csv(FILE_TO_ANALYZE, na_values=[''], parse_dates=['datestamp'])
    print("Dataset loaded successfully.")

    # --- Basic Information ---
    print("\n--- DataFrame Info (Columns, Non-Null Counts, Dtypes) ---")
    df.info()

    # --- Summary Statistics ---
    print("\n--- Summary Statistics for Numerical Columns ---")
    # .describe() provides count, mean, std, min, quartiles, max for numeric columns;
    # NaN values are excluded from the calculations (and from the count).
    # NOTE: this sets a global pandas display option that persists for later cells.
    pd.options.display.float_format = '{:.2f}'.format
    print(df.describe())

    print("\n--- Summary Statistics for Non-Numerical (Object/Categorical) Columns ---")
    # include='object' summarises string/object columns: count, unique, top, freq.
    # 'datestamp' is parsed as datetime above, so it is not included here.
    print(df.describe(include=['object']))

    # --- Missing Value Analysis ---
    print("\n--- Missing Value Counts per Column ---")
    missing_values = df.isnull().sum() # Counts NaN values per column

    # Filter to show only columns that *have* missing values
    missing_values = missing_values[missing_values > 0]

    if not missing_values.empty:
        print("Columns with missing values:")
        # Calculate percentage of missing values
        total_rows = len(df)
        missing_percentage = (missing_values / total_rows) * 100
        missing_summary = pd.DataFrame({
            'Missing Count': missing_values,
            'Missing Percentage': missing_percentage.map('{:.2f}%'.format) # Format as percentage string
        })
        print(missing_summary)
    else:
        print("No missing values found in the dataset.")

    analysis_succeeded = True

except FileNotFoundError:
    print("\n--- ERROR ---")
    print(f"File not found: '{FILE_TO_ANALYZE}'")
    print("Please make sure the file exists in the same directory as the script,")
    print("or provide the correct path, and that the filename matches exactly.")
except Exception as e:
    # Best-effort reporting: keep the notebook running, but say what kind of error it was.
    print("\n--- An error occurred during analysis: ---")
    print(f"{type(e).__name__}: {e}")

if analysis_succeeded:
    print("\n--- Analysis Complete ---")
else:
    print("\n--- Analysis Aborted (see error above) ---")

--- Analyzing File: synthetic_health_data_2022-2024_varied_hw_bmi.csv ---
Loading dataset into pandas DataFrame...
Dataset loaded successfully.

--- DataFrame Info (Columns, Non-Null Counts, Dtypes) ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32880 entries, 0 to 32879
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   user_id              32880 non-null  object        
 1   age                  32880 non-null  int64         
 2   gender               32880 non-null  object        
 3   datestamp            32880 non-null  datetime64[ns]
 4   height_inches        28496 non-null  float64       
 5   weight_kg            28496 non-null  float64       
 6   bmi                  28496 non-null  float64       
 7   bp                   32880 non-null  object        
 8   heart_rate           32880 non-null  float64       
 9   spo2                 32880 non-null  float64       
 10 

Overall Purpose:

This script is designed to generate a large (targeting approximately 10GB) synthetic dataset simulating daily health records for a large number of adult users over a 3-year period (2022-2024). It aims for realism by incorporating:

Age-appropriate physiological baselines.
Trends related to aging (e.g., changes in activity, BMR, BP).
Individual variation between users.
Daily fluctuations (noise) in measurements.
Variability in how height, weight, and BMI data are reported (missing, static, or annually updated).
Occasional data outliers simulating measurement errors or extreme events.
How Many Rows Will It Generate?

Users: NUM_USERS = 129000
Days per User: ROWS_PER_USER = 1096 (Calculated for Jan 1, 2022, to Dec 31, 2024, inclusive)
Total Rows: 129,000 users * 1096 days/user = 141,384,000 rows (plus one header row).
Logic Behind Each Column:

The script generates data row-by-row and writes it to a CSV file (synthetic_health_data_10gb_2022-2024_varied_hw_bmi.csv). Here's the logic for each column:

user_id:
Logic: A unique identifier for each simulated person, formatted as user_XXXXXX (e.g., user_000000, user_128999). Generated sequentially during the initial user setup.
Static/Dynamic: Static per user.
age:
Logic: Calculated daily based on the current_date of the row and the user's birth_year (which was determined from their initial_age at the START_DATE). It increments correctly each year.
Static/Dynamic: Dynamic (changes yearly).
gender:
Logic: Assigned once per user during setup based on the GENDER_DISTRIBUTION probabilities ('Male': 48%, 'Female': 48%, 'Other': 4%). Used to influence height, weight, and base calorie generation.
Static/Dynamic: Static per user.
datestamp:
Logic: Represents the specific date for the row's data. Starts at START_DATE (2022-01-01) and increments by one day for each of the 1096 rows per user, correctly handling month lengths and leap years.
Static/Dynamic: Dynamic (changes daily).
height_inches:
Logic: Represents the user's height in inches.
Category: Determined randomly during user setup (15% chance of 'None', 25% 'Annual', 60% 'Static').
Generation (if not 'None'): Generated once per user during setup using a normal distribution centered around gender-specific averages (HEIGHT_BASE_MALE_IN, HEIGHT_BASE_FEMALE_IN) with added standard deviation. Clamped within HEIGHT_CLAMP_IN.
Value: Remains constant throughout the 3 years for a user if they have data. Output as an empty string ("") if the user is in the 'None' category.
Outliers: A very small chance (HW_BMI_OUTLIER_PROBABILITY) exists for this value to be replaced by an extreme high/low value for a single day, only if the user has height data.
Static/Dynamic: Static per user (unless an outlier occurs for a specific day).
weight_kg:
Logic: Represents the user's weight in kilograms.
Category: Same as height ('None', 'Annual', 'Static').
Generation (if not 'None'): An initial weight is generated once per user during setup using a normal distribution centered around the average weight for their specific gender and initial age decade (from AVG_WEIGHT_KG), plus standard deviation (WEIGHT_STD_DEV_KG). Clamped within WEIGHT_CLAMP_KG.
Value:
If 'None': Output as "".
If 'Static': Output is the constant initial_weight_kg.
If 'Annual': Starts at initial_weight_kg. On Jan 1st of 2023 and 2024, a pre-determined random amount (annual_weight_change_kg) is added (can be positive or negative). The output reflects this potentially changing value.
Outliers: A very small chance exists for this value to be replaced by an extreme high/low value for a single day, only if the user has weight data.
Static/Dynamic: Static ('Static' type), Annually Updated ('Annual' type), or Missing ('None').
bmi:
Logic: Body Mass Index (kg/m²).
Category: Inherited from H/W ('None', 'Annual', 'Static').
Calculation: Calculated based on the weight_kg and height_inches (converted to meters) for that specific day.
Value:
If 'None': Output as "".
If 'Static': A constant value calculated from static height and weight.
If 'Annual': Recalculated daily based on the potentially changing current_weight_kg and static height. Clamped within BMI_CLAMP.
Outliers: If a height or weight outlier occurs, BMI is recalculated based on the outlier value for that day.
Static/Dynamic: Static ('Static' type), Annually Updated ('Annual' type), or Missing ('None').
bp (Blood Pressure):
Logic: Systolic/Diastolic values (e.g., "125/83").
Base: An individual base BP (systolic and diastolic separately) is established per user, adjusted based on their initial age group using get_bp_age_factor.
Daily Value: Generated daily by adding random noise (BP_NOISE) to the user's base BP. A small additional daily age effect might be applied based on current_age. Values are clamped, and diastolic is ensured to be lower than systolic.
Outliers: Small chance (OUTLIER_PROBABILITY_VITALS) of being replaced by extreme high or low BP values.
Static/Dynamic: Dynamic (daily fluctuations).
heart_rate:
Logic: Average daily heart rate (beats per minute).
Base: An individual base HR is established per user, adjusted based on their initial age group using get_hr_age_factor.
Daily Value: Generated daily by adding random noise (HR_NOISE) to the user's current drifted base HR (current_hr_base). This base has a slow random drift applied daily. Values are clamped.
Outliers: Small chance of being replaced by an extreme high HR value.
Static/Dynamic: Dynamic (daily fluctuations + slow drift).
spo2 (Blood Oxygen Saturation):
Logic: Percentage of oxygen saturation.
Base: An individual base SpO2 is established per user, potentially adjusted slightly by initial age group.
Daily Value: Generated daily by adding random noise (SPO2_NOISE) to the user's base SpO2. Values are clamped. Less strongly age-dependent in this simulation.
Outliers: Small chance of being replaced by an extreme low SpO2 value.
Static/Dynamic: Dynamic (daily fluctuations).
ecg_avg:
Logic: An abstract representation of a daily average ECG metric (e.g., related to rhythm or overall rate).
Base: An individual base ECG metric is established per user, potentially linked to the initial HR base via get_hr_age_factor.
Daily Value: Generated daily by adding random noise (ECG_NOISE) to the ECG base, potentially influenced slightly by the deviation of the current HR from the user's baseline HR. Values are clamped.
Static/Dynamic: Dynamic (daily fluctuations).
body_temp_f:
Logic: Average daily body temperature in Fahrenheit.
Base: An individual base temperature is established per user.
Daily Value: Generated daily by adding random noise (TEMP_F_NOISE) to the user's current drifted base temperature (current_temp_f_base). This base has a slow random drift applied daily. Values are clamped.
Outliers: Small chance of being replaced by an extreme high (fever) or low temperature value.
Static/Dynamic: Dynamic (daily fluctuations + slow drift).
step_count:
Logic: Total steps taken during the day.
Base: An individual base activity level (average steps) is established per user, adjusted based on their initial age group using get_activity_age_factor.
Daily Value: Generated daily using a Poisson distribution centered around the user's base activity level, with additional multiplicative daily randomness (1 + random.uniform(-0.4, 0.6)). Values are clamped.
Outliers: Small chance of being replaced by an extremely high step count.
Static/Dynamic: Dynamic (daily fluctuations).
calories_burnt:
Logic: Estimated total calories burned during the day.
Base: An individual base calorie need (like BMR) is established per user, adjusted based on their initial age group using get_bmr_age_factor and gender.
Daily Value: Calculated as (age-adjusted base calories) + (calories from steps). Calories from steps are calculated as steps_val * random factor. Values are clamped.
Outliers: Small chance of being replaced by an extremely high or low calorie value.
Static/Dynamic: Dynamic (depends heavily on daily steps).
sleep_duration_hr:
Logic: Total sleep duration in hours.
Base: An individual base sleep duration is established per user, adjusted based on their initial age group using get_sleep_dur_age_factor.
Daily Value: Generated daily by adding random noise (SLEEP_DUR_NOISE) to the user's base duration. Values are clamped.
Static/Dynamic: Dynamic (daily fluctuations).
sleep_quality_score:
Logic: A subjective score (e.g., 1-10) representing sleep quality.
Base: An individual base sleep quality score is established per user, adjusted based on their initial age group using get_sleep_qual_age_factor.
Daily Value: Generated daily by adding random noise (SLEEP_QUAL_NOISE) to the user's base quality. A simple correlation is added: slightly higher quality if sleep duration is near optimal (6.5-8.5 hrs), slightly lower otherwise. Values are clamped.
Static/Dynamic: Dynamic (daily fluctuations).


In [None]:
# -*- coding: utf-8 -*-
import csv
import numpy as np
# import pandas as pd # Not strictly needed for this CSV version
from datetime import datetime, timedelta, date
import random
import os
from tqdm import tqdm # Progress bar library (install: pip install tqdm)
import time # To time the execution

# --- Configuration for 10GB Dataset ---
# Number of simulated users; keep the corrected number for ~10GB.
NUM_USERS = 129_000

# Calendar span of the simulation (both years inclusive).
START_YEAR = 2022
END_YEAR = 2024
# --- Updated Filename ---
OUTPUT_FILE_CSV = f'synthetic_health_data_10gb_{START_YEAR}-{END_YEAR}_varied_hw_bmi.csv'

# --- Date Calculation ---
START_DATE = date(year=START_YEAR, month=1, day=1)
END_DATE = date(year=END_YEAR, month=12, day=31)
# One row per user per calendar day, end date included (1096 days for 2022-2024).
ROWS_PER_USER = (END_DATE - START_DATE).days + 1

# --- Base Parameters ---
# (min, max) bounds passed to random.randint for the age at START_DATE.
INITIAL_AGE_RANGE = (18, 85)
# Gender labels and the sampling weight of each label, position by position.
GENDER_DISTRIBUTION = ['Male', 'Female', 'Other']
GENDER_PROBS = [0.48, 0.48, 0.04]

# --- User Category Probabilities for H/W/BMI ---
# 15% of users have no H/W/BMI data at all.
PROB_NO_HW_DATA = 0.15
# 25% of users update their weight annually.
PROB_ANNUAL_WEIGHT_UPDATE = 0.25
# Remaining (1 - 0.15 - 0.25 = 0.60) will have static H/W/BMI data

# --- Height Parameters (Static per User, if recorded) ---
# Each base is (mean, standard deviation) in inches.
HEIGHT_BASE_MALE_IN = (69.0, 3.5)
HEIGHT_BASE_FEMALE_IN = (64.0, 3.0)
# (min, max) bounds applied to a sampled height.
HEIGHT_CLAMP_IN = (55, 80)

# --- Weight Parameters (Based on Age/Gender Averages, if recorded) ---
WEIGHT_STD_DEV_KG = 12.0
# Keys are inclusive (min_age, max_age) brackets. The user-setup loop reads index 0
# of the value for 'Female' and index 1 for 'Male'.
# NOTE: no bracket covers ages 18-19 even though INITIAL_AGE_RANGE starts at 18;
# the weight lookup in the user-setup loop keeps its default mean for those ages.
AVG_WEIGHT_KG = {
    (20, 29): (74.9, 85.5),
    (30, 39): (79.3, 94.3),
    (40, 49): (80.8, 93.9),
    (50, 59): (78.7, 91.9),
    (60, 69): (78.2, 91.3),
    (70, 79): (74.7, 87.7),
    (80, 99): (67.9, 80.5),
}
WEIGHT_CLAMP_KG = (40, 180)
BMI_CLAMP = (15, 50)
# For annual updates: mean and standard deviation of the per-user yearly change.
ANNUAL_WEIGHT_CHANGE_MEAN_KG = 0.1
ANNUAL_WEIGHT_CHANGE_STD_KG = 1.0

# --- Physiological Parameters (Adjusted by Age Group) ---
# (Same as before) Tuples are (mean, standard deviation).
INITIAL_BP_SYSTOLIC_BASE = (118, 12)
INITIAL_BP_DIASTOLIC_BASE = (78, 8)
INITIAL_HEART_RATE_BASE = (68, 8)
INITIAL_SPO2_BASE = (97.5, 1.0)
INITIAL_ECG_METRIC_BASE = (68, 10)
INITIAL_BODY_TEMP_F_BASE = (98.2, 0.6)
INITIAL_SLEEP_DURATION_BASE = (7.6, 1.0)
INITIAL_SLEEP_QUALITY_BASE = (7.5, 1.2)
INITIAL_ACTIVITY_LEVEL_BASE_MEAN = 6500
INITIAL_ACTIVITY_LEVEL_BASE_STD = 1800
INITIAL_BASE_CALORIES_MEAN_F = 1600
INITIAL_BASE_CALORIES_MEAN_M = 1900
INITIAL_BASE_CALORIES_STD = 200

# --- Age Adjustment Factor Functions (Same as before) ---
def get_bp_age_factor(age):
    """Return the blood-pressure multiplier for ``age``.

    The factor is 1.0 up to age 40 and grows by 0.003 per year beyond 40.
    """
    years_past_40 = age - 40
    if years_past_40 > 0:
        return 1.0 + years_past_40 * 0.003
    return 1.0


def get_hr_age_factor(age):
    """Return the heart-rate multiplier: 1.0 up to age 75, then +0.002 per extra year."""
    if age > 75:
        return 1.0 + (age - 75) * 0.002
    return 1.0


def get_activity_age_factor(age):
    """Return the activity (step count) multiplier for ``age``.

    1.0 up to age 30; minus 0.011 per year from 30 to 65; beyond 65 the value
    reached at 65 is reduced by a further 0.018 per year. No lower bound is applied.
    """
    if age <= 30:
        return 1.0
    if age <= 65:
        return 1.0 - (age - 30) * 0.011
    factor_at_65 = 1.0 - (65 - 30) * 0.011
    return factor_at_65 - (age - 65) * 0.018


def get_bmr_age_factor(age):
    """Return the base-calorie multiplier: minus 0.004 per year past 30, never below 0.7."""
    decline = max(0, (age - 30) * 0.004)
    return max(0.7, 1.0 - decline)


def get_sleep_dur_age_factor(age):
    """Return the sleep-duration multiplier: minus 0.002 per year past 55."""
    reduction = max(0, (age - 55) * 0.002)
    return 1.0 - reduction


def get_sleep_qual_age_factor(age):
    """Return the sleep-quality multiplier: minus 0.004 per year past 50."""
    reduction = max(0, (age - 50) * 0.004)
    return 1.0 - reduction

# Daily variation/noise (Same as before)
# Standard deviations of the per-day normal noise added around each user's base.
BP_NOISE = (6, 4)  # index 0 is used for systolic, index 1 for diastolic
HR_NOISE = 5
SPO2_NOISE = 0.5
ECG_NOISE = 6
TEMP_F_NOISE = 0.4
SLEEP_DUR_NOISE = 0.8
SLEEP_QUAL_NOISE = 1.0
# NOTE: not referenced elsewhere in this cell; the step generator uses literal bounds.
STEPS_NOISE_FACTOR = 0.4
# (low, high) bounds of the random calories-per-step multiplier.
ACTIVITY_CALORIES_PER_STEP = (0.04, 0.055)

# Clamping ranges (Same as before)
# BP_CLAMP layout: (systolic min, systolic max, diastolic min, diastolic max).
BP_CLAMP = (85, 190, 55, 125)
HR_CLAMP = (40, 170)
SPO2_CLAMP = (92.0, 100.0)
ECG_CLAMP = (40, 170)
TEMP_F_CLAMP = (96.0, 101.0)
STEPS_CLAMP = (0, 40000)
CALORIES_CLAMP = (800, 7000)
SLEEP_DUR_CLAMP = (3.0, 11.0)
SLEEP_QUAL_CLAMP = (2.0, 10.0)

# --- Outlier Configuration ---
# Per-row, per-metric chance of replacing a non-H/W/BMI value with an outlier.
OUTLIER_PROBABILITY_VITALS = 0.0015
# Lower per-row probability for H/W/BMI outliers.
HW_BMI_OUTLIER_PROBABILITY = 0.0003

# Outlier Ranges for Vitals
OUTLIER_LOW_CALORIES_RANGE = (700, 1000)
OUTLIER_HIGH_CALORIES_RANGE = (4500, 6500)  # Example only
OUTLIER_HR_RANGE = (175, 210)
OUTLIER_LOW_SPO2_RANGE = (85.0, 91.0)
OUTLIER_HIGH_STEPS_RANGE = (45000, 60000)
OUTLIER_HIGH_TEMP_F_RANGE = (101.5, 104.0)
OUTLIER_LOW_TEMP_F_RANGE = (93.0, 95.5)
# BP outlier layout: (systolic low, systolic high, diastolic low, diastolic high).
OUTLIER_WIDE_BP_RANGE = (190, 230, 115, 135)
OUTLIER_LOW_BP_RANGE = (70, 85, 40, 55)

# Outlier Ranges for H/W/BMI (applied only if user HAS data)
OUTLIER_HIGH_HEIGHT_IN = (81, 88)
OUTLIER_LOW_HEIGHT_IN = (48, 54)
OUTLIER_HIGH_WEIGHT_KG = (181, 250)
OUTLIER_LOW_WEIGHT_KG = (30, 39)

# --- Data Generation ---
# Print the run banner in one call; sep="\n" places every item on its own line.
print(
    f"--- Generating Synthetic Health Data (Target: ~10GB) ---",
    f"Number of Users: {NUM_USERS}",
    f"Rows per User: {ROWS_PER_USER} ({START_DATE.strftime('%Y-%m-%d')} to {END_DATE.strftime('%Y-%m-%d')})",
    f"Estimated Total Rows: {NUM_USERS * ROWS_PER_USER:,}",
    f"Output File (CSV): {OUTPUT_FILE_CSV}",
    f"H/W/BMI Logic: {PROB_NO_HW_DATA*100:.0f}% None, {PROB_ANNUAL_WEIGHT_UPDATE*100:.0f}% Annual Update, {(1-PROB_NO_HW_DATA-PROB_ANNUAL_WEIGHT_UPDATE)*100:.0f}% Static",
    "This will take a significant amount of time...",
    sep="\n",
)

# Wall-clock start; the final report subtracts this to show the total duration.
start_time = time.time()

# Generate static user data first
# Builds user_static_data: one dict per user holding everything that stays fixed for
# the whole simulation (birth year, gender, optional height/weight/BMI, the H/W/BMI
# update category, and the per-user baselines used by the daily generator).
# Draws come from both `random` and `np.random`; no seed is set in this cell, so the
# generated values differ from run to run.
user_static_data = {}
print("\nGenerating initial static user info (determining H/W/BMI category)...")
for i in tqdm(range(NUM_USERS), desc="User Info Setup"):
    user_id = f"user_{i:06d}" # 6 digits for large user count
    # Age at START_DATE; random.randint includes both ends of INITIAL_AGE_RANGE.
    initial_age = random.randint(*INITIAL_AGE_RANGE)
    # Stored as a birth year so the writer can derive age from the row's calendar year.
    birth_year = START_DATE.year - initial_age
    gender = random.choices(GENDER_DISTRIBUTION, weights=GENDER_PROBS, k=1)[0]

    # --- Determine User Category for H/W/BMI ---
    # One uniform draw split into three bands: 'None', then 'Annual', then 'Static'.
    rand_hw = random.random()
    height_in = None
    initial_weight_kg = None
    static_bmi = None
    annual_weight_change_kg = 0.0
    update_type = 'None'

    if rand_hw < PROB_NO_HW_DATA: update_type = 'None'
    elif rand_hw < PROB_NO_HW_DATA + PROB_ANNUAL_WEIGHT_UPDATE: update_type = 'Annual'
    else: update_type = 'Static'

    # --- Generate H/W/BMI only if user is NOT 'None' category ---
    if update_type != 'None':
        # Generate STATIC Height
        # 'Other' uses the average of the male and female mean and std dev.
        if gender == 'Male': height_in = np.random.normal(HEIGHT_BASE_MALE_IN[0], HEIGHT_BASE_MALE_IN[1])
        elif gender == 'Other':
            avg_mean_height = (HEIGHT_BASE_MALE_IN[0] + HEIGHT_BASE_FEMALE_IN[0]) / 2
            avg_std_height = (HEIGHT_BASE_MALE_IN[1] + HEIGHT_BASE_FEMALE_IN[1]) / 2
            height_in = np.random.normal(avg_mean_height, avg_std_height)
        else: height_in = np.random.normal(HEIGHT_BASE_FEMALE_IN[0], HEIGHT_BASE_FEMALE_IN[1])
        height_in = np.clip(height_in, HEIGHT_CLAMP_IN[0], HEIGHT_CLAMP_IN[1])
        height_m = height_in * 0.0254  # inches -> metres for the BMI formula

        # Generate Initial/Static Weight
        # Default mean used when no AVG_WEIGHT_KG bracket contains the age
        # (with the brackets defined in this cell that is ages 18-19).
        target_weight_mean_kg = 70
        for age_range, weights in AVG_WEIGHT_KG.items():
            if age_range[0] <= initial_age <= age_range[1]:
                # weights[1] is the male mean, weights[0] the female mean; 'Other' averages them.
                if gender == 'Male': target_weight_mean_kg = weights[1]
                elif gender == 'Female': target_weight_mean_kg = weights[0]
                else: target_weight_mean_kg = (weights[0] + weights[1]) / 2
                break
        initial_weight_kg = np.random.normal(target_weight_mean_kg, WEIGHT_STD_DEV_KG)
        initial_weight_kg = np.clip(initial_weight_kg, WEIGHT_CLAMP_KG[0], WEIGHT_CLAMP_KG[1])

        # Calculate Initial/Static BMI
        # BMI = weight (kg) / height (m) squared, then clipped to BMI_CLAMP.
        # height_in was just clipped to HEIGHT_CLAMP_IN, so with the clamp defined in
        # this cell height_m is always positive and the else branch is only defensive.
        if height_m > 0:
            static_bmi = initial_weight_kg / (height_m ** 2)
            static_bmi = np.clip(static_bmi, BMI_CLAMP[0], BMI_CLAMP[1])
        else: static_bmi = None # Handle potential zero height

        # If Annual Update, generate the annual change factor
        # Drawn once per user; the writer adds the same amount at every year boundary.
        if update_type == 'Annual':
            annual_weight_change_kg = np.random.normal(ANNUAL_WEIGHT_CHANGE_MEAN_KG, ANNUAL_WEIGHT_CHANGE_STD_KG)

    # --- Determine Age Group and Adjust OTHER metrics ---
    # (Logic to calculate age-adjusted bases for HR, SpO2, activity, etc. remains the same)
    # age_group is stored below as 'initial_age_group'; the writing loop in this cell does not read it.
    if 18 <= initial_age <= 39: age_group = 'Young'
    elif 40 <= initial_age <= 64: age_group = 'Middle'
    else: age_group = 'Senior'

    bp_age_factor = get_bp_age_factor(initial_age) # Example base adjustment
    hr_age_factor = get_hr_age_factor(initial_age)
    activity_age_factor = get_activity_age_factor(initial_age)
    bmr_age_factor = get_bmr_age_factor(initial_age)
    sleep_dur_age_factor = get_sleep_dur_age_factor(initial_age)
    sleep_qual_age_factor = get_sleep_qual_age_factor(initial_age)

    # Population means scaled by the age factors (the [0] element of each base tuple is the mean).
    effective_sys_bp_base = INITIAL_BP_SYSTOLIC_BASE[0] * bp_age_factor # Adjust base mean
    effective_dia_bp_base = INITIAL_BP_DIASTOLIC_BASE[0] * bp_age_factor
    effective_hr_base = INITIAL_HEART_RATE_BASE[0] * hr_age_factor
    effective_activity_base = INITIAL_ACTIVITY_LEVEL_BASE_MEAN * activity_age_factor
    # Only 'Female' uses the female calorie mean; 'Male' and 'Other' both use the male mean.
    base_cal_mean = (INITIAL_BASE_CALORIES_MEAN_F if gender=='Female' else INITIAL_BASE_CALORIES_MEAN_M) * bmr_age_factor
    effective_sleep_dur_base = INITIAL_SLEEP_DURATION_BASE[0] * sleep_dur_age_factor
    effective_sleep_qual_base = INITIAL_SLEEP_QUALITY_BASE[0] * sleep_qual_age_factor
    effective_ecg_base = INITIAL_ECG_METRIC_BASE[0] * hr_age_factor # Link ECG base to HR factor

    # Generate individual baseline from the age-adjusted effective base
    user_sys_bp_base = np.random.normal(effective_sys_bp_base, INITIAL_BP_SYSTOLIC_BASE[1])
    user_dia_bp_base = np.random.normal(effective_dia_bp_base, INITIAL_BP_DIASTOLIC_BASE[1])
    user_hr_base = np.random.normal(effective_hr_base, INITIAL_HEART_RATE_BASE[1])
    # Floor of 1000 keeps the per-user step baseline positive.
    user_activity_base = max(1000, np.random.normal(effective_activity_base, INITIAL_ACTIVITY_LEVEL_BASE_STD * activity_age_factor)) # Scale std dev too
    user_base_calories = np.random.normal(base_cal_mean, INITIAL_BASE_CALORIES_STD * bmr_age_factor) # Scale std dev
    user_sleep_dur_base = np.random.normal(effective_sleep_dur_base, INITIAL_SLEEP_DURATION_BASE[1])
    user_sleep_qual_base = np.random.normal(effective_sleep_qual_base, INITIAL_SLEEP_QUALITY_BASE[1])
    # These might not need age factors directly
    user_spo2_base = np.random.normal(INITIAL_SPO2_BASE[0], INITIAL_SPO2_BASE[1])
    user_temp_f_base = np.random.normal(INITIAL_BODY_TEMP_F_BASE[0], INITIAL_BODY_TEMP_F_BASE[1])
    user_ecg_base = np.random.normal(effective_ecg_base, INITIAL_ECG_METRIC_BASE[1])


    # --- Store all data ---
    # H/W/BMI entries stay None for users in the 'None' category.
    user_static_data[user_id] = {
        'birth_year': birth_year, 'gender': gender,
        'height_inches': height_in, 'initial_weight_kg': initial_weight_kg,
        'static_bmi': static_bmi, 'update_type': update_type,
        'annual_weight_change_kg': annual_weight_change_kg,
        'initial_age_group': age_group,
        # Store the generated individual bases
        'user_sys_bp_base': user_sys_bp_base, 'user_dia_bp_base': user_dia_bp_base,
        'user_hr_base': user_hr_base, 'user_spo2_base': user_spo2_base,
        'user_temp_f_base': user_temp_f_base, 'user_ecg_base': user_ecg_base,
        'user_activity_base': user_activity_base, 'user_base_calories': user_base_calories,
        'user_sleep_dur_base': user_sleep_dur_base, 'user_sleep_qual_base': user_sleep_qual_base,
    }


print("\nStarting data generation and writing to CSV...")

# Bind the row counter before entering the try block below. Its finally clause
# reports total_rows_written, so the name must exist even when open() raises
# before the counter is initialised inside the with-block; otherwise the finally
# clause itself fails with NameError and hides the original error.
total_rows_written = 0

# Open CSV file and write header
# Stream the dataset to OUTPUT_FILE_CSV one row at a time: for every user, one row
# per calendar day from START_DATE for ROWS_PER_USER days. Rows are written as they
# are generated, so the whole dataset is not held in memory.
# except: reports any Exception with a traceback (KeyboardInterrupt is not an
# Exception subclass, so it is not caught here). finally: always prints the summary
# and the size of the output file.
try:
    with open(OUTPUT_FILE_CSV, 'w', newline='', encoding='utf-8') as csvfile:
        # --- Update fieldnames ---
        # Column order here must match the order of the `row` list built below.
        fieldnames = [
            'user_id', 'age', 'gender', 'datestamp', 'height_inches', 'weight_kg', 'bmi', # Added H/W/BMI
            'bp', 'heart_rate', 'spo2', 'ecg_avg', 'body_temp_f', 'step_count',
            'calories_burnt', 'sleep_duration_hr', 'sleep_quality_score'
        ]
        writer = csv.writer(csvfile)
        writer.writerow(fieldnames)

        total_rows_written = 0

        # Loop through users and generate time series data row by row
        for user_id, static_info in tqdm(user_static_data.items(), desc="Writing User Data", total=NUM_USERS):
            birth_year = static_info['birth_year']
            gender = static_info['gender']
            update_type = static_info['update_type']

            # Retrieve H/W/BMI info
            height_in = static_info['height_inches']
            initial_weight = static_info['initial_weight_kg']
            static_bmi_val = static_info['static_bmi']
            annual_change = static_info['annual_weight_change_kg']
            # 0 marks "no height"; every BMI computation below is guarded by height_m > 0.
            height_m = height_in * 0.0254 if height_in is not None else 0

            # Initialize current weight for annual updates
            current_weight_kg = initial_weight if initial_weight is not None else None

            # Retrieve individual bases for fluctuating metrics
            sys_bp_base = static_info['user_sys_bp_base']; dia_bp_base = static_info['user_dia_bp_base']
            hr_base = static_info['user_hr_base']; spo2_base = static_info['user_spo2_base']
            temp_f_base = static_info['user_temp_f_base']; ecg_base = static_info['user_ecg_base']
            activity_base = static_info['user_activity_base']; base_calories = static_info['user_base_calories']
            sleep_dur_base = static_info['user_sleep_dur_base']; sleep_qual_base = static_info['user_sleep_qual_base']

            # Minimal daily drift for HR/Temp base
            # One drift rate per user; it is added to the running base after every row.
            hr_drift_rate = np.random.normal(0, 0.0005) # Adjust drift sensitivity if needed
            temp_f_drift_rate = np.random.normal(0, 0.0001)
            current_hr_base = hr_base
            current_temp_f_base = temp_f_base

            current_date = START_DATE

            for i in range(ROWS_PER_USER):
                # Age is the calendar-year difference, so it steps up on each 1 January.
                current_age = current_date.year - birth_year

                # --- Handle H/W/BMI based on update_type ---
                # Empty strings become empty CSV cells (users with no H/W/BMI data).
                output_height = ""
                output_weight = ""
                output_bmi = ""
                temp_height_for_outlier = height_in
                temp_weight_for_outlier = None
                temp_bmi_for_outlier = None

                if update_type == 'Static':
                    output_height = f"{height_in:.1f}" if height_in is not None else ""
                    output_weight = f"{initial_weight:.1f}" if initial_weight is not None else ""
                    output_bmi = f"{static_bmi_val:.1f}" if static_bmi_val is not None else ""
                    temp_weight_for_outlier = initial_weight
                    temp_bmi_for_outlier = static_bmi_val
                elif update_type == 'Annual':
                    # Apply the yearly change on every 1 January except the very first row.
                    if i > 0 and current_date.day == 1 and current_date.month == 1 and current_weight_kg is not None:
                        current_weight_kg += annual_change
                        current_weight_kg = np.clip(current_weight_kg, WEIGHT_CLAMP_KG[0], WEIGHT_CLAMP_KG[1])
                    # BMI is recomputed from the current weight on every row.
                    current_bmi = None
                    if current_weight_kg is not None and height_m > 0:
                        current_bmi = current_weight_kg / (height_m ** 2)
                        current_bmi = np.clip(current_bmi, BMI_CLAMP[0], BMI_CLAMP[1])
                    output_height = f"{height_in:.1f}" if height_in is not None else ""
                    output_weight = f"{current_weight_kg:.1f}" if current_weight_kg is not None else ""
                    output_bmi = f"{current_bmi:.1f}" if current_bmi is not None else ""
                    temp_weight_for_outlier = current_weight_kg
                    temp_bmi_for_outlier = current_bmi
                # Else ('None'), outputs remain ""

                # --- Generate Daily Values for OTHER fluctuating metrics ---
                # Generate based on the individual's base value + noise
                # Apply *current* age factor adjustments where logical (e.g., BP might still have a daily age effect)
                # These two terms are additive offsets (not multipliers) on top of the per-user base.
                bp_age_factor_sys = max(0, (current_age - 40) * 0.01) # Smaller daily age effect on top of base? Or remove?
                bp_age_factor_dia = max(0, (current_age - 40) * 0.005)
                sys_bp = np.random.normal(sys_bp_base + bp_age_factor_sys, BP_NOISE[0]) # BP base already age-adjusted
                dia_bp = np.random.normal(dia_bp_base + bp_age_factor_dia, BP_NOISE[1])
                sys_bp = np.clip(sys_bp, BP_CLAMP[0], BP_CLAMP[1])
                dia_bp = np.clip(dia_bp, BP_CLAMP[2], BP_CLAMP[3])
                # Keep diastolic below systolic: resample it 10-40 under systolic, floored at the diastolic minimum.
                if dia_bp >= sys_bp: dia_bp = max(BP_CLAMP[2], sys_bp - random.uniform(10, 40))

                hr_val = np.random.normal(current_hr_base, HR_NOISE) # Use drifted base
                hr_val = np.clip(hr_val, HR_CLAMP[0], HR_CLAMP[1])
                spo2_val = np.random.normal(spo2_base, SPO2_NOISE) # Use static base
                spo2_val = np.clip(spo2_val, SPO2_CLAMP[0], SPO2_CLAMP[1])
                # ECG mean follows half of the heart-rate base's accumulated drift.
                ecg_val = np.random.normal(ecg_base + (current_hr_base - hr_base)*0.5, ECG_NOISE) # Link to HR deviation?
                ecg_val = np.clip(ecg_val, ECG_CLAMP[0], ECG_CLAMP[1])
                temp_val = np.random.normal(current_temp_f_base, TEMP_F_NOISE) # Use drifted base
                temp_val = np.clip(temp_val, TEMP_F_CLAMP[0], TEMP_F_CLAMP[1])
                sleep_h_val = np.random.normal(sleep_dur_base, SLEEP_DUR_NOISE) # Use static base
                sleep_h_val = np.clip(sleep_h_val, SLEEP_DUR_CLAMP[0], SLEEP_DUR_CLAMP[1])
                sleep_q_val = np.random.normal(sleep_qual_base, SLEEP_QUAL_NOISE) # Use static base
                # Quality gets a random bonus when duration is strictly between 6.5 and 8.5 h, a penalty otherwise.
                if 6.5 < sleep_h_val < 8.5: sleep_q_val += random.uniform(0, 0.8)
                else: sleep_q_val -= random.uniform(0, 0.8)
                sleep_q_val = np.clip(sleep_q_val, SLEEP_QUAL_CLAMP[0], SLEEP_QUAL_CLAMP[1])
                # Poisson draw around the base scaled by a daily multiplier in [0.6, 1.6].
                steps_val = int(np.clip(np.random.poisson(activity_base * (1 + random.uniform(-0.4, 0.6))), STEPS_CLAMP[0], STEPS_CLAMP[1])) # Use static base
                # Optional: Add tiny daily age factor to base calories if desired
                daily_cal_age_factor = 1.0 # Simplification: base calories already age-adjusted
                calories_val = (base_calories * daily_cal_age_factor) + steps_val * random.uniform(*ACTIVITY_CALORIES_PER_STEP)
                calories_val = np.clip(calories_val, CALORIES_CLAMP[0], CALORIES_CLAMP[1])


                # --- Introduce Outliers ---
                # Vital Outliers
                # Each metric gets its own independent draw; a triggered outlier replaces the
                # value generated above and is not clamped again afterwards.
                if random.random() < OUTLIER_PROBABILITY_VITALS: calories_val = random.uniform(*OUTLIER_LOW_CALORIES_RANGE) if random.random() < 0.5 else random.uniform(*OUTLIER_HIGH_CALORIES_RANGE)
                if random.random() < OUTLIER_PROBABILITY_VITALS: hr_val = random.uniform(*OUTLIER_HR_RANGE)
                if random.random() < OUTLIER_PROBABILITY_VITALS: spo2_val = random.uniform(*OUTLIER_LOW_SPO2_RANGE)
                if random.random() < OUTLIER_PROBABILITY_VITALS: steps_val = int(random.uniform(*OUTLIER_HIGH_STEPS_RANGE))
                # Temperature outliers are high 80% of the time, low otherwise.
                if random.random() < OUTLIER_PROBABILITY_VITALS: temp_val = random.uniform(*OUTLIER_HIGH_TEMP_F_RANGE) if random.random() < 0.8 else random.uniform(*OUTLIER_LOW_TEMP_F_RANGE)
                if random.random() < OUTLIER_PROBABILITY_VITALS:
                    # 70% high-BP outlier, 30% low-BP outlier.
                    if random.random() < 0.7: sys_bp, dia_bp = random.uniform(OUTLIER_WIDE_BP_RANGE[0], OUTLIER_WIDE_BP_RANGE[1]), random.uniform(OUTLIER_WIDE_BP_RANGE[2], OUTLIER_WIDE_BP_RANGE[3])
                    else: sys_bp, dia_bp = random.uniform(OUTLIER_LOW_BP_RANGE[0], OUTLIER_LOW_BP_RANGE[1]), random.uniform(OUTLIER_LOW_BP_RANGE[2], OUTLIER_LOW_BP_RANGE[3])
                    if dia_bp >= sys_bp: dia_bp = max(OUTLIER_LOW_BP_RANGE[2], sys_bp - random.uniform(10, 20))

                # H/W/BMI Outliers (only if user has data)
                # An outlier affects this row's output only; the user's stored height and
                # current weight are left unchanged. BMI is recomputed from the outlier
                # value and still clipped to BMI_CLAMP.
                if update_type != 'None' and random.random() < HW_BMI_OUTLIER_PROBABILITY:
                    outlier_choice = random.choice(['height', 'weight'])
                    if outlier_choice == 'height' and temp_height_for_outlier is not None:
                        temp_height_for_outlier = random.uniform(*OUTLIER_LOW_HEIGHT_IN) if random.random() < 0.5 else random.uniform(*OUTLIER_HIGH_HEIGHT_IN)
                        output_height = f"{temp_height_for_outlier:.1f}"
                        if temp_weight_for_outlier is not None:
                            new_height_m = temp_height_for_outlier * 0.0254
                            if new_height_m > 0:
                                 temp_bmi_for_outlier = temp_weight_for_outlier / (new_height_m ** 2)
                                 temp_bmi_for_outlier = np.clip(temp_bmi_for_outlier, BMI_CLAMP[0], BMI_CLAMP[1])
                                 output_bmi = f"{temp_bmi_for_outlier:.1f}" if temp_bmi_for_outlier is not None else ""
                    elif outlier_choice == 'weight' and temp_weight_for_outlier is not None:
                        temp_weight_for_outlier = random.uniform(*OUTLIER_LOW_WEIGHT_KG) if random.random() < 0.5 else random.uniform(*OUTLIER_HIGH_WEIGHT_KG)
                        output_weight = f"{temp_weight_for_outlier:.1f}"
                        if height_m > 0:
                             temp_bmi_for_outlier = temp_weight_for_outlier / (height_m ** 2)
                             temp_bmi_for_outlier = np.clip(temp_bmi_for_outlier, BMI_CLAMP[0], BMI_CLAMP[1])
                             output_bmi = f"{temp_bmi_for_outlier:.1f}" if temp_bmi_for_outlier is not None else ""


                # Format final values
                # Blood pressure is written as one "systolic/diastolic" string of rounded integers.
                bp_val = f"{int(round(sys_bp))}/{int(round(dia_bp))}"

                # --- Format data for CSV row ---
                row = [
                    user_id, current_age, gender, current_date.strftime('%Y-%m-%d'),
                    output_height, # H/W/BMI now included
                    output_weight,
                    output_bmi,
                    bp_val, f"{hr_val:.1f}", f"{spo2_val:.1f}", f"{ecg_val:.1f}",
                    f"{temp_val:.1f}", steps_val, f"{calories_val:.0f}",
                    f"{sleep_h_val:.1f}", f"{sleep_q_val:.1f}"
                ]
                writer.writerow(row)
                total_rows_written += 1

                # Increment date
                current_date += timedelta(days=1)

                # Apply slow drift to HR/Temp base
                current_hr_base += hr_drift_rate
                current_temp_f_base += temp_f_drift_rate
                # The drifted bases are kept inside slightly narrowed versions of the value clamps.
                current_hr_base = np.clip(current_hr_base, HR_CLAMP[0]*0.95, HR_CLAMP[1]*0.9) # Clamp drifted bases
                current_temp_f_base = np.clip(current_temp_f_base, TEMP_F_CLAMP[0]*1.01, TEMP_F_CLAMP[1]*0.99)

except Exception as e:
    print(f"\n--- ERROR during generation or writing: {e} ---")
    # Print traceback for more details
    import traceback
    traceback.print_exc()
    print(f"Data written up to this point is in {OUTPUT_FILE_CSV}")

finally:
    # Runs on success, on a caught error, and on interruption alike.
    end_time = time.time()
    duration = end_time - start_time
    print(f"\n--- Finished Data Generation ---")
    # Reads total_rows_written, which this statement first assigns inside the with-block;
    # the name must already be bound for this line to work when open() itself fails.
    print(f"Total rows written: {total_rows_written:,}")
    print(f"Output file: {OUTPUT_FILE_CSV}")
    print(f"Total time: {duration:.2f} seconds ({duration/60:.2f} minutes)")

    # Final file size check
    try:
        file_size_bytes = os.path.getsize(OUTPUT_FILE_CSV)
        file_size_gb = file_size_bytes / (1024**3)
        print(f"Final file size: {file_size_gb:.2f} GB")
        # More than 0.75 GB away from 10 GB: suggest a proportionally rescaled NUM_USERS.
        if abs(file_size_gb - 10) > 0.75:
             print(f"\nNote: File size ({file_size_gb:.2f}GB) deviates slightly from the 10GB target.")
             if file_size_gb > 0:
                 suggested_num_users = int(NUM_USERS * (10 / file_size_gb))
                 print(f"Suggested new NUM_USERS ≈ {suggested_num_users}")
             else:
                 print("Could not suggest NUM_USERS adjustment as file size is zero.")
    except FileNotFoundError:
        print(f"Error: Output file {OUTPUT_FILE_CSV} not found.")
    except Exception as e:
        print(f"Error checking file size: {e}")

--- Generating Synthetic Health Data (Target: ~10GB) ---
Number of Users (Corrected): 129000
Rows per User: 1096 (2022-01-01 to 2024-12-31)
Estimated Total Rows: 141,384,000
Output File (CSV): synthetic_health_data_10gb_2022-2024.csv
Recalculation based on sample size: 30 users (32,880 rows) = 2.39 MB.
This will take a significant amount of time...

Generating initial static user info...


User Info Setup: 100%|██████████| 129000/129000 [00:02<00:00, 55226.96it/s]



Starting data generation and writing to CSV...


Writing User Data:  34%|███▍      | 43737/129000 [1:20:37<2:37:09,  9.04it/s]


--- Finished Data Generation ---
Total rows written: 47,936,349
Output file: synthetic_health_data_10gb_2022-2024.csv
Total time: 4839.59 seconds (80.66 minutes)
Final file size: 3.49 GB

Note: File size (3.49GB) deviates slightly from the 10GB target.
Consider adjusting NUM_USERS (currently 129000) proportionally for future runs if needed.
Suggested new NUM_USERS ≈ 369775





KeyboardInterrupt: 

In [None]:
import io

import pandas as pd
import numpy as np
from google.colab import files

# Fallback path, used only when the upload dialog returns nothing
# (for example when the file is already present in the runtime).
DEFAULT_CSV_NAME = 'synthetic_health_data_2022-2024_varied_hw_bmi.csv'

# files.upload() opens the Colab upload widget and returns a dict that maps each
# uploaded file name to its contents as bytes.
uploaded = files.upload()

# After upload, read the CSV from the uploaded bytes rather than from a hard-coded
# file name: when a file with the same name already exists, Colab stores the new
# upload under a "(1)"-suffixed name, so reading the original name would silently
# load the older copy instead of the file that was just uploaded.
if uploaded:
    uploaded_name, uploaded_bytes = next(iter(uploaded.items()))
    df = pd.read_csv(io.BytesIO(uploaded_bytes))
else:
    df = pd.read_csv(DEFAULT_CSV_NAME)



Saving synthetic_health_data_2022-2024_varied_hw_bmi.csv to synthetic_health_data_2022-2024_varied_hw_bmi (1).csv


In [None]:
# Missing-value count per column (isna is the same check as isnull).
df.isna().sum()

Unnamed: 0,0
user_id,0
age,0
gender,0
datestamp,0
height_inches,4384
weight_kg,4384
bmi,4384
bp,0
heart_rate,0
spo2,0


In [None]:
# Keep only rows in which height, weight and BMI are all present.
# NOTE(review): the cells that follow in this notebook keep working on `df`, not on
# `df_cleaned` — confirm which frame the later analysis is meant to use.
df_cleaned = df.dropna(
    axis=0,
    how='any',
    subset=['height_inches', 'weight_kg', 'bmi'],
)


In [None]:
# Number of fully duplicated rows (every occurrence after the first counts).
df.duplicated(keep='first').sum()


np.int64(0)

In [None]:
# Quick overview of data types and non-null counts
# (useful here to see which columns contain missing values and which are still
# stored as generic `object` columns, such as `bp` and `datestamp`).
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32880 entries, 0 to 32879
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   user_id              32880 non-null  object 
 1   age                  32880 non-null  int64  
 2   gender               32880 non-null  object 
 3   datestamp            32880 non-null  object 
 4   height_inches        28496 non-null  float64
 5   weight_kg            28496 non-null  float64
 6   bmi                  28496 non-null  float64
 7   bp                   32880 non-null  object 
 8   heart_rate           32880 non-null  float64
 9   spo2                 32880 non-null  float64
 10  ecg_avg              32880 non-null  float64
 11  body_temp_f          32880 non-null  float64
 12  step_count           32880 non-null  int64  
 13  calories_burnt       32880 non-null  int64  
 14  sleep_duration_hr    32880 non-null  float64
 15  sleep_quality_score  32880 non-null 

**Feature Engineering** for Health Trend Analysis
Your project tasks said you need to find tre

In [None]:
# Split the "systolic/diastolic" string into two numeric (float) columns in one step.
df[['bp_systolic', 'bp_diastolic']] = (
    df['bp'].str.split('/', expand=True).astype(float)
)


Create health risk flags (labels)

In [None]:
# Flag for hypertension: 1 when systolic is above 140 or diastolic is above 90.
# NOTE(review): the comparison is strict, so readings of exactly 140 or 90 are not
# flagged — confirm whether the intended cutoff is inclusive.
is_high_systolic = df['bp_systolic'].gt(140)
is_high_diastolic = df['bp_diastolic'].gt(90)
df['hypertension_flag'] = (is_high_systolic | is_high_diastolic).astype(int)


In [None]:
# Flag for obesity risk: 1 when BMI is 30 or more.
# NOTE(review): rows with a missing BMI compare as False and therefore get 0, the
# same value as a known non-obese reading — confirm that this is intended.
df['obesity_flag'] = df['bmi'].ge(30).astype(int)

# Flag for low oxygen: 1 when SpO2 is below 95.
df['low_spo2_flag'] = df['spo2'].lt(95).astype(int)


In [None]:
# Parse the date strings once, then derive the calendar features from the parsed column.
df['datestamp'] = pd.to_datetime(df['datestamp'])

# Create new features (weekday: Monday = 0)
for calendar_part in ('year', 'month', 'day', 'weekday'):
    df[calendar_part] = getattr(df['datestamp'].dt, calendar_part)

# Saturday (5) and Sunday (6) count as weekend.
df['is_weekend'] = df['weekday'].isin([5, 6]).astype(int)


In [None]:
# Calories burnt per 1000 steps; the +1 on the step count avoids dividing by zero
# on days with no recorded steps.
steps_plus_one = df['step_count'].add(1)
df['calories_per_1000_steps'] = df['calories_burnt'].div(steps_plus_one).mul(1000)


In [None]:
# Flag readings above 100.4 °F as fever (1) versus no fever (0).
df['fever_flag'] = df['body_temp_f'].gt(100.4).astype(int)


Features which are important:

**Feature	How/Why?**


1.  hypertension_flag -	Label to easily group risky users
2.  obesity_flag	- Detect sedentary/obesity risk
3.  low_spo2_flag	- Early respiratory issue detection
4.  year, month, weekday, is_weekend	- Time trend analysis
5.  calories_per_1000_steps	- Sedentary lifestyle marker
6.  fever_flag	- Acute illness detection
7.  bp_systolic, bp_diastolic - Needed to detect hypertension





Step 1: Dataset Ingestion & Parquet Conversion

In [None]:
import pandas as pd

# Load the generated sample CSV.
df = pd.read_csv("synthetic_health_data_2022-2024_varied_hw_bmi.csv")

# Fix height outlier (49 -> 60)
# NOTE(review): this matches only the exact value 49 — presumably the one outlier
# seen in this particular sample; confirm it still applies if the data is regenerated.
df.loc[df['height_inches'].eq(49), 'height_inches'] = 60

# Convert datestamp to datetime
df['datestamp'] = pd.to_datetime(df['datestamp'])

# Split the "systolic/diastolic" string into two integer columns.
df[['systolic_bp', 'diastolic_bp']] = (
    df['bp']
    .str.split('/', expand=True)
    .astype(int)
)

# Save cleaned CSV without the row index.
df.to_csv(path_or_buf="cleaned_health_data.csv", index=False)

1.2 Convert to Parquet (Optimized for Spark)


In [None]:
# Write the cleaned frame as a Parquet file for the Spark steps that follow.
df.to_parquet(path="cleaned_health_data.parquet")

Step 2: Transfer Data to Hadoop (HDFS)