In [1]:
import sys
# Print the path of the Python interpreter that is running this kernel.
print(sys.executable)

/opt/anaconda3/bin/python


In [3]:
# Run pip through the interpreter path held in sys.executable, so the package
# is installed for the same Python that was printed above.
# NOTE(review): no cell visible in this excerpt imports faker, and the generator
# cell states its only dependencies are numpy and pandas - confirm this install
# is still needed.
!{sys.executable} -m pip install faker



In [7]:
# Paste entire block into a Jupyter Notebook cell and run (Shift+Enter).
# Dependencies: only numpy and pandas.
import numpy as np
import pandas as pd
import math
from datetime import timedelta, date
import getpass

# ---------------------------
# Configurable inputs (ask user)
# ---------------------------
print("Synthetic Wellness Data Generator")

# Every prompt falls back to its stated default when the reply is empty.
# Numeric replies go straight through int(), so a non-numeric reply raises
# ValueError before the next prompt is shown.
n_profiles = int(
    input("Enter number of profiles (e.g. 10, 100, 10000): ") or "10"
)
days_per_profile = int(
    input("Enter days of history per profile (e.g. 14): ") or "14"
)
# Only a reply that is exactly "y" (case-insensitive, surrounding blanks ignored) enables saving.
save_csv = input("Save CSV? (y/N): ").strip().lower() == 'y'
# The filename is asked for regardless of the answer to the save prompt.
csv_name = (
    input("CSV filename (default synthetic_wellness.csv): ").strip()
    or "synthetic_wellness.csv"
)
seed = int(input("Random seed (default 42): ") or "42")

# Seed NumPy's legacy global random state.
np.random.seed(seed)

# ---------------------------
# Utility helpers
# ---------------------------
def clip01(x):
    """Clamp a scalar to the closed interval [0.0, 1.0]; NaN is returned unchanged.

    The bare ``max(0.0, min(1.0, x))`` form maps NaN to 1.0: every ordering
    comparison against NaN is False, so ``min(1.0, nan)`` keeps its first
    argument. That would silently turn a missing value into a maximal score
    or penalty, so NaN is passed through instead.
    """
    # NaN is the only value that compares unequal to itself.
    if x != x:
        return x
    return max(0.0, min(1.0, x))

def lnorm(x, denom=10000.0):
    """Log-scaled ratio log1p(x) / log1p(denom), giving diminishing returns as x grows.

    The result is 0.0 at x == 0 and 1.0 at x == denom; x above denom yields a
    value above 1. Whenever ``denom > 0`` does not hold, 0.0 is returned
    without evaluating x at all.
    """
    if not denom > 0:
        return 0.0
    numerator = math.log1p(x)
    scale = math.log1p(denom)
    return numerator / scale

def linear_scale(x, lo, hi):
    """Map x linearly so that lo -> 0 and hi -> 1, clamping the result with clip01.

    A NaN x yields NaN. A degenerate range (lo equal to hi) yields 0.0.
    """
    if np.isnan(x):
        return np.nan
    if lo == hi:
        return 0.0
    fraction = (x - lo) / (hi - lo)
    return clip01(fraction)

# ---------------------------
# Profile generation (generic distribution)
# ---------------------------
def create_profiles(n_profiles, seed=42):
    """Draw ``n_profiles`` synthetic user profiles and return them as a DataFrame.

    All randomness comes from a generator seeded with ``seed``, so the same
    arguments always give the same profiles. Columns, in order: user_id, age,
    sex, height_cm, weight_kg, bmi, activity_level, smoker, vaper,
    chronic_factor, medicine.
    """
    rng = np.random.default_rng(seed)

    def _one_profile(index):
        # The order of the draws below fixes the random stream; do not reorder.
        sex = rng.choice(['male', 'female'])
        age = int(rng.integers(18, 70))  # upper bound is exclusive

        # (mean, sd) for height in cm and weight in kg, selected by sex
        if sex == 'male':
            (h_mu, h_sd), (w_mu, w_sd) = (175, 7), (80, 12)
        else:
            (h_mu, h_sd), (w_mu, w_sd) = (162, 7), (68, 11)
        raw_height = float(np.round(rng.normal(h_mu, h_sd), 1))
        raw_weight = float(np.round(rng.normal(w_mu, w_sd), 1))
        height_cm = float(np.clip(raw_height, 140, 210))
        weight_kg = float(np.clip(raw_weight, 40, 160))
        bmi = round(weight_kg / ((height_cm/100.0)**2), 2)

        # 1 sedentary, 2 moderate, 3 active
        activity_level = int(rng.choice([1, 2, 3], p=[0.3, 0.5, 0.2]))
        # independent flags: roughly 18% smokers and 5% vapers
        smoker = bool(rng.random() < 0.18)
        vaper = bool(rng.random() < 0.05)
        # multiplier applied to baseline stress, kept within [0.6, 1.5]
        raw_chronic = float(np.round(rng.normal(1.0, 0.12), 3))
        chronic_factor = float(np.clip(raw_chronic, 0.6, 1.5))
        # medicine code: the three 0 entries together carry probability 0.9
        med = int(rng.choice([0, 0, 0, 1, 2], p=[0.8, 0.05, 0.05, 0.05, 0.05]))

        return {
            'user_id': f'user_{index}',
            'age': age,
            'sex': sex,
            'height_cm': height_cm,
            'weight_kg': weight_kg,
            'bmi': bmi,
            'activity_level': activity_level,
            'smoker': smoker,
            'vaper': vaper,
            'chronic_factor': chronic_factor,
            'medicine': med,
        }

    return pd.DataFrame([_one_profile(i) for i in range(n_profiles)])

# ---------------------------
# Baseline modifiers (profile-level)
# ---------------------------
def baseline_modifiers(profile):
    """
    Derive a profile's resting stress and mood levels from its fixed attributes.

    Reads 'age', 'bmi', 'activity_level' and 'chronic_factor' from `profile`.
    Stress (higher is worse) rises with age and with BMI above 22, falls with
    activity level, and is finally multiplied by the chronic factor.
    Mood (higher is better) moves the opposite way and is clipped to [1, 9].
    Returns (stress_base, mood_base) as floats.
    """
    age, bmi, act, chronic = (
        profile[key] for key in ('age', 'bmi', 'activity_level', 'chronic_factor')
    )
    # BMI above 22, in steps of 5 points; never negative
    excess_bmi = max(0.0, (bmi - 22)/5.0)
    # activity relative to the "moderate" level (2)
    activity_offset = act - 2

    stress = 3.2 + 0.02*(age - 30)/10.0 + 0.5 * excess_bmi
    stress = (stress - 0.5 * activity_offset) * chronic

    mood = 6.0 + 0.15*activity_offset - 0.03*(age - 30)/10.0 - 0.12 * excess_bmi
    mood = float(np.clip(mood, 1.0, 9.0))

    return float(stress), mood

# ---------------------------
# Daily effects math (science-informed)
# ---------------------------
def daily_effects(day, profile, features):
    """
    Turn one day's measurements into additive stress and mood adjustments.

    `features` is a dict; each key below is optional and falls back to the
    value shown when absent:
      steps 0, exercise_min 0, exercise_intensity 0, sleep_hours 7,
      water_liters 2, calories 2200, caffeine_mg 0, sunlight_min 60, aqi 100,
      noise_db 40, lighting_lux 300, time_outdoors_min 30,
      time_in_office_hr 8, sedentary_hours 8, screen_time_hr 6,
      social_min 30, weather_temp_C 20
    `profile` supplies the 'smoker' and 'vaper' flags (False when absent).
    `day` is accepted but does not enter the calculation.

    Returns (stress_delta, mood_delta, debug): a positive stress_delta means
    more stress, a positive mood_delta means better mood, and debug is a dict
    holding some of the intermediate scores.
    """
    fallbacks = {
        'steps': 0.0, 'exercise_min': 0.0, 'exercise_intensity': 0.0,
        'sleep_hours': 7.0, 'water_liters': 2.0, 'calories': 2200,
        'caffeine_mg': 0.0, 'sunlight_min': 60.0, 'aqi': 100.0,
        'noise_db': 40.0, 'lighting_lux': 300.0, 'time_outdoors_min': 30.0,
        'time_in_office_hr': 8.0, 'sedentary_hours': 8.0, 'screen_time_hr': 6.0,
        'social_min': 30.0, 'weather_temp_C': 20.0,
    }
    # Every listed measurement is coerced to float, including the two that feed
    # no score below (time_outdoors_min, screen_time_hr).
    obs = {name: float(features.get(name, default)) for name, default in fallbacks.items()}
    is_smoker = bool(profile.get('smoker', False))
    is_vaper = bool(profile.get('vaper', False))

    # Benefit scores and penalties: log1p for diminishing returns, clipped linear maps otherwise.
    steps_score = lnorm(obs['steps'], denom=10000.0)
    exercise_score = lnorm(obs['exercise_min'] * (0.5 + obs['exercise_intensity']), denom=120.0)
    sleep_score = clip01((obs['sleep_hours'] - 4.0) / 6.0)        # 4h -> 0, 10h -> 1
    water_score = clip01((obs['water_liters'] - 0.5) / 3.5)      # 0.5L -> 0, 4.0L -> 1
    sunlight_score = lnorm(obs['sunlight_min'], denom=180.0)
    social_score = clip01(obs['social_min'] / 120.0)
    sedentary_pen = clip01((obs['sedentary_hours'] - 4.0) / 12.0)
    aqi_pen = clip01(max(0.0, (obs['aqi'] - 50.0)) / 200.0)      # only AQI above 50 is penalized
    noise_pen = clip01((obs['noise_db'] - 30.0) / 70.0)
    caffeine_effect = clip01(obs['caffeine_mg'] / 400.0)
    office_pen = clip01((obs['time_in_office_hr'] - 6.0) / 12.0)

    # Cold (< 5) and hot (> 30) days carry a fixed penalty.
    temperature = obs['weather_temp_C']
    if temperature < 5:
        temp_pen = 0.2
    elif temperature > 30:
        temp_pen = 0.15
    else:
        temp_pen = 0.0

    # Smoking takes precedence over vaping when both flags are set.
    if is_smoker:
        smoke_pen = 0.12
    elif is_vaper:
        smoke_pen = 0.06
    else:
        smoke_pen = 0.0

    # (weight, value) pairs, accumulated in this order; positive weight raises stress.
    stress_terms = (
        (-1.3, steps_score), (-1.0, exercise_score), (-1.6, sleep_score),
        (-1.0, water_score), (-0.9, social_score), (1.5, aqi_pen),
        (1.0, noise_pen), (0.9, caffeine_effect), (0.8, sedentary_pen),
        (0.6, office_pen), (1.0, smoke_pen), (0.4, temp_pen),
    )
    stress_delta = 0.0
    for weight, value in stress_terms:
        stress_delta += weight * value
    # very low or very high calorie intake adds stress
    calories = obs['calories']
    if calories < 1400:
        stress_delta += 0.5
    elif calories > 3500:
        stress_delta += 0.4

    # (weight, value) pairs, accumulated in this order; positive weight lifts mood.
    mood_terms = (
        (1.2, steps_score), (1.6, exercise_score), (1.8, sleep_score),
        (1.0, sunlight_score), (0.9, water_score), (1.0, social_score),
        (-1.3, aqi_pen), (-1.0, noise_pen), (-0.7, caffeine_effect),
        (-1.1, sedentary_pen), (-0.8, office_pen), (-1.0, smoke_pen),
        (-0.3, temp_pen),
    )
    mood_delta = 0.0
    for weight, value in mood_terms:
        mood_delta += weight * value
    # moderate calorie intake helps mood, anything else hurts it
    if 1800 <= calories <= 3000:
        mood_delta += 0.5
    else:
        mood_delta -= 0.4
    # dim lighting lowers mood
    if obs['lighting_lux'] < 150:
        mood_delta -= 0.25

    debug = {
        'steps_score': steps_score, 'exercise_score': exercise_score, 'sleep_score': sleep_score,
        'sunlight_score': sunlight_score, 'aqi_pen': aqi_pen, 'noise_pen': noise_pen,
        'water_score': water_score, 'sedentary_pen': sedentary_pen, 'smoke_pen': smoke_pen
    }

    return float(stress_delta), float(mood_delta), debug

# ---------------------------
# Discrete-time simulator for one profile
# ---------------------------
def simulate_profile(profile_row, days=14, rng=None, missing_rate=0.03):
    """
    Simulate `days` consecutive days for one profile and return one dict per day.

    Parameters:
      profile_row  - mapping describing the profile; this function reads
                     'user_id', 'age', 'sex', 'bmi', 'activity_level', 'smoker',
                     'vaper' and 'medicine', and passes the whole mapping on to
                     baseline_modifiers() and daily_effects().
      days         - number of days to generate; the dates run from
                     (today - days) up to yesterday.
      rng          - numpy.random.Generator used for every draw; a fresh,
                     unseeded generator is created when None.
      missing_rate - per-day probability that a logged feature in the returned
                     row is replaced by NaN to mimic a missing log entry.

    Returns a list of row dicts holding the sampled features, derived vitals,
    stress, mood, confidence and a copy of the profile attributes.

    Stress and mood are always computed from the complete feature set; NaN
    masking is applied to the stored row afterwards.
    """
    if rng is None: rng = np.random.default_rng()

    # Baselines depend only on the profile, so compute them once.
    stress_base, mood_base = baseline_modifiers(profile_row)
    prev_stress = stress_base
    prev_mood = mood_base

    # Relaxation rates of the Euler update and the stress -> mood coupling factor.
    lambda_s = 0.35
    lambda_m = 0.28
    stress_influence = 0.6

    act = profile_row['activity_level']
    sedentary_profile = profile_row['activity_level'] == 1
    important_feats = ['steps','exercise_min','sleep_hours','water_liters','sunlight_min','aqi','sedentary_hours']

    rows = []
    # Read the clock once so every row is dated relative to the same "today",
    # even if the run crosses midnight.
    start_date = date.today() - timedelta(days=days)

    for d in range(days):
        day_date = start_date + timedelta(days=d)

        # Sample daily features conditioned on profile.
        # NOTE: the order of the rng calls below fixes the random stream for a
        # given seed; keep it unchanged to stay reproducible.
        # Steps: mean depends on activity and sedentary profile
        mean_steps = 3500 + (act-1)*2500
        if sedentary_profile: mean_steps *= 0.6
        steps = float(max(0.0, rng.normal(mean_steps, mean_steps*0.35)))
        distance_km = float(np.clip(steps/1300.0 + rng.normal(0,0.5), 0.0, 40.0))
        exercise_min = float(max(0.0, rng.normal(30 if act>=2 else 10, 20)))
        exercise_intensity = float(clip01(rng.normal(0.5 if act>=2 else 0.25, 0.2)))
        sleep_hours = float(np.clip(rng.normal(7.0 - 0.02*(profile_row['age']-30), 1.1), 3.0, 10.0))
        water_l = float(np.clip(rng.normal(2.0, 0.6), 0.2, 5.0))
        calories = float(np.clip(rng.normal(2200, 450), 1000, 5000))
        # 35% of days have no caffeine at all
        caffeine_mg = float(0 if rng.random() < 0.35 else abs(rng.normal(95, 40)))
        sunlight_min = float(np.clip(rng.normal(60, 50), 0, 300))
        aqi = float(np.clip(rng.normal(100, 40), 5, 350))
        noise_db = float(np.clip(rng.normal(45, 12), 20, 100))
        lighting_lux = float(np.clip(rng.normal(350, 180), 5, 2000))
        time_outdoors = float(np.clip(sunlight_min * rng.uniform(0.3, 1.0), 0, 600))
        time_in_office = float(np.clip(rng.normal(8 if act>=2 else 4, 3), 0, 16))
        sedentary_hours = float(np.clip(rng.normal(9 if sedentary_profile else 7, 2.0), 0, 16))
        screen_time_hr = float(np.clip(rng.normal(6, 2.5), 0, 18))
        social_min = float(np.clip(rng.normal(40, 50), 0, 240))
        weather_temp = float(np.clip(rng.normal(20, 8), -10, 45))

        features = {
            'steps': steps, 'distance_km': distance_km, 'exercise_min': exercise_min,
            'exercise_intensity': exercise_intensity, 'sleep_hours': sleep_hours,
            'water_liters': water_l, 'calories': calories, 'caffeine_mg': caffeine_mg,
            'sunlight_min': sunlight_min, 'aqi': aqi, 'noise_db': noise_db, 'lighting_lux': lighting_lux,
            'time_outdoors_min': time_outdoors, 'time_in_office_hr': time_in_office,
            'sedentary_hours': sedentary_hours, 'screen_time_hr': screen_time_hr,
            'social_min': social_min, 'weather_temp_C': weather_temp
        }

        # Compute deltas (the debug dict returned third is not needed here)
        stress_delta, mood_delta, _ = daily_effects(d, profile_row, features)

        # Discrete-time coupling (Euler update): move a fraction lambda of the way
        # from yesterday's value towards today's target, plus Gaussian noise.
        raw_stress_inst = stress_base + stress_delta
        new_stress = prev_stress + lambda_s * (raw_stress_inst - prev_stress) + rng.normal(0, 0.3)
        new_stress = float(np.clip(new_stress, 1.0, 10.0))

        # Mood influenced by stress (higher stress lowers mood)
        raw_mood_inst = mood_base + mood_delta - stress_influence * (new_stress - 4.0)
        new_mood = prev_mood + lambda_m * (raw_mood_inst - prev_mood) + rng.normal(0, 0.4)
        new_mood = float(np.clip(new_mood, 1.0, 10.0))

        # Derived vitals (approx)
        heart_rate = float(np.clip(60 + 0.002*steps + 0.9*(new_stress-4.0) + (exercise_min/30.0)*4 + rng.normal(0,3), 40, 160))
        bp_sys = float(np.clip(110 + 2.0*(new_stress-4.0) + 0.01*caffeine_mg + rng.normal(0,6), 90, 200))
        bp_dia = float(np.clip(70 + 1.0*(new_stress-4.0) + rng.normal(0,4), 50, 120))
        spo2 = float(np.clip(98 - max(0,(aqi-120))/180 + rng.normal(0,0.3), 80, 100))

        # Confidence: fraction of important features present. It is taken from the
        # freshly sampled `features`, i.e. before the NaN masking below, so the
        # masking does not lower it.
        present = sum(0 if pd.isna(features[k]) else 1 for k in important_feats)
        confidence = present / len(important_feats)

        row = {
            'user_id': profile_row['user_id'],
            'date': day_date,
            **features,
            'heart_rate': round(heart_rate,1), 'bp_sys': round(bp_sys,1), 'bp_dia': round(bp_dia,1), 'spo2': round(spo2,1),
            'stress': round(new_stress,2), 'mood': round(new_mood,2), 'confidence': round(confidence,3),
            'smoker': profile_row['smoker'], 'vaper': profile_row['vaper'], 'age': profile_row['age'], 'sex': profile_row['sex'],
            'bmi': profile_row['bmi'], 'activity_level': profile_row['activity_level'], 'medicine': profile_row['medicine']
        }

        # Random missingness injection, honouring the caller's missing_rate
        if rng.random() < missing_rate:
            # The drop count is drawn from [1, max(2, int(0.05 * n_features))),
            # which is always 1 for the 18 features sampled above.
            possible_drop = list(features.keys())
            kdrop = rng.integers(1, max(2, int(0.05*len(possible_drop))))
            drop_keys = rng.choice(possible_drop, size=kdrop, replace=False)
            for dk in drop_keys:
                row[dk] = np.nan

        rows.append(row)
        prev_stress = new_stress
        prev_mood = new_mood

    return rows

# ---------------------------
# Top-level dataset generation
# ---------------------------
def generate_dataset(n_profiles=10, days_per_profile=14, seed=42, save_csv=False, csv_name="synthetic_wellness.csv"):
    """
    Build the full synthetic dataset: profiles plus a daily history for each one.

    Returns (df, profiles): `df` has one row per profile per day, `profiles`
    is the frame returned by create_profiles(). When `save_csv` is true, `df`
    is also written to `csv_name` without the index and a confirmation line
    is printed.
    """
    # NOTE(review): this generator and the one inside create_profiles() are both
    # seeded with `seed`, so they start from the same random stream - confirm
    # that independent streams are not required.
    rng = np.random.default_rng(seed)
    profiles = create_profiles(n_profiles, seed=seed)

    # One shared generator walks through the profiles in order.
    all_rows = [
        day_row
        for _, profile in profiles.iterrows()
        for day_row in simulate_profile(profile.to_dict(), days=days_per_profile, rng=rng)
    ]
    df = pd.DataFrame(all_rows)

    # Preferred column order; columns not listed keep their position at the end.
    preferred = ['user_id','date','age','sex','bmi','activity_level','smoker','vaper','medicine',
                 'steps','distance_km','exercise_min','exercise_intensity','sleep_hours','water_liters',
                 'calories','caffeine_mg','sunlight_min','time_outdoors_min','time_in_office_hr',
                 'sedentary_hours','screen_time_hr','social_min','aqi','noise_db','lighting_lux',
                 'heart_rate','bp_sys','bp_dia','spo2','stress','mood','confidence']
    leading = [name for name in preferred if name in df.columns]
    trailing = [name for name in df.columns if name not in preferred]
    df = df[leading + trailing]

    if save_csv:
        df.to_csv(csv_name, index=False)
        print(f"Saved CSV: {csv_name} ({len(df)} rows)")
    return df, profiles

# ---------------------------
# Run and show
# ---------------------------
df, profiles = generate_dataset(
    n_profiles=n_profiles,
    days_per_profile=days_per_profile,
    seed=seed,
    save_csv=save_csv,
    csv_name=csv_name,
)

# Plain-text previews of both frames
print("Profiles summary (first 5):")
print(profiles.head().to_string(index=False))
print("\nGenerated data sample (first 8 rows):")
print(df.head(8).to_string(index=False))

# Quick sanity numbers over the whole dataset
print("\nBasic diagnostics:")
print("Total rows:", len(df))
for _label, _column in (("Stress range:", 'stress'), ("Mood range:", 'mood')):
    print(_label, df[_column].min(), "-", df[_column].max())
for _label, _column, _digits in (("Average steps:", 'steps', 1), ("Average sleep hours:", 'sleep_hours', 2)):
    print(_label, round(df[_column].mean(), _digits))

# Echo the CSV filename when a file was written
if save_csv:
    print("\nCSV saved to:", csv_name)

# End of generator cell.

Synthetic Wellness Data Generator


Enter number of profiles (e.g. 10, 100, 10000):  100
Enter days of history per profile (e.g. 14):  100
Save CSV? (y/N):  y
CSV filename (default synthetic_wellness.csv):  
Random seed (default 42):  


Saved CSV: synthetic_wellness.csv (10000 rows)
Profiles summary (first 5):
user_id  age    sex  height_cm  weight_kg   bmi  activity_level  smoker  vaper  chronic_factor  medicine
 user_0   58   male      167.7       89.0 31.65               2    True  False           1.015         0
 user_1   24 female      156.0       77.7 31.93               3   False  False           1.056         0
 user_2   46   male      168.3       90.5 31.95               2   False  False           1.147         0
 user_3   58 female      159.5       73.9 29.05               1    True  False           1.257         2
 user_4   34   male      169.3       87.4 30.49               1    True  False           0.901         0

Generated data sample (first 8 rows):
user_id       date  age  sex   bmi  activity_level  smoker  vaper  medicine       steps  distance_km  exercise_min  exercise_intensity  sleep_hours  water_liters    calories  caffeine_mg  sunlight_min  time_outdoors_min  time_in_office_hr  sedentary_hours 