# Pill Dispenser Synthetic Data Generator (Colab-compatible)
This notebook sets up a Colab-style environment to generate synthetic pill dispenser datasets and save outputs (CSV/JSON). It also writes a standalone Python script (`generator.py`) that can be run with CLI arguments, plus basic unit tests.

In [None]:
# Section 1: Set Up Colab Environment
import sys, os, subprocess, json
import platform
# Report the interpreter and OS so a run can be reproduced or debugged later.
print("Python version:", sys.version)
print("Platform:", platform.platform())

# Optional: mount Google Drive when in Colab.
# Detection (the import) and mounting are handled separately so that a failed
# mount inside Colab is not misreported as "not running in Colab".
try:
    import google.colab
    from google.colab import drive
except ImportError:
    IN_COLAB = False
    print("Not running in Colab; skipping Drive mount.")
else:
    print("Running in Google Colab.")
    try:
        drive.mount('/content/drive')
    except Exception as exc:
        # IN_COLAB gates the later "copy outputs to Drive" step, so it stays
        # False when Drive is unavailable, exactly as before this change.
        IN_COLAB = False
        print(f"Google Drive mount failed ({exc}); Drive copy will be skipped.")
    else:
        IN_COLAB = True

# Ensure dependencies (numpy, pandas, pytest) are available
def ensure_package(pkg):
    """Import *pkg*, installing it with pip first if the import fails.

    Prints whether the package was already available or is being installed.
    Installation runs pip through the current interpreter
    (``sys.executable -m pip install <pkg>``); ``subprocess.check_call``
    raises ``CalledProcessError`` if pip exits with a non-zero status.
    """
    try:
        __import__(pkg)
    except ImportError:
        print(f"Installing '{pkg}'...")
        pip_command = [sys.executable, '-m', 'pip', 'install', pkg]
        subprocess.check_call(pip_command)
    else:
        print(f"Package '{pkg}' already installed.")

# Check each third-party requirement of this notebook in turn; anything that
# cannot be imported is installed on the spot.
for p in ('numpy', 'pandas', 'pytest'):
    ensure_package(p)

In [None]:
# Section 2: Define Config Parameters
import numpy as np
import pandas as pd

# Settings read by the notebook's generator functions. Each list holds the
# values a field is sampled from; each *_rate is compared against a uniform
# random draw.
DEFAULT_CONFIG = dict(
    # Dataset size, RNG seed and output locations
    num_records=1000,
    random_seed=42,
    output_dir='outputs',
    csv_filename='pill_events.csv',
    json_filename='pill_events.json',

    # Field distributions
    users=['user_a', 'user_b', 'user_c'],
    devices=['PILL_DISPENSER_841aa6080814', 'PILL_DISPENSER_DEFAULT'],
    containers=[1, 2, 3, 4, 5],
    dosages=[1, 2, 3, 4, 5],
    success_rate=0.97,  # chance an event's Status is 'Success'
    manual_rate=0.6,    # chance an event is a manual (not scheduled) dispense
    schedule_names=['Container 1 Medication', 'Container 2 Medication', 'Container 3 Medication'],

    # Window from which event timestamps are drawn
    start_datetime='2025-12-10 00:00:00',
    end_datetime='2025-12-15 00:00:00',
)

# Create output directory
os.makedirs(DEFAULT_CONFIG['output_dir'], exist_ok=True)

In [None]:
# Section 3: Implement Data Generation Functions
from datetime import datetime, timedelta

# Seed NumPy's global RNG once, here, so the draws made by the functions
# defined below are repeatable from this point on.
np.random.seed(DEFAULT_CONFIG['random_seed'])

# Timestamp sampling window and its length in whole seconds.
start_dt, end_dt = (
    pd.to_datetime(DEFAULT_CONFIG[bound])
    for bound in ('start_datetime', 'end_datetime')
)
total_seconds = int((end_dt - start_dt).total_seconds())

# Index 0 is the manual event label, index 1 the scheduled one.
EVENT_TYPES = ['Manual dispense via web app', 'Scheduled dispense']

def random_timestamp():
    """Return a random 'YYYY-MM-DD HH:MM:SS' string inside the sampling window.

    The offset is drawn uniformly from ``[0, total_seconds)`` seconds after
    ``start_dt``; if the window is shorter than one second the upper bound is
    clamped to 1, so the result is always ``start_dt`` itself.
    """
    upper_bound = total_seconds if total_seconds > 1 else 1
    seconds_after_start = int(np.random.randint(0, upper_bound))
    moment = start_dt + timedelta(seconds=seconds_after_start)
    return moment.strftime('%Y-%m-%d %H:%M:%S')

def generate_event(i):
    """Build one synthetic dispense event as a ``{column name: value}`` dict.

    ``i`` is accepted so callers can pass the row index, but it is not used.
    All randomness comes from NumPy's global RNG, so the order of the draws
    below determines the output for a given seed and must stay stable.
    """
    cfg = DEFAULT_CONFIG

    # Fixed draw order: manual?, pills, device, container, user, success.
    manual = np.random.rand() < cfg['manual_rate']
    pill_count = int(np.random.choice(cfg['dosages']))
    device = str(np.random.choice(cfg['devices']))
    slot = int(np.random.choice(cfg['containers']))
    user = str(np.random.choice(cfg['users']))
    succeeded = np.random.rand() < cfg['success_rate']

    kind = EVENT_TYPES[0 if manual else 1]

    # Description formatting similar to sample logs
    if kind != 'Scheduled dispense':
        description = 'Manual dispense via web app'
    else:
        schedule = str(np.random.choice(cfg['schedule_names']))
        # Keep only the last piece after splitting on at most two spaces
        # ('Container 1 Medication' -> 'Medication').
        suffix = schedule.split(' ', 2)[-1]
        description = f"Scheduled dispense: Container {slot} {suffix}"

    # The timestamp is drawn last, after every other random field.
    stamp = random_timestamp()

    event = {
        'Datetime': stamp,
        'Pills': pill_count,
        'Description': description,
        'Status': 'Success' if succeeded else 'Failure',
        'Device ID': device,
        'User ID': user,
        'Container': slot,
        'Event Type': kind,
    }
    # Add fields useful for future confusion matrix
    event['Expected Dispense'] = not manual  # scheduled implies expected event
    event['Sensor Dispense Detected'] = succeeded  # simplification for synthetic data
    return event

def generate_dataset(n):
    """Return a DataFrame holding ``n`` synthetic dispense events.

    Each row comes from ``generate_event``. When ``n`` is zero or negative no
    events are generated and an empty frame that still carries the event
    columns is returned, so column lookups such as ``df['Status']`` keep
    working instead of raising ``KeyError``.
    """
    rows = [generate_event(i) for i in range(n)]
    if not rows:
        # Column names mirror the keys of the dict built by generate_event.
        empty_schema = [
            'Datetime', 'Pills', 'Description', 'Status', 'Device ID',
            'User ID', 'Container', 'Event Type', 'Expected Dispense',
            'Sensor Dispense Detected',
        ]
        return pd.DataFrame(columns=empty_schema)
    return pd.DataFrame(rows)

print("Data generation functions defined successfully.")

In [None]:
# Section 4: Write Generator as Python Script File
# Destination of the standalone script, relative to the current working directory.
script_path = 'generator.py'
# Full source of the standalone script, kept as a string so this cell can write
# it to disk. The script carries its own copy of DEFAULT_CONFIG and exposes the
# --num-records, --output-dir, --csv-filename, --json-filename and --random-seed
# command-line options; it seeds NumPy inside its own generate_dataset(cfg).
script_code = '''#!/usr/bin/env python3
import os, sys, json
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import argparse

DEFAULT_CONFIG = {
    'num_records': 1000,
    'random_seed': 42,
    'output_dir': 'outputs',
    'csv_filename': 'pill_events.csv',
    'json_filename': 'pill_events.json',
    'users': ['user_a', 'user_b', 'user_c'],
    'devices': ['PILL_DISPENSER_841aa6080814', 'PILL_DISPENSER_DEFAULT'],
    'containers': [1, 2, 3, 4, 5],
    'dosages': [1, 2, 3, 4, 5],
    'success_rate': 0.97,
    'manual_rate': 0.6,
    'schedule_names': ['Container 1 Medication', 'Container 2 Medication', 'Container 3 Medication'],
    'start_datetime': '2025-12-10 00:00:00',
    'end_datetime': '2025-12-15 00:00:00'
}

EVENT_TYPES = ['Manual dispense via web app', 'Scheduled dispense']

def random_timestamp(start_dt, end_dt):
    total_seconds = int((end_dt - start_dt).total_seconds())
    offset = np.random.randint(0, max(total_seconds, 1))
    return (start_dt + timedelta(seconds=int(offset))).strftime('%Y-%m-%d %H:%M:%S')


def generate_event(cfg):
    is_manual = np.random.rand() < cfg['manual_rate']
    event_type = EVENT_TYPES[0] if is_manual else EVENT_TYPES[1]
    pills = int(np.random.choice(cfg['dosages']))
    device_id = str(np.random.choice(cfg['devices']))
    container = int(np.random.choice(cfg['containers']))
    user_id = str(np.random.choice(cfg['users']))
    success = np.random.rand() < cfg['success_rate']

    if event_type == 'Scheduled dispense':
        med_name = str(np.random.choice(cfg['schedule_names']))
        desc = f"Scheduled dispense: Container {container} {med_name.split(' ', 2)[-1]}"
    else:
        desc = 'Manual dispense via web app'

    expected_dispense = not is_manual
    sensor_dispense_detected = success

    return {
        'Datetime': None,
        'Pills': pills,
        'Description': desc,
        'Status': 'Success' if success else 'Failure',
        'Device ID': device_id,
        'User ID': user_id,
        'Container': container,
        'Event Type': event_type,
        'Expected Dispense': expected_dispense,
        'Sensor Dispense Detected': sensor_dispense_detected
    }


def generate_dataset(cfg):
    np.random.seed(cfg['random_seed'])
    start_dt = pd.to_datetime(cfg['start_datetime'])
    end_dt = pd.to_datetime(cfg['end_datetime'])
    rows = []
    for i in range(cfg['num_records']):
        row = generate_event(cfg)
        row['Datetime'] = random_timestamp(start_dt, end_dt)
        rows.append(row)
    return pd.DataFrame(rows)


def main():
    parser = argparse.ArgumentParser(description='Synthetic pill dispenser event generator')
    parser.add_argument('--num-records', type=int, default=DEFAULT_CONFIG['num_records'])
    parser.add_argument('--output-dir', type=str, default=DEFAULT_CONFIG['output_dir'])
    parser.add_argument('--csv-filename', type=str, default=DEFAULT_CONFIG['csv_filename'])
    parser.add_argument('--json-filename', type=str, default=DEFAULT_CONFIG['json_filename'])
    parser.add_argument('--random-seed', type=int, default=DEFAULT_CONFIG['random_seed'])
    args = parser.parse_args()

    cfg = DEFAULT_CONFIG.copy()
    cfg['num_records'] = args.num_records
    cfg['output_dir'] = args.output_dir
    cfg['csv_filename'] = args.csv_filename
    cfg['json_filename'] = args.json_filename
    cfg['random_seed'] = args.random_seed

    os.makedirs(cfg['output_dir'], exist_ok=True)

    df = generate_dataset(cfg)

    csv_path = os.path.join(cfg['output_dir'], cfg['csv_filename'])
    json_path = os.path.join(cfg['output_dir'], cfg['json_filename'])

    df.to_csv(csv_path, index=False)
    df.to_json(json_path, orient='records', lines=False)
    print(f"Saved CSV to {csv_path}")
    print(f"Saved JSON to {json_path}")

    # Basic summary
    success_rate = (df['Status'] == 'Success').mean()
    manual_count = (df['Event Type'] == 'Manual dispense via web app').sum()
    scheduled_count = (df['Event Type'] == 'Scheduled dispense').sum()
    total_pills = df['Pills'].sum()
    print(f"Success rate: {success_rate:.2%}")
    print(f"Manual events: {manual_count}, Scheduled events: {scheduled_count}")
    print(f"Total pills dispensed: {total_pills}")

if __name__ == '__main__':
    main()
'''
# Write the source above to disk; mode 'w' replaces any existing generator.py.
with open(script_path, 'w', encoding='utf-8') as f:
    f.write(script_code)
print(f"Wrote script to {script_path}")

In [None]:
# Section 5: Run and Verify Script Output
!python generator.py --num-records 500 --output-dir outputs --csv-filename pill_events_500.csv --json-filename pill_events_500.json

# Load and preview
df = pd.read_csv(os.path.join('outputs', 'pill_events_500.csv'))
print(df.head())

# Headline figures recomputed from the saved CSV, one value at a time.
print("\nSummary:")
event_count = len(df)
print("Total events:", event_count)
success_share = (df['Status'] == 'Success').mean()
print("Success rate:", success_share)
type_counts = df['Event Type'].value_counts().to_dict()
print("Manual vs Scheduled:", type_counts)
pills_total = df['Pills'].sum()
print("Total pills dispensed:", pills_total)

In [None]:
# Section 6: Save Outputs to Google Drive (optional)
# Copy the generated files to Google Drive when Drive is available.
# shutil.copy replaces the former `!cp` shell magic: it also works outside
# IPython, does not depend on shell quoting, and reports a failed copy instead
# of claiming success.
if IN_COLAB:
    import shutil  # local import: only needed for the Drive copy

    drive_dir = '/content/drive/MyDrive/pill_dispenser_outputs'
    os.makedirs(drive_dir, exist_ok=True)
    copied = []
    for fname in ['pill_events_500.csv', 'pill_events_500.json']:
        src = os.path.join('outputs', fname)
        dst = os.path.join(drive_dir, fname)
        try:
            shutil.copy(src, dst)
        except OSError as exc:
            # Best-effort step: report the problem and continue with the next file.
            print(f"Could not copy {src} to {dst}: {exc}")
        else:
            copied.append(fname)
    if copied:
        print("Copied outputs to:", drive_dir)
else:
    print("Not in Colab; skipping Drive copy. Files in local 'outputs' folder.")

In [None]:
# Section 7: Add Basic Unit Tests
# Source of the pytest module, kept as a string so this cell can write it to
# disk. Both tests read 'outputs/pill_events_500.csv' via a relative path, so
# that file (produced by the generator.py run in Section 5) must already exist
# under the directory pytest is started from.
test_code = '''import pandas as pd

def test_schema_columns():
    df = pd.read_csv('outputs/pill_events_500.csv')
    required = {
        'Datetime','Pills','Description','Status','Device ID','User ID','Container','Event Type',
        'Expected Dispense','Sensor Dispense Detected'
    }
    assert required.issubset(df.columns), f"Missing columns: {required - set(df.columns)}"

def test_value_ranges():
    df = pd.read_csv('outputs/pill_events_500.csv')
    assert df['Pills'].between(1, 5).all(), "Pills outside expected range 1-5"
    assert df['Container'].between(1, 5).all(), "Container outside expected range 1-5"
    assert set(df['Status'].unique()) <= {'Success','Failure'}
    assert set(df['Event Type'].unique()) <= {'Manual dispense via web app','Scheduled dispense'}
'''
# Write the test module to disk; mode 'w' replaces any existing test_generator.py.
with open('test_generator.py', 'w', encoding='utf-8') as f:
    f.write(test_code)
print('Wrote test_generator.py')

!pytest -q

## Next Steps
- Parse real dispenser logs in a companion notebook to compute functional accuracy and a sensor confusion matrix from the actual fields (expected vs. detected dispenses).
- Extend `generator.py` to simulate delays relative to the schedule, enabling on-time metrics and dispensed-quantity correctness checks.