In [5]:
import pandas as pd
import random
from datetime import datetime, timedelta

# --------------------------------------------
# Setup
# --------------------------------------------
# Categorical value pools used when generating synthetic records.
roles = ["Patient", "Radiation Oncology", "Urologic Oncology"]
races = [
    "Black or African American",
    "White",
    "Asian",
    "American Indian or Alaska Native",
]
next_steps = [
    "Still deciding",
    "Further workup",
    "Active surveillance",
    "Radiation",
    "Surgery",
]

# Dataset dimensions: one specialist of each kind per ten patients.
num_patients = 10000
num_questions = 10
num_radiologists = num_patients // 10
num_urologists = num_patients // 10

# Specialist IDs
# Zero-pad every ID to the width of the largest number in its pool (never
# fewer than three digits) so all IDs have the same length and sort
# lexicographically. A fixed three-digit pad would produce mixed-width IDs
# such as "R001" and "R1000" once a pool holds 1000 or more entries.
_radiologist_id_width = max(3, len(str(num_radiologists)))
_urologist_id_width = max(3, len(str(num_urologists)))
radiologist_ids = [
    f"R{i:0{_radiologist_id_width}d}" for i in range(1, num_radiologists + 1)
]
urologist_ids = [
    f"U{i:0{_urologist_id_width}d}" for i in range(1, num_urologists + 1)
]

# Accumulates one dict per generated survey submission.
records = []

def generate_answers(count=None):
    """Return a list of random survey answers.

    Args:
        count: Number of answers to generate. When omitted (``None``), the
            module-level ``num_questions`` is used, which matches the
            behavior of calling this function with no arguments.

    Returns:
        A list of ``count`` integers, each drawn independently with
        ``random.randint(1, 6)`` (both ends inclusive).
    """
    if count is None:
        # Resolved at call time so later changes to num_questions are honored.
        count = num_questions
    return [random.randint(1, 6) for _ in range(count)]

def skewed_distance():
    """Return a random whole-number distance in miles.

    A Beta(2, 3) variate (a value between 0 and 1) is scaled by 400 and
    truncated to an integer.
    """
    scale_miles = 400
    fraction = random.betavariate(2, 3)
    return int(fraction * scale_miles)

def random_appointment_date(reference_date=None):
    """Return a random appointment date in the past year as ``YYYY-MM-DD``.

    The date is ``reference_date`` minus a random number of days. The offset
    is drawn in two stages: one of three windows (0-90, 91-180 or 181-364
    days back, all bounds inclusive) is selected with relative weights
    3, 3 and 2, then a day offset is drawn uniformly inside that window.

    Args:
        reference_date: The ``datetime`` the offset is subtracted from.
            Defaults to ``datetime.today()`` evaluated at call time; pass an
            explicit value to make the result independent of the clock.

    Returns:
        The resulting date formatted with ``"%Y-%m-%d"``.
    """
    if reference_date is None:
        reference_date = datetime.today()

    # Choose the window first and draw once inside it, instead of drawing an
    # offset for every window and throwing two of them away.
    earliest, latest = random.choices(
        population=[(0, 90), (91, 180), (181, 364)],
        weights=[3, 3, 2],
    )[0]
    offset_days = random.randint(earliest, latest)
    return (reference_date - timedelta(days=offset_days)).strftime("%Y-%m-%d")

# --------------------------------------------
# Record creation
# --------------------------------------------
# Inclusive age bands and the relative weight with which each is sampled.
_AGE_BANDS = [(45, 49), (50, 59), (60, 69), (70, 79), (80, 90)]
_AGE_BAND_WEIGHTS = [1, 3, 3, 2, 1]

# Zero-pad patient/treatment numbers to the width of the largest one (never
# fewer than three digits) so every ID has the same length. A fixed
# three-digit pad yields mixed-width IDs such as "P001" and "P10000".
_patient_id_width = max(3, len(str(num_patients)))


def _build_record(submitter_id, role, treatment_id, appt_date, next_step,
                  answers, links, age=None, race=None, distance=None):
    """Return one survey-submission record as a dict.

    Keys are inserted in a fixed order (core fields, demographics, the
    entries of *links*, then ``survey_q1``..``survey_qN``).

    Args:
        submitter_id: Value stored under ``"id"``.
        role: Value stored under ``"submission_role"``.
        treatment_id: Identifier shared by all submissions for one patient.
        appt_date: Appointment date string.
        next_step: Value stored under ``"next_steps"``.
        answers: Sequence of numeric survey answers, one per question.
        links: Dict of cross-reference fields added after the demographics.
        age: Patient age, or ``None`` for specialist submissions.
        race: Patient race, or ``None`` for specialist submissions.
        distance: Patient travel distance, or ``None`` for specialists.
    """
    record = {
        "id": submitter_id,
        "treatment_id": treatment_id,
        "submission_role": role,
        "appointment_date": appt_date,
        "next_steps": next_step,
        # NOTE(review): the column is named "score_below_3" but the check is
        # `<= 3`, so an answer of exactly 3 also sets the flag. Confirm which
        # threshold is intended before relying on this column.
        "score_below_3": any(a <= 3 for a in answers),
        "age": age,
        "race": race,
        "distance_miles": distance,
    }
    record.update(links)
    record.update({f"survey_q{j}": val for j, val in enumerate(answers, 1)})
    return record


for i in range(1, num_patients + 1):
    base_index = f"{i:0{_patient_id_width}d}"
    treatment_id = f"T{base_index}"
    patient_id = f"P{base_index}"

    assigned_ro_id = random.choice(radiologist_ids)
    assigned_uo_id = random.choice(urologist_ids)

    # Pick the age band first, then draw a single age inside it, rather than
    # drawing an age for every band and discarding all but one.
    age_low, age_high = random.choices(_AGE_BANDS, weights=_AGE_BAND_WEIGHTS)[0]
    age = random.randint(age_low, age_high)
    race = random.choice(races)
    distance = skewed_distance()
    next_step = random.choice(next_steps)
    appt_date = random_appointment_date()

    # --- Patient Record ---
    # Carries the demographics and points at both assigned specialists.
    patient_record = _build_record(
        patient_id, "Patient", treatment_id, appt_date, next_step,
        generate_answers(),
        links={
            "radiation_oncologist_id": assigned_ro_id,
            "urologist_oncologist_id": assigned_uo_id,
        },
        age=age, race=race, distance=distance,
    )
    records.append(patient_record)

    # --- Radiation Oncologist ---
    # Specialist submissions have no demographics and link back to the patient.
    ro_record = _build_record(
        assigned_ro_id, "Radiation Oncology", treatment_id, appt_date,
        next_step, generate_answers(),
        links={"linked_patient_id": patient_id},
    )
    records.append(ro_record)

    # --- Urologist ---
    uo_record = _build_record(
        assigned_uo_id, "Urologic Oncology", treatment_id, appt_date,
        next_step, generate_answers(),
        links={"linked_patient_id": patient_id},
    )
    records.append(uo_record)

# --------------------------------------------
# Save
# --------------------------------------------
# Turn the accumulated record dicts into a single table, then write it to a
# CSV file in the working directory without the DataFrame's index column.
df = pd.DataFrame(data=records)
df.to_csv(path_or_buf="radar_survey_synthetic_data.csv", index=False)
print("✔ Varied and balanced data saved to radar_survey_synthetic_data.csv")


✔ Varied and balanced data saved to radar_survey_synthetic_data.csv
