In [1]:
import random
import pandas as pd
from faker import Faker
from datetime import timedelta, datetime  # Import datetime

In [2]:
data_path = "./data/"

In [3]:
fake = Faker()

**Overall Logical Sequence**:
1. **Patients** → 2. **Providers** → 3. **HealthcareFacilities** → 4. **InsurancePayers** → 5. **PatientCoverage** → 6. **Medications** → 7. **Encounters** → 8. **Diagnoses** → 9. **Procedures** → 10. **MedicationOrders** → 11. **LabTests** → 12. **LabResults** → 13. **Claims** → 14. **ClaimLineItems**

### Reasoning

1. **Patients**:  
   - Standalone. But everything else (Encounters, Coverage, Claims) references them.  
  
2. **Providers**:   
   - Also fairly standalone. But an **Encounter** references a Provider (e.g., a doctor seeing the patient).  
  
3. **HealthcareFacilities**:    
   - Another independent entity that **Encounters** eventually reference (the place of service).   
  
4. **InsurancePayers**:    
   - Must exist before we can define coverage or claims pointing to them.  
  
5. **PatientCoverage**:    
   - Ties a **Patient** to an **InsurancePayer**. We need both in place already.  
  
6. **Medications**:    
   - Must be created before we can **order** them (MedicationOrders).  
  
7. **Encounters**:    
   - Each Encounter references a **Patient**, a **Provider**, and a **HealthcareFacility**.    
   - So those must be built first.  
  
8. **Diagnoses**:    
   - Typically attached to a specific **Encounter** (i.e. one or more diagnoses result from that visit).    
   - So the **Encounter** must exist first.  
  
9. **Procedures**:    
   - Also reference the Encounter. Must come after the Encounter is known.  
   
10. **MedicationOrders**:    
   - An order references a specific **Encounter** (when it was prescribed) and a **Medication** (which drug).    
   - Hence, we need Encounters and Medications first.  
  
11. **LabTests**:    
   - Again, typically linked to a specific **Encounter**.    
   - So must generate Encounters first.   
   
12. **LabResults**:    
   - Each LabResult references a **LabTest**. So we generate **LabTests** first, then results.   
   
13. **Claims**:    
   - A Claim references which **Encounter** it’s billing for, plus a **Payer** (and indirectly the Patient).    
   - So it must come after Encounters and Payers are defined.   
   
14. **ClaimLineItems**:     
   - Belong to a **Claim**. Also might reference procedure codes that came from the Encounter, but from a data generation standpoint, it references the Claim first.     
   - So generate **Claims** first, then line items.   
   
Hence the logic: start with **Patients**, **Providers**, **Facilities**, and **Payers**.   
Then define coverage, then define **Encounters** (which revolve around patient + provider + facility). After that, define sub-entities of the encounter (diagnoses, procedures, meds, labs).     
Finally, build **Claims** referencing the completed encounter data.   

In [4]:
# Configuration
NUM_PATIENTS = 50
NUM_PROVIDERS = 20
NUM_FACILITIES = 10
NUM_PAYERS = 8
NUM_ENCOUNTERS = 120
NUM_DIAGNOSES = 200
NUM_PROCEDURES = 150
NUM_MEDICATION_ORDERS = 100
NUM_MEDICATIONS = 50
NUM_LABTESTS = 80
NUM_LABRESULTS = 150
NUM_CLAIMS = 100
NUM_CLAIM_LINES = 200
NUM_PATIENT_COVERAGE = 60

# Some small sets of valid codes to simulate constraints
ICD10_CODES = ["E11.9", "I10", "J45.909", "K21.9", "M17.11", "Z33.1"]  # random sample
SNOMED_CODES = ["44054006", "195967001"]  # minimal example
CPT_CODES = ["99213", "99214", "99395", "99406"] 
HCPCS_CODES = ["G0101", "J0885"]  # minimal example
RXNORM_CODES = ["1049630", "1044658", "860975"]  # placeholders
LOINC_CODES = ["4548-4", "6298-4", "2345-7"]  # e.g. A1c, RBC count, etc.

In [5]:
# 1 Generate Patients
genders = ["Male", "Female", "Other"]
patients = []
for i in range(NUM_PATIENTS):
    dob = fake.date_of_birth(minimum_age=0, maximum_age=90)
    patients.append({
        "id": f"pat_{i}",
        "firstName": fake.first_name(),
        "lastName": fake.last_name(),
        "birthDate": dob.isoformat(),
        "gender": random.choice(genders),
        "address": fake.address().replace("\n", ", "),
        "phoneNumber": fake.phone_number()
    })

In [6]:
# 2 Generate Providers
specialties = ["General", "Cardiology", "Endocrinology", "Pediatrics", "Surgery"]
providers = []
for i in range(NUM_PROVIDERS):
    providers.append({
        "id": f"prov_{i}",
        "providerName": fake.name(),
        "licenseNumber": f"LIC-{random.randint(1000,9999)}",
        "specialty": random.choice(specialties)
    })

In [7]:
# 3 Generate HealthcareFacilities
facility_types = ["Hospital", "Clinic", "ER", "OutpatientCenter"]
facilities = []
for i in range(NUM_FACILITIES):
    facilities.append({
        "id": f"fac_{i}",
        "facilityName": fake.company() + " " + random.choice(["Hospital", "Center", "Clinic"]),
        "facilityType": random.choice(facility_types),
        "location": fake.city()
    })

In [8]:
# 4 Generate InsurancePayers
payer_types = ["Private", "Medicare", "Medicaid"]
payers = []
for i in range(NUM_PAYERS):
    payers.append({
        "id": f"payer_{i}",
        "payerName": fake.company() + " Insurance",
        "payerType": random.choice(payer_types),
        "contactInfo": fake.phone_number()
    })

In [9]:
# 5 Generate Medications (RxNorm constraints)
medications = []
for i in range(NUM_MEDICATIONS):
    brand = fake.word().title()
    generic = fake.word().lower()
    rxnorm = random.choice(RXNORM_CODES)
    meds_strength = str(random.randint(5, 500)) + "mg"
    medications.append({
        "id": f"med_{i}",
        "brandName": brand,
        "genericName": generic,
        "rxNormCode": rxnorm,
        "strength": meds_strength
    })

In [10]:
# 6 Generate Patient Coverage
patientCoverage = []
for i in range(NUM_PATIENT_COVERAGE):
    pat = random.choice(patients)
    pay = random.choice(payers)
    start = fake.date_between(start_date='-5y', end_date='-1y')
    end = start + timedelta(days=random.randint(180, 1460))  # up to 4 yrs coverage
    if end > datetime.now().date():
        end = datetime.now().date()  # some still active or recently ended
    patientCoverage.append({
        "id": f"cov_{i}",
        "patientID": pat["id"],
        "payerID": pay["id"],
        "coverageStart": start.isoformat(),
        "coverageEnd": end.isoformat()
    })

In [11]:
# 7 Generate Encounters
encounter_types = ["Inpatient", "Outpatient", "ER"]
encounters = []
for i in range(NUM_ENCOUNTERS):
    pat = random.choice(patients)
    fac = random.choice(facilities)
    start_dt = fake.date_time_between(start_date='-2y', end_date='now')
    duration_days = random.randint(0, 5)
    end_dt = start_dt + timedelta(days=duration_days)
    e_type = random.choice(encounter_types)
    reason = fake.sentence(nb_words=5)

    provider_count = random.randint(1, 2)
    # We'll store them as a simple list in data, though in an RDB you'd do a separate table or link
    # For pseudocode, we can just keep track of a main provider or an array
    assigned_provider = random.choice(providers)

    encounters.append({
        "id": f"enc_{i}",
        "encounterID": f"ENC-{random.randint(10000,99999)}",
        "startDateTime": start_dt.isoformat(),
        "endDateTime": end_dt.isoformat(),
        "reasonForVisit": reason,
        "encounterType": e_type,
        "patientID": pat["id"],
        "providerID": assigned_provider["id"],
        "facilityID": fac["id"]
    })

In [12]:
# 8 Generate Diagnoses
# We'll reference random encounters, assign codes
diagnoses = []
for i in range(NUM_DIAGNOSES):
    enc = random.choice(encounters)
    # pick ICD-10 vs SNOMED for codeSystem
    if random.random() < 0.8:
        codeSys = "ICD-10"
        codeVal = random.choice(ICD10_CODES)
    else:
        codeSys = "SNOMED"
        codeVal = random.choice(SNOMED_CODES)

    diagnoses.append({
        "id": f"diag_{i}",
        "encounterID": enc["id"],
        "diagnosisCode": codeVal,
        "codeSystem": codeSys,
        "diagnosisDesc": fake.sentence(nb_words=4)
    })

In [13]:
# 9 Generate Procedures
procedures = []
for i in range(NUM_PROCEDURES):
    enc = random.choice(encounters)
    code_options = random.choice([CPT_CODES, HCPCS_CODES])
    chosen_code = random.choice(code_options)
    # We'll randomly pick a procedureDate within the encounter range
    start_dt = datetime.fromisoformat(enc["startDateTime"])
    end_dt = datetime.fromisoformat(enc["endDateTime"])
    if end_dt <= start_dt:
        proc_dt = start_dt
    else:
        dt_span = end_dt - start_dt
        offset_sec = random.randint(0, int(dt_span.total_seconds()))
        proc_dt = start_dt + timedelta(seconds=offset_sec)

    procedures.append({
        "id": f"proc_{i}",
        "encounterID": enc["id"],
        "procedureCode": chosen_code,
        "codeSystem": "CPT" if chosen_code in CPT_CODES else "HCPCS",
        "procedureDesc": fake.bs(),
        "procedureDate": proc_dt.isoformat()
    })

In [14]:
# 10 Generate MedicationOrders
medication_orders = []
for i in range(NUM_MEDICATION_ORDERS):
    enc = random.choice(encounters)
    # ensure we pick a date within encounter timeframe
    sdt = datetime.fromisoformat(enc["startDateTime"])
    edt = datetime.fromisoformat(enc["endDateTime"])
    if edt < sdt:
        edt = sdt + timedelta(days=1)
    mo_date = sdt + timedelta(days=random.randint(0, max(1, (edt-sdt).days)))
    med = random.choice(medications)
    # end date can't be before start date
    end_d = mo_date + timedelta(days=random.randint(10, 90))

    medication_orders.append({
        "id": f"mo_{i}",
        "encounterID": enc["id"],
        "orderID": f"MO-{random.randint(1000,9999)}",
        "startDate": mo_date.date().isoformat(),
        "endDate": end_d.date().isoformat(),
        "dosage": f"{random.randint(1,3)} tablets",
        "frequency": random.choice(["BID", "TID", "QHS"]),
        "medicationID": med["id"]
    })

In [15]:
# 11 Generate LabTests
lab_tests = []
for i in range(NUM_LABTESTS):
    enc = random.choice(encounters)
    sdt = datetime.fromisoformat(enc["startDateTime"])
    edt = datetime.fromisoformat(enc["endDateTime"])
    if edt < sdt:
        edt = sdt + timedelta(hours=6)

    test_time = sdt + timedelta(hours=random.randint(0, 24 * 3))
    if test_time > edt:
        test_time = edt

    loinc_code = random.choice(LOINC_CODES)

    lab_tests.append({
        "id": f"lt_{i}",
        "encounterID": enc["id"],
        "testID": f"LT-{random.randint(1000,9999)}",
        "testName": "LabTest " + fake.word(),
        "testCode": loinc_code,
        "specimenType": random.choice(["Blood", "Urine", "Saliva"]),
        "testDateTime": test_time.isoformat()
    })

In [16]:
# 12 Generate LabResults
lab_results = []
for i in range(NUM_LABRESULTS):
    lt = random.choice(lab_tests)
    # attempt to keep result within a day or so after test
    tdt = datetime.fromisoformat(lt["testDateTime"])
    result_dt = tdt + timedelta(hours=random.randint(1, 48))
    val = round(random.uniform(0.5, 15.0), 2)
    lab_results.append({
        "id": f"lr_{i}",
        "labTestID": lt["id"],
        "resultValue": val,
        "units": random.choice(["mg/dL", "g/L", "mmol/L"]),
        "referenceRange": "Normal range depends",
        "resultDateTime": result_dt.isoformat()
    })

In [17]:
# 13 Generate Claims
claims = []
for i in range(NUM_CLAIMS):
    enc = random.choice(encounters)
    pay = random.choice(payers)
    billed_amt = round(random.uniform(100, 10000), 2)
    stat_options = ["Submitted", "Paid", "Denied"]
    chosen_stat = random.choice(stat_options)
    paid_amt = 0.0
    if chosen_stat == "Paid":
        paid_amt = round(random.uniform(0.5, 1.0)*billed_amt, 2)  # ensure totalPaid <= totalBilled

    claim_d = datetime.fromisoformat(enc["endDateTime"])
    # claim date slightly after encounter
    claim_date = claim_d + timedelta(days=random.randint(1, 10))

    claims.append({
        "id": f"claim_{i}",
        "claimNumber": f"CL-{random.randint(10000,99999)}",
        "claimDate": claim_date.date().isoformat(),
        "totalBilled": billed_amt,
        "totalPaid": paid_amt,
        "status": chosen_stat,
        "payerID": pay["id"],
        "encounterID": enc["id"]
    })

In [18]:
# 14 Generate ClaimLineItems
claim_line_items = []
for i in range(NUM_CLAIM_LINES):
    cl = random.choice(claims)
    # find procedures from that encounter
    procs_for_enc = [p for p in procedures if p["encounterID"] == cl["encounterID"]]
    if not procs_for_enc:
        continue
    chosen_proc = random.choice(procs_for_enc)
    billed_line = round(random.uniform(50, 1000), 2)
    allowed_line = round(random.uniform(0.5, 1.0)*billed_line, 2)

    claim_line_items.append({
        "id": f"cli_{i}",
        "claimID": cl["id"],
        "lineCode": chosen_proc["procedureCode"],
        "billedAmount": billed_line,
        "allowedAmount": allowed_line
    })

In [19]:
# Summaries
print("\nGenerated Entities:\n")
print("  Patients:", len(patients))
print("  Providers:", len(providers))
print("  Facilities:", len(facilities))
print("  Payers:", len(payers))
print("  Medications:", len(medications))
print("  PatientCoverage:", len(patientCoverage))
print("  Encounters:", len(encounters))
print("  Diagnoses:", len(diagnoses))
print("  Procedures:", len(procedures))
print("  MedicationOrders:", len(medication_orders))
print("  LabTests:", len(lab_tests))
print("  LabResults:", len(lab_results))
print("  Claims:", len(claims))
print("  ClaimLineItems:", len(claim_line_items))


Generated Entities:

  Patients: 50
  Providers: 20
  Facilities: 10
  Payers: 8
  Medications: 50
  PatientCoverage: 60
  Encounters: 120
  Diagnoses: 200
  Procedures: 150
  MedicationOrders: 100
  LabTests: 80
  LabResults: 150
  Claims: 100
  ClaimLineItems: 153


In [20]:
# Print samples
print("\nSample Patient:", patients[0])
print("Sample Provider:", providers[0])
print("Sample Facility:", facilities[0])
print("Sample Payer:", payers[0])
print("Sample Medication:", medications[0])
print("Sample PatientCoverage:", patientCoverage[0] if patientCoverage else None)
print("Sample Encounter:", encounters[0])
print("Sample Diagnosis:", diagnoses[0] if diagnoses else None)
print("Sample Procedure:", procedures[0] if procedures else None)
print("Sample MedicationOrder:", medication_orders[0] if medication_orders else None)
print("Sample LabTest:", lab_tests[0] if lab_tests else None)
print("Sample LabResult:", lab_results[0] if lab_results else None)
print("Sample Claim:", claims[0] if claims else None)
print("Sample ClaimLineItem:", claim_line_items[0] if claim_line_items else None)


Sample Patient: {'id': 'pat_0', 'firstName': 'Andrew', 'lastName': 'Moore', 'birthDate': '2008-06-09', 'gender': 'Male', 'address': '6857 Koch Overpass, West Amandamouth, PR 78131', 'phoneNumber': '001-919-317-8156x237'}
Sample Provider: {'id': 'prov_0', 'providerName': 'Patrick Ryan', 'licenseNumber': 'LIC-1561', 'specialty': 'General'}
Sample Facility: {'id': 'fac_0', 'facilityName': 'Williams-Phillips Hospital', 'facilityType': 'Hospital', 'location': 'Sarahmouth'}
Sample Payer: {'id': 'payer_0', 'payerName': 'Hall and Sons Insurance', 'payerType': 'Medicaid', 'contactInfo': '918-521-7309x501'}
Sample Medication: {'id': 'med_0', 'brandName': 'Draw', 'genericName': 'against', 'rxNormCode': '860975', 'strength': '399mg'}
Sample PatientCoverage: {'id': 'cov_0', 'patientID': 'pat_26', 'payerID': 'payer_3', 'coverageStart': '2022-04-21', 'coverageEnd': '2025-02-19'}
Sample Encounter: {'id': 'enc_0', 'encounterID': 'ENC-99397', 'startDateTime': '2023-03-25T09:42:55', 'endDateTime': '2023

In [21]:
# persist the data
pd.DataFrame(patients).to_csv(data_path+"patients.csv", encoding = "utf-8", escapechar = "\"", index=False)
pd.DataFrame(providers).to_csv(data_path+"providers.csv", encoding = "utf-8", escapechar = "\"", index=False)
pd.DataFrame(facilities).to_csv(data_path+"facilities.csv", encoding = "utf-8", escapechar = "\"", index=False)
pd.DataFrame(payers).to_csv(data_path+"payers.csv", encoding = "utf-8", escapechar = "\"", index=False)
pd.DataFrame(medications).to_csv(data_path+"medications.csv", encoding = "utf-8", escapechar = "\"", index=False)
pd.DataFrame(patientCoverage).to_csv(data_path+"patient_coverage.csv", encoding = "utf-8", escapechar = "\"", index=False)
pd.DataFrame(encounters).to_csv(data_path+"encounters.csv", encoding = "utf-8", escapechar = "\"", index=False)
pd.DataFrame(diagnoses).to_csv(data_path+"diagnoses.csv", encoding = "utf-8", escapechar = "\"", index=False)
pd.DataFrame(procedures).to_csv(data_path+"procedures.csv", encoding = "utf-8", escapechar = "\"", index=False)
pd.DataFrame(medication_orders).to_csv(data_path+"medication_orders.csv", encoding = "utf-8", escapechar = "\"", index=False)
pd.DataFrame(lab_tests).to_csv(data_path+"lab_tests.csv", encoding = "utf-8", escapechar = "\"", index=False)
pd.DataFrame(lab_results).to_csv(data_path+"lab_results.csv", encoding = "utf-8", escapechar = "\"", index=False)
pd.DataFrame(claims).to_csv(data_path+"claims.csv", encoding = "utf-8", escapechar = "\"", index=False)
pd.DataFrame(claim_line_items).to_csv(data_path+"claim_line_items.csv", encoding = "utf-8", escapechar = "\"", index=False)