File to test dataframe implementations and read initial core data.
Because the data was generated synthetically, we did not have to spend time cleaning and sorting it; that kind of preprocessing is not relevant to this specific project.

In [None]:
import pandas as pd
# Load the ER wait-time dataset, using the first CSV column as the index.
df = pd.read_csv(r"C:\Users\sorin.creanga\Desktop\Medcare\base-data\ER Wait Time Dataset.csv", index_col=0)
# DataFrame.info is a method: it has to be called to print the
# column / dtype / non-null summary (the bare attribute only shows the
# bound-method repr).
df.info()

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

def generate_synthetic_data(n_samples: int = 5000, *, seed=42, reference_time=None) -> pd.DataFrame:
    """Build a DataFrame of artificial emergency-department (ED) visits.

    Every column is drawn independently of the others, so the frame is
    suitable for exercising DataFrame code but carries no cross-column
    relationships (e.g. wait time is not tied to triage level).

    Args:
        n_samples: Number of rows (patients) to generate.
        seed: Value passed to ``np.random.seed``. The default of 42 keeps
            the values produced by earlier versions of this function.
            Pass ``None`` to let NumPy pick a fresh seed.
        reference_time: ``datetime`` that all arrival times are measured
            back from. Defaults to ``datetime.now()``, evaluated once per
            call. Pass a fixed value for fully reproducible output.

    Returns:
        A DataFrame with ``n_samples`` rows and the columns
        ``patient_id``, ``anonymousId``, ``arrival_time``, ``triage``,
        ``complaint``, ``wait_time_minutes``, ``door_to_doctor_minutes``,
        ``length_of_stay_minutes``, ``left_without_seen``,
        ``discharge_disposition``, ``patient_age`` and ``gender``.

    Note:
        This seeds NumPy's *global* legacy random generator, which also
        affects any ``np.random`` calls made after this function returns.
    """
    np.random.seed(seed)

    # Use one reference instant for the whole batch. Calling
    # datetime.now() per row would give every row a slightly different
    # base time and make the output non-reproducible.
    if reference_time is None:
        reference_time = datetime.now()

    # Triage categories and the probability of drawing each one.
    health_levels = ['RESUSCITATION', 'EMERGENT', 'URGENT', 'SEMI_URGENT', 'NON_URGENT']
    health_probabilities = [0.05, 0.15, 0.30, 0.35, 0.15]

    # Presenting complaints, sampled uniformly.
    complaints = [
        'Chest Pain', 'Fever', 'Broken Arm', 'Headache', 'Breathing Issues',
        'Stomach Pain', 'Back Pain', 'Dizziness', 'Nausea', 'Laceration',
        'Allergic Reaction', 'Anxiety', 'Fracture', 'Burns', 'Syncope'
    ]

    # Discharge outcomes and the probability of drawing each one.
    dispositions = ['Admitted', 'Discharged', 'AMA', 'Transferred', 'Observation']
    disposition_probs = [0.30, 0.60, 0.05, 0.03, 0.02]

    # NOTE: the dict entries below are evaluated top to bottom, and that
    # order fixes the sequence of random draws. Reordering them changes
    # the generated values for a given seed.
    data = {
        'patient_id': range(1, n_samples + 1),
        'anonymousId': [f'PT-{i:05d}' for i in range(1, n_samples + 1)],

        # Each arrival is the reference time minus an exponentially
        # distributed number of hours (scale 12) and a uniform 0-59
        # minute offset. No time-of-day pattern is modelled. The draws
        # stay per-sample (not vectorised) so the seeded random stream
        # matches earlier versions.
        'arrival_time': [
            reference_time - timedelta(
                hours=np.random.exponential(scale=12),
                minutes=np.random.randint(0, 60)
            ) for _ in range(n_samples)
        ],

        'triage': np.random.choice(health_levels, n_samples, p=health_probabilities),

        'complaint': np.random.choice(complaints, n_samples),

        # Durations in minutes, each an independent exponential draw.
        'wait_time_minutes': np.random.exponential(scale=45, size=n_samples),
        'door_to_doctor_minutes': np.random.exponential(scale=60, size=n_samples),
        'length_of_stay_minutes': np.random.exponential(scale=120, size=n_samples),

        # 0/1 flag, set to 1 with probability 0.06.
        'left_without_seen': np.random.binomial(1, 0.06, n_samples),

        'discharge_disposition': np.random.choice(
            dispositions, n_samples, p=disposition_probs
        ),

        # Gamma draw truncated to an integer and shifted by 5, so the
        # minimum age is 5; there is no upper cap.
        'patient_age': np.random.gamma(shape=2, scale=25, size=n_samples).astype(int) + 5,

        'gender': np.random.choice(['M', 'F'], n_samples, p=[0.48, 0.52]),
    }
    return pd.DataFrame(data)
    

In [7]:
# Quick sanity check: generate 2000 synthetic rows and print the first five.

print(generate_synthetic_data(n_samples=2000).head(5))

   patient_id anonymousId               arrival_time       triage  \
0           1    PT-00001 2025-11-22 03:48:07.859566  SEMI_URGENT   
1           2    PT-00002 2025-11-22 07:21:05.825666       URGENT   
2           3    PT-00003 2025-11-21 22:18:41.122730  SEMI_URGENT   
3           4    PT-00004 2025-11-22 02:26:59.767155       URGENT   
4           5    PT-00005 2025-11-22 08:47:55.206249  SEMI_URGENT   

      complaint  wait_time_minutes  door_to_doctor_minutes  \
0  Stomach Pain          16.925223               93.811388   
1       Anxiety          43.607272                8.600574   
2    Laceration          45.768535               22.314017   
3     Back Pain          54.086582               57.383975   
4         Burns           5.345853               70.455397   

   length_of_stay_minutes  left_without_seen discharge_disposition  \
0               15.791645                  0            Discharged   
1               79.968154                  0              Admitted   
2 