In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Fixed seed so the synthetic dataset (and the CSV every later cell reads) is
# identical on each Restart & Run All instead of changing with every execution.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Define hospital and visit reason options
hospitals = ['City Hospital', 'Green Valley Clinic', 'Sunrise Health', 'Oakwood Medical Center']
visit_reasons = ['Flu', 'Routine Checkup', 'Injury', 'Allergy', 'Surgery Consultation', 'COVID-19 Test']

# Create date range from April 1 to May 31, 2025 (daily frequency, both ends inclusive)
date_range = pd.date_range(start='2025-04-01', end='2025-05-31')
data = []

for date in date_range:
    # 5–10 visits per day
    for _ in range(random.randint(5, 10)):
        # Arrival time: midnight of the visit date plus 7-17 hours and 0-59 minutes.
        visit_time = datetime.combine(date, datetime.min.time()) + timedelta(
            hours=random.randint(7, 17), minutes=random.randint(0, 59)
        )
        # Check-out is 15-120 minutes after arrival; only this timestamp is
        # stored in the record (the arrival time itself is not kept).
        check_out_time = visit_time + timedelta(minutes=random.randint(15, 120))
        data.append({
            'patient_id': random.randint(50000, 99999),
            'hospital': random.choice(hospitals),
            'visit_date': date.date().isoformat(),
            'visit_reason': random.choice(visit_reasons),
            'cost': random.randint(50, 1500),
            'check_out_time': check_out_time.isoformat()
        })

# Save dataset to CSV and preview
df = pd.DataFrame(data)
df.to_csv('hospital_visits_apr_may.csv', index=False)
df.head(60)


Unnamed: 0,patient_id,hospital,visit_date,visit_reason,cost,check_out_time
0,82823,Sunrise Health,2025-04-01,Flu,97,2025-04-01T15:04:00
1,92119,Green Valley Clinic,2025-04-01,Flu,763,2025-04-01T09:52:00
2,59283,Green Valley Clinic,2025-04-01,Routine Checkup,448,2025-04-01T14:43:00
3,73539,Green Valley Clinic,2025-04-01,Allergy,1016,2025-04-01T07:53:00
4,86815,City Hospital,2025-04-01,Allergy,1005,2025-04-01T12:05:00
5,78168,Green Valley Clinic,2025-04-02,COVID-19 Test,87,2025-04-02T18:04:00
6,74809,Oakwood Medical Center,2025-04-02,COVID-19 Test,678,2025-04-02T13:42:00
7,80226,Oakwood Medical Center,2025-04-02,Surgery Consultation,407,2025-04-02T16:47:00
8,52328,Oakwood Medical Center,2025-04-02,Allergy,1198,2025-04-02T07:49:00
9,97346,City Hospital,2025-04-02,Injury,661,2025-04-02T15:07:00


In [3]:
 # FULL EXTRACTION
# Load every row of the visits CSV in one pass, parsing check_out_time as datetimes.
df_full = pd.read_csv(
    "hospital_visits_apr_may.csv",
    parse_dates=["check_out_time"],
)
print(f"Pulled {df_full.shape[0]} rows via full extraction.")
df_full.head()


Pulled 450 rows via full extraction.


Unnamed: 0,patient_id,hospital,visit_date,visit_reason,cost,check_out_time
0,78672,City Hospital,2025-04-01,Injury,1474,2025-04-01 18:25:00
1,88708,City Hospital,2025-04-01,COVID-19 Test,1406,2025-04-01 14:22:00
2,85521,Sunrise Health,2025-04-01,Allergy,1191,2025-04-01 11:34:00
3,58044,Sunrise Health,2025-04-01,Injury,1320,2025-04-01 14:54:00
4,89564,City Hospital,2025-04-01,COVID-19 Test,1193,2025-04-01 11:01:00


In [9]:
# Seed the checkpoint file with a starting timestamp that falls inside the
# April-May data range, so the incremental pull has a boundary to filter on.
with open("last_extraction.txt", "w") as f:
    # end="" keeps the file free of a trailing newline, matching a plain write().
    print("2025-04-20 12:00:00", end="", file=f)


In [4]:
# INCREMENTAL EXTRACTION
# Pull only the rows whose check_out_time is strictly later than the checkpoint
# stored in last_extraction.txt.

# A missing checkpoint file means nothing has been extracted yet; treat it the
# same as a blank one instead of crashing with FileNotFoundError.
try:
    with open("last_extraction.txt", "r") as f:
        last_extraction = f.read().strip()
except FileNotFoundError:
    last_extraction = ""

df = pd.read_csv("hospital_visits_apr_may.csv", parse_dates=["check_out_time"])

last_extraction_time = pd.to_datetime(last_extraction) if last_extraction else pd.NaT
since_label = last_extraction
if pd.isna(last_extraction_time):
    # Comparing against NaT is False for every row, which would silently pull
    # nothing. With no usable checkpoint (missing, blank, or a stored "NaT"),
    # fall back to the earliest representable timestamp so all rows are pulled.
    last_extraction_time = pd.Timestamp.min
    since_label = "the beginning (no valid checkpoint found)"

df_incremental = df[df['check_out_time'] > last_extraction_time]
print(f"Pulled {len(df_incremental)} new/updated rows since {since_label}.")
df_incremental.head()

Pulled 2 new/updated rows since 2025-05-31T14:34:00.


Unnamed: 0,patient_id,hospital,visit_date,visit_reason,cost,check_out_time
445,70696,Sunrise Health,2025-05-31,COVID-19 Test,1203,2025-05-31 16:00:00
446,75131,Sunrise Health,2025-05-31,COVID-19 Test,1171,2025-05-31 18:16:00


In [5]:
# The largest check_out_time value in df is the candidate for the next checkpoint.
new_checkpoint = df.loc[:, 'check_out_time'].max()

In [6]:
# Overwrite the checkpoint file with the new timestamp in ISO-8601 form.
with open("last_extraction.txt", "w") as f:
    # end="" avoids appending a newline, so the file holds only the timestamp.
    print(new_checkpoint.isoformat(), end="", file=f)
print(f"Updated last_extraction.txt to {new_checkpoint}")


Updated last_extraction.txt to 2025-05-31 18:16:00
