In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Define hospital and visit reason options
hospitals = ['City Hospital', 'Green Valley Clinic', 'Sunrise Health', 'Oakwood Medical Center']
visit_reasons = ['Flu', 'Routine Checkup', 'Injury', 'Allergy', 'Surgery Consultation', 'COVID-19 Test']

# Seed the RNG so that Restart & Run All regenerates exactly the same synthetic
# dataset (and therefore the same CSV) on every run.
SEED = 42
random.seed(SEED)

# Create date range from April 1 to May 31, 2025 (one entry per calendar day)
date_range = pd.date_range(start='2025-04-01', end='2025-05-31')
data = []

for date in date_range:
    for _ in range(random.randint(5, 10)):  # 5–10 visits per day
        # Arrival time: midnight of the visit date plus 7–17 hours and 0–59 minutes.
        visit_time = datetime.combine(date, datetime.min.time()) + timedelta(
            hours=random.randint(7, 17), minutes=random.randint(0, 59)
        )
        # Check-out happens 15–120 minutes after arrival.
        check_out_time = visit_time + timedelta(minutes=random.randint(15, 120))
        # Only the check-out timestamp is stored; visit_time itself is not a column.
        data.append({
            'patient_id': random.randint(50000, 99999),
            'hospital': random.choice(hospitals),
            'visit_date': date.date().isoformat(),
            'visit_reason': random.choice(visit_reasons),
            'cost': random.randint(50, 1500),
            'check_out_time': check_out_time.isoformat()
        })

# Save dataset to CSV and preview
# NOTE(review): the extraction cells further down read "hospital_visits.csv",
# not the file written here — confirm which file name is intended.
df = pd.DataFrame(data)
df.to_csv('hospital_visits_apr_may.csv', index=False)
df.head(10)  # a short preview is enough; avoid dumping 60 rows into the notebook


Unnamed: 0,patient_id,hospital,visit_date,visit_reason,cost,check_out_time
0,53246,Green Valley Clinic,2025-04-01,Surgery Consultation,1123,2025-04-01T11:51:00
1,90424,Oakwood Medical Center,2025-04-01,Routine Checkup,1208,2025-04-01T09:47:00
2,88606,Sunrise Health,2025-04-01,COVID-19 Test,342,2025-04-01T18:09:00
3,87403,Oakwood Medical Center,2025-04-01,COVID-19 Test,1274,2025-04-01T17:23:00
4,88520,Oakwood Medical Center,2025-04-01,Allergy,199,2025-04-01T16:57:00
5,87261,Green Valley Clinic,2025-04-01,Injury,707,2025-04-01T14:34:00
6,77600,Green Valley Clinic,2025-04-01,Flu,621,2025-04-01T10:45:00
7,83339,Oakwood Medical Center,2025-04-01,Surgery Consultation,830,2025-04-01T13:44:00
8,84973,Sunrise Health,2025-04-01,Flu,787,2025-04-01T11:39:00
9,82154,Oakwood Medical Center,2025-04-01,Allergy,296,2025-04-01T11:37:00


In [8]:
 # FULL EXTRACTION
# Load every row of the visits file in one pass, parsing check_out_time as datetimes.
df_full = pd.read_csv(
    "hospital_visits.csv",
    parse_dates=["check_out_time"],
)
print(f"Pulled {df_full.shape[0]} rows via full extraction.")
df_full.head(5)


Pulled 453 rows via full extraction.


Unnamed: 0,patient_id,hospital,visit_date,visit_reason,cost,check_out_time
0,60542,Sunrise Health,2025-04-02,Surgery Consultation,332,2025-04-02 10:00:00
1,72600,Oakwood Medical Center,2025-04-02,Injury,194,2025-04-02 14:01:00
2,75961,Oakwood Medical Center,2025-04-02,Injury,1076,2025-04-02 15:38:00
3,96648,Green Valley Clinic,2025-04-02,Routine Checkup,1468,2025-04-02 11:09:00
4,81979,Green Valley Clinic,2025-04-02,Allergy,64,2025-04-02 10:14:00


In [9]:
# Seed the checkpoint file with a starting timestamp that lies inside the data
# range, so the incremental extraction has a cut-off to compare against.
with open("last_extraction.txt", mode="w") as checkpoint_file:
    checkpoint_file.write("2025-04-20 12:00:00")


In [14]:
# INCREMENTAL EXTRACTION
# Pull only the rows whose check_out_time is strictly later than the stored checkpoint.

try:
    with open("last_extraction.txt", "r") as f:
        last_extraction = f.read().strip()
except FileNotFoundError:
    # No checkpoint file yet (first run): treat it as "nothing extracted so far".
    last_extraction = ""

df = pd.read_csv("hospital_visits.csv", parse_dates=["check_out_time"])
# An empty checkpoint is mapped to NaT explicitly; a stored "NaT" string parses to NaT too.
last_extraction_time = pd.to_datetime(last_extraction) if last_extraction else pd.NaT

if pd.isna(last_extraction_time):
    # Comparing against NaT is False for every row, which would silently yield
    # zero rows. With no usable checkpoint, fall back to pulling everything.
    df_incremental = df.copy()
    print(f"No valid checkpoint found; pulled all {len(df_incremental)} rows.")
else:
    df_incremental = df[df['check_out_time'] > last_extraction_time]
    print(f"Pulled {len(df_incremental)} new/updated rows since {last_extraction}.")
df_incremental.head()

Pulled 0 new/updated rows since 2025-05-31 19:42:00.


Unnamed: 0,patient_id,hospital,visit_date,visit_reason,cost,check_out_time


In [19]:
# The latest check_out_time present in df is the candidate for the next checkpoint.
new_checkpoint = df.loc[:, 'check_out_time'].max()

In [18]:
# Save it
# Persist the checkpoint so the next incremental run only pulls later rows.
if pd.isna(new_checkpoint):
    # max() over an empty or all-null column is NaT. Writing that to the file
    # would make every later "check_out_time > checkpoint" comparison False,
    # so the existing checkpoint is kept instead.
    print("No valid check_out_time found; last_extraction.txt left unchanged.")
else:
    with open("last_extraction.txt", "w") as f:
        f.write(new_checkpoint.isoformat())
    print(f"Updated last_extraction.txt to {new_checkpoint}")


Updated last_extraction.txt to 2025-05-31 14:34:00
