In [None]:
# Goal: Start with the CSV, and end with a dict of Patients and events

In [None]:
!pip install pandas
!pip install femr==0.1.0 # Since this is pip-installable now, we can fix tutorials to a specific version

In [3]:
import pandas as pd

input_df = pd.read_csv("example.csv")
input_df.head()

Unnamed: 0,patient_id,start,code,value,units,dosage
0,3,1970-01-07,Birth/Birth,,,
1,3,1990-01-07,Gender/Gender,Female,,
2,3,1990-01-07,Race/Race,White,,
3,3,2022-05-03,ICD10CM/E11.4,,,
4,3,2022-06-05,ICD10CM/E10.1,,,


In [14]:
from femr import Patient, Event
from datetime import datetime
# Ok. This is the "ETL"

def transform_fn(source: pd.DataFrame, patient_id: int) -> Patient:
    events = []
    # "Query" the dataset for events related to the patient
    events_df = source[source["patient_id"] == patient_id]
    # For each row, create an Event object
    for _, row in events_df.iterrows():
        events.append(
            # More code for null handling, though this is the rough idea.
            # There are ways to abstract this based on data handling decisions.
            Event(
                datetime.strptime(row["start"], "%Y-%M-%d"), 
                row["code"], 
                row["value"])
            )
    # Return a Patient object with the Events
    return Patient(patient_id, events)

transform_fn(input_df, 3)

Patient(patient_id=3, events=[Event(start=1970-01-07 00:01:00, code=Birth/Birth, value=nan), Event(start=1990-01-07 00:01:00, code=Gender/Gender, value=Female), Event(start=1990-01-07 00:01:00, code=Race/Race, value=White), Event(start=2022-01-03 00:05:00, code=ICD10CM/E11.4, value=nan), Event(start=2022-01-05 00:06:00, code=ICD10CM/E10.1, value=nan), Event(start=2020-01-09 00:07:00, code=Vitals/Blood Pressure, value=160), Event(start=2020-01-09 00:08:00, code=Vitals/HbA1c, value=7), Event(start=2022-01-05 00:06:00, code=Drug/Atorvastatin, value=nan), Event(start=2022-01-06 00:07:00, code=Drug/Multivitamins, value=nan), Event(start=2022-01-05 00:06:00, code=Note/ProgressNote, value=Patient Bob came to the clinic today), Event(start=2022-01-06 00:06:00, code=Note/ProgressNote, value=Complicated notes generally need escaping , " 
 example)])

In [15]:
# If we had multiple patient IDs, we can do this:
patient_ids = input_df["patient_id"].unique()
patients = [transform_fn(input_df, pid) for pid in patient_ids]
patients

[Patient(patient_id=3, events=[Event(start=1970-01-07 00:01:00, code=Birth/Birth, value=nan), Event(start=1990-01-07 00:01:00, code=Gender/Gender, value=Female), Event(start=1990-01-07 00:01:00, code=Race/Race, value=White), Event(start=2022-01-03 00:05:00, code=ICD10CM/E11.4, value=nan), Event(start=2022-01-05 00:06:00, code=ICD10CM/E10.1, value=nan), Event(start=2020-01-09 00:07:00, code=Vitals/Blood Pressure, value=160), Event(start=2020-01-09 00:08:00, code=Vitals/HbA1c, value=7), Event(start=2022-01-05 00:06:00, code=Drug/Atorvastatin, value=nan), Event(start=2022-01-06 00:07:00, code=Drug/Multivitamins, value=nan), Event(start=2022-01-05 00:06:00, code=Note/ProgressNote, value=Patient Bob came to the clinic today), Event(start=2022-01-06 00:06:00, code=Note/ProgressNote, value=Complicated notes generally need escaping , " 
  example)])]