# Data Wrangling for encounters.csv
This notebook processes the `encounters.csv` by performing the following steps:
- Inspecting and cleaning the data
- Handling missing values and inconsistencies
- Transforming the data (e.g., creating new features)
- Saving the processed data to a new file

In [1]:
import pandas as pd
import numpy as np
import yaml
from pprint import pprint
import os
import sys 
from IPython.display import display, JSON

# Add the directory containing utils.py to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../../utils")))

# import utils
from utils import func_rename_and_cast_columns


In [2]:
"""
This cell checks whether the file '../../data/processed_data/processed_patients.csv' exists.
If the file does not exist, it prints a message prompting the user to run 'etl/notebooks/etl_patients.ipynb'.
This is useful for ensuring that the necessary preprocessing step has been completed before running the script.
Exception handling is included to capture any unexpected errors.
"""
try:
    filepath_patients_csv = '../../data/processed_data/processed_patients.csv' # READ For join operations
    if not os.path.exists(filepath_patients_csv):
        print(f"{filepath_patients_csv} does not exist. Please run the etl/notebooks/etl_patients.ipynb first.")
        sys.exit()
    else:
        print(f"{filepath_patients_csv} exists. Proceed with the processing.")
except Exception as e:
    print(f"An error occurred: {e}")

../../data/processed_data/processed_patients.csv exists. Proceed with the processing.


In [3]:
# Load the dataset

filepath_csv = '../../data/raw_data/encounters.csv' # Read CSV 
filepath_output = '../../data/processed_data/processed_encounters.csv' # Write processed CSV
filepath_yaml = '../../config/encounters.yaml' # Read encounters.yaml, it is used to clean column names and apply relevant types to columns


# Load Dataframe
df = pd.read_csv(filepath_csv)
df_patients = pd.read_csv(filepath_patients_csv)

# Load YAML column mappings 
with open(filepath_yaml, "r") as file:
    dict_column_mappings = yaml.safe_load(file)
    
# Display initial dataset information
print('Initial Dataset Info:')
df.info()
df.head(5)

Initial Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7049 entries, 0 to 7048
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Id                   7049 non-null   object 
 1   START                7049 non-null   object 
 2   STOP                 7049 non-null   object 
 3   PATIENT              7049 non-null   object 
 4   ORGANIZATION         7049 non-null   object 
 5   PROVIDER             7049 non-null   object 
 6   PAYER                7049 non-null   object 
 7   ENCOUNTERCLASS       7049 non-null   object 
 8   CODE                 7049 non-null   int64  
 9   DESCRIPTION          7049 non-null   object 
 10  BASE_ENCOUNTER_COST  7049 non-null   float64
 11  TOTAL_CLAIM_COST     7049 non-null   float64
 12  PAYER_COVERAGE       7049 non-null   float64
 13  REASONCODE           4613 non-null   float64
 14  REASONDESCRIPTION    4613 non-null   object 
dtypes: float64(4), i

Unnamed: 0,Id,START,STOP,PATIENT,ORGANIZATION,PROVIDER,PAYER,ENCOUNTERCLASS,CODE,DESCRIPTION,BASE_ENCOUNTER_COST,TOTAL_CLAIM_COST,PAYER_COVERAGE,REASONCODE,REASONDESCRIPTION
0,294d0dab-907e-8fce-7a47-0c0d322a5734,2012-04-01T09:04:48Z,2012-04-01T10:02:47Z,30a6452c-4297-a1ac-977a-6a23237c7b46,f2068cee-c75c-321d-9b2c-c33535db89c9,c3d07214-c20f-3f33-ad41-0e55adf5b024,d31fccc3-1767-390d-966a-22a5156f4219,wellness,162673000,General examination of patient (procedure),136.8,1567.0,87.2,,
1,2ccec874-cbaa-e280-7abb-f2bc2b603961,2013-04-07T09:04:48Z,2013-04-07T09:55:49Z,30a6452c-4297-a1ac-977a-6a23237c7b46,f2068cee-c75c-321d-9b2c-c33535db89c9,c3d07214-c20f-3f33-ad41-0e55adf5b024,d31fccc3-1767-390d-966a-22a5156f4219,wellness,162673000,General examination of patient (procedure),136.8,704.2,0.0,,
2,953c5138-ce17-4084-3432-1ac23f184528,2015-09-28T09:04:48Z,2015-09-28T11:02:48Z,30a6452c-4297-a1ac-977a-6a23237c7b46,db106514-f254-3402-b6a4-6d210c78c7e2,2c4b7d17-0ded-3e16-b5eb-6dda1d6a81bb,d31fccc3-1767-390d-966a-22a5156f4219,emergency,50849002,Emergency room admission (procedure),146.18,1008.98,0.0,125605004.0,Fracture of bone (disorder)
3,17dd3b88-0b85-2b6f-c342-c9d6cf5315cb,2015-10-31T11:02:48Z,2015-10-31T11:17:48Z,30a6452c-4297-a1ac-977a-6a23237c7b46,f8918a95-31e8-3ac4-8d12-29ca6080ebda,b4d9fbc9-fdca-369d-bbba-019479923f08,d31fccc3-1767-390d-966a-22a5156f4219,ambulatory,185349003,Encounter for check up (procedure),85.55,85.55,3.95,359817006.0,Closed fracture of hip (disorder)
4,0b03e41b-06a6-66fa-b972-acc5a83b134a,2016-04-10T09:04:48Z,2016-04-10T10:00:45Z,30a6452c-4297-a1ac-977a-6a23237c7b46,f2068cee-c75c-321d-9b2c-c33535db89c9,c3d07214-c20f-3f33-ad41-0e55adf5b024,d31fccc3-1767-390d-966a-22a5156f4219,wellness,162673000,General examination of patient (procedure),136.8,2039.18,464.94,,


# Data Cleaning and Data Quality Checks

1. Standardise column names
2. Apply relevant types
3. Fill null values with relevant values
4. Perform the logical testing (start_time < end_time)

In [4]:
display(JSON(dict_column_mappings['columns']))

<IPython.core.display.JSON object>

In [5]:
# Standardise column names and clean the data
df = func_rename_and_cast_columns(df, dict_column_mappings['columns'])

Renamed column: 'Id' to 'encounter_id'.
Column 'encounter_id' cast to type 'str'.
--------------------
Renamed column: 'START' to 'start_time'.
Column 'start_time' cast to type 'datetimestamp'.
--------------------
Renamed column: 'STOP' to 'stop_time'.
Column 'stop_time' cast to type 'datetimestamp'.
--------------------
Renamed column: 'PATIENT' to 'patient_id'.
Column 'patient_id' cast to type 'str'.
--------------------
Renamed column: 'ORGANIZATION' to 'organization_id'.
Column 'organization_id' cast to type 'str'.
--------------------
Renamed column: 'PROVIDER' to 'provider_id'.
Column 'provider_id' cast to type 'str'.
--------------------
Renamed column: 'PAYER' to 'payer_id'.
Column 'payer_id' cast to type 'str'.
--------------------
Renamed column: 'ENCOUNTERCLASS' to 'encounter_class'.
Column 'encounter_class' cast to type 'str'.
--------------------
Renamed column: 'CODE' to 'code'.
Column 'code' cast to type 'str'.
--------------------
Renamed column: 'DESCRIPTION' to 'desc

In [6]:
# Dispplay the dataframe
df.head(5)

Unnamed: 0,encounter_id,start_time,stop_time,patient_id,organization_id,provider_id,payer_id,encounter_class,code,description,base_encounter_cost,total_claim_cost,payer_coverage,reason_code,reason_description
0,294d0dab-907e-8fce-7a47-0c0d322a5734,2012-04-01 09:04:48,2012-04-01 10:02:47,30a6452c-4297-a1ac-977a-6a23237c7b46,f2068cee-c75c-321d-9b2c-c33535db89c9,c3d07214-c20f-3f33-ad41-0e55adf5b024,d31fccc3-1767-390d-966a-22a5156f4219,wellness,162673000,General examination of patient (procedure),136.8,1567.0,87.2,Unknown,Unknown
1,2ccec874-cbaa-e280-7abb-f2bc2b603961,2013-04-07 09:04:48,2013-04-07 09:55:49,30a6452c-4297-a1ac-977a-6a23237c7b46,f2068cee-c75c-321d-9b2c-c33535db89c9,c3d07214-c20f-3f33-ad41-0e55adf5b024,d31fccc3-1767-390d-966a-22a5156f4219,wellness,162673000,General examination of patient (procedure),136.8,704.2,0.0,Unknown,Unknown
2,953c5138-ce17-4084-3432-1ac23f184528,2015-09-28 09:04:48,2015-09-28 11:02:48,30a6452c-4297-a1ac-977a-6a23237c7b46,db106514-f254-3402-b6a4-6d210c78c7e2,2c4b7d17-0ded-3e16-b5eb-6dda1d6a81bb,d31fccc3-1767-390d-966a-22a5156f4219,emergency,50849002,Emergency room admission (procedure),146.18,1008.98,0.0,125605004.0,Fracture of bone (disorder)
3,17dd3b88-0b85-2b6f-c342-c9d6cf5315cb,2015-10-31 11:02:48,2015-10-31 11:17:48,30a6452c-4297-a1ac-977a-6a23237c7b46,f8918a95-31e8-3ac4-8d12-29ca6080ebda,b4d9fbc9-fdca-369d-bbba-019479923f08,d31fccc3-1767-390d-966a-22a5156f4219,ambulatory,185349003,Encounter for check up (procedure),85.55,85.55,3.95,359817006.0,Closed fracture of hip (disorder)
4,0b03e41b-06a6-66fa-b972-acc5a83b134a,2016-04-10 09:04:48,2016-04-10 10:00:45,30a6452c-4297-a1ac-977a-6a23237c7b46,f2068cee-c75c-321d-9b2c-c33535db89c9,c3d07214-c20f-3f33-ad41-0e55adf5b024,d31fccc3-1767-390d-966a-22a5156f4219,wellness,162673000,General examination of patient (procedure),136.8,2039.18,464.94,Unknown,Unknown


# Data Quality
1. Remove Duplicates
2. See distinct values

In [7]:
# Check for duplicate rows
print(f" (INEFFICIENT FOR LARGE DATAFRAMES) Total length of dataframe BEFORE removing duplicates: {len(df)}")
df = df.drop_duplicates()
print(f" (INEFFICIENT FOR LARGE DATAFRAMES) Total length of dataframe AFTER removing duplicates: {len(df)}")

# Validate categorical columns
categorical_columns = ['encounter_class', 'description', 'reason_description']
for col in categorical_columns:
    distinct_vals = df[col].unique() 
    tot_len = len(distinct_vals)
    if tot_len < 12:
        print(f'Unique values in {col}: {df[col].unique()}')
    else:
        print(f'Unique values too large for display: {col}: has total {tot_len} distinct values')

 (INEFFICIENT FOR LARGE DATAFRAMES) Total length of dataframe BEFORE removing duplicates: 7049
 (INEFFICIENT FOR LARGE DATAFRAMES) Total length of dataframe AFTER removing duplicates: 7049
Unique values in encounter_class: ['wellness' 'emergency' 'ambulatory' 'outpatient' 'inpatient' 'urgentcare'
 'hospice' 'snf' 'virtual' 'home']
Unique values too large for display: description: has total 45 distinct values
Unique values too large for display: reason_description: has total 106 distinct values


# Data Engineering
1. Do the logical test (start_time < stop_time)
2. Impute new columns like length of stay

In [8]:
# Logical Testing
valid_dates = df[df['stop_time'] >= df['start_time']]
invalid_dates = df[df['stop_time'] < df['start_time']]
print(f"total entries in df: {len(df)}, valid_dates: {len(valid_dates)}, invalid_dates:{len(invalid_dates)}  ")

# Length of Stay in hours
df['length_of_stay_hours'] = (df['stop_time'] - df['start_time']).dt.total_seconds() / 3600.0

total entries in df: 7049, valid_dates: 7049, invalid_dates:0  


# Left Join with patients.csv to include:
1. Demographics of patient
2. Other relevant information of patient
3. Impute age of patient
4. Impute age category of patient

In [9]:
# Left Join with Patients
patient_columns_needed = ['id', 'birthdate', 'marital', 'race', 'ethnicity', 'gender', 'income', 'income_category']
df_patients = df_patients[patient_columns_needed]
# df_patients.head(5)
print(f"len of df BEFORE left-join: {len(df)}")
df = pd.merge(df, df_patients, how='left', left_on='patient_id', right_on='id')

# remove 'id' from right df
df.drop(['id'], axis=1, inplace=True)

#results.head(5)
print(f"len of df AFTER left-join: {len(df)}")

# Impute Age of Patient
df['birthdate'] = pd.to_datetime(df['birthdate'], errors='coerce')
df['start_time'] = pd.to_datetime(df['start_time'], errors='coerce')
df['age_of_patient'] = (df['start_time'] - df['birthdate']).dt.days / 365.0

# Categorize Age
bins = [0, 17, 65, float("inf")]
labels = ["children", "adult", "senior"]

# Create a new column for income category
df["age_category"] = pd.cut(df["age_of_patient"], bins=bins, labels=labels, right=False)

len of df BEFORE left-join: 7049
len of df AFTER left-join: 7049


In [10]:
df.head(5)

Unnamed: 0,encounter_id,start_time,stop_time,patient_id,organization_id,provider_id,payer_id,encounter_class,code,description,...,length_of_stay_hours,birthdate,marital,race,ethnicity,gender,income,income_category,age_of_patient,age_category
0,294d0dab-907e-8fce-7a47-0c0d322a5734,2012-04-01 09:04:48,2012-04-01 10:02:47,30a6452c-4297-a1ac-977a-6a23237c7b46,f2068cee-c75c-321d-9b2c-c33535db89c9,c3d07214-c20f-3f33-ad41-0e55adf5b024,d31fccc3-1767-390d-966a-22a5156f4219,wellness,162673000,General examination of patient (procedure),...,0.966389,1994-02-06,M,white,nonhispanic,M,100511,high-income,18.161644,adult
1,2ccec874-cbaa-e280-7abb-f2bc2b603961,2013-04-07 09:04:48,2013-04-07 09:55:49,30a6452c-4297-a1ac-977a-6a23237c7b46,f2068cee-c75c-321d-9b2c-c33535db89c9,c3d07214-c20f-3f33-ad41-0e55adf5b024,d31fccc3-1767-390d-966a-22a5156f4219,wellness,162673000,General examination of patient (procedure),...,0.850278,1994-02-06,M,white,nonhispanic,M,100511,high-income,19.178082,adult
2,953c5138-ce17-4084-3432-1ac23f184528,2015-09-28 09:04:48,2015-09-28 11:02:48,30a6452c-4297-a1ac-977a-6a23237c7b46,db106514-f254-3402-b6a4-6d210c78c7e2,2c4b7d17-0ded-3e16-b5eb-6dda1d6a81bb,d31fccc3-1767-390d-966a-22a5156f4219,emergency,50849002,Emergency room admission (procedure),...,1.966667,1994-02-06,M,white,nonhispanic,M,100511,high-income,21.654795,adult
3,17dd3b88-0b85-2b6f-c342-c9d6cf5315cb,2015-10-31 11:02:48,2015-10-31 11:17:48,30a6452c-4297-a1ac-977a-6a23237c7b46,f8918a95-31e8-3ac4-8d12-29ca6080ebda,b4d9fbc9-fdca-369d-bbba-019479923f08,d31fccc3-1767-390d-966a-22a5156f4219,ambulatory,185349003,Encounter for check up (procedure),...,0.25,1994-02-06,M,white,nonhispanic,M,100511,high-income,21.745205,adult
4,0b03e41b-06a6-66fa-b972-acc5a83b134a,2016-04-10 09:04:48,2016-04-10 10:00:45,30a6452c-4297-a1ac-977a-6a23237c7b46,f2068cee-c75c-321d-9b2c-c33535db89c9,c3d07214-c20f-3f33-ad41-0e55adf5b024,d31fccc3-1767-390d-966a-22a5156f4219,wellness,162673000,General examination of patient (procedure),...,0.9325,1994-02-06,M,white,nonhispanic,M,100511,high-income,22.189041,adult


In [11]:
# Save the cleaned and transformed dataset
df.to_csv(filepath_output, index=False)
print(f'Processed data saved to {filepath_output}')

Processed data saved to ../../data/processed_data/processed_encounters.csv
