# Data Wrangling for procedures.csv
This notebook processes the `procedures.csv` by performing the following steps:
- Inspecting and cleaning the data
- Handling missing values and inconsistencies
- Transforming the data (e.g., creating new features)
- Saving the processed data to a new file

In [1]:
import pandas as pd
import numpy as np
import yaml
from pprint import pprint
import os
import sys 
from IPython.display import display, JSON

# Add the directory containing utils.py to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../../utils")))

# import utils
from utils import func_rename_and_cast_columns


In [2]:
"""
This cell checks whether the file '../../data/processed_data/processed_patients.csv' exists.
If the file does not exist, it prints a message prompting the user to run 'etl/notebooks/etl_patients.ipynb'.
This is useful for ensuring that the necessary preprocessing step has been completed before running the script.
Exception handling is included to capture any unexpected errors.
"""
try:
    filepath_patients_csv = '../../data/processed_data/processed_patients.csv' # READ For join operations
    if not os.path.exists(filepath_patients_csv):
        print(f"{filepath_patients_csv} does not exist. Please run the etl/notebooks/etl_patients.ipynb first.")
        sys.exit()
    else:
        print(f"{filepath_patients_csv} exists. Proceed with the processing.")
except Exception as e:
    print(f"An error occurred: {e}")

../../data/processed_data/processed_patients.csv exists. Proceed with the processing.


In [3]:
# Load the dataset

filepath_csv = '../../data/raw_data/procedures.csv' # Read CSV 
filepath_output = '../../data/processed_data/processed_procedures.csv' # Write processed CSV
filepath_yaml = '../../config/procedures.yaml' # Read encounters.yaml, it is used to clean column names and apply relevant types to columns

# Load Dataframe
df = pd.read_csv(filepath_csv)
df_patients = pd.read_csv(filepath_patients_csv)

# Load YAML column mappings 
with open(filepath_yaml, "r") as file:
    dict_column_mappings = yaml.safe_load(file)
    
# Display initial dataset information
print('Initial Dataset Info:')
df.info()
df.head(5)

Initial Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17784 entries, 0 to 17783
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   START              17784 non-null  object 
 1   STOP               17784 non-null  object 
 2   PATIENT            17784 non-null  object 
 3   ENCOUNTER          17784 non-null  object 
 4   SYSTEM             17784 non-null  object 
 5   CODE               17784 non-null  int64  
 6   DESCRIPTION        17784 non-null  object 
 7   BASE_COST          17784 non-null  float64
 8   REASONCODE         9139 non-null   float64
 9   REASONDESCRIPTION  9139 non-null   object 
dtypes: float64(2), int64(1), object(7)
memory usage: 1.4+ MB


Unnamed: 0,START,STOP,PATIENT,ENCOUNTER,SYSTEM,CODE,DESCRIPTION,BASE_COST,REASONCODE,REASONDESCRIPTION
0,2015-09-28T09:04:48Z,2015-09-28T09:34:48Z,30a6452c-4297-a1ac-977a-6a23237c7b46,953c5138-ce17-4084-3432-1ac23f184528,http://snomed.info/sct,713021002,Plain X-ray of pelvis (procedure),431.4,,
1,2015-09-28T09:04:48Z,2015-09-28T11:02:48Z,30a6452c-4297-a1ac-977a-6a23237c7b46,953c5138-ce17-4084-3432-1ac23f184528,http://snomed.info/sct,305428000,Admission to orthopedic department (procedure),431.4,359817006.0,Closed fracture of hip (disorder)
2,2016-04-10T09:04:48Z,2016-04-10T09:19:48Z,30a6452c-4297-a1ac-977a-6a23237c7b46,0b03e41b-06a6-66fa-b972-acc5a83b134a,http://snomed.info/sct,430193006,Medication reconciliation (procedure),852.4,,
3,2016-04-10T09:04:48Z,2016-04-10T10:00:45Z,30a6452c-4297-a1ac-977a-6a23237c7b46,0b03e41b-06a6-66fa-b972-acc5a83b134a,http://snomed.info/sct,710824005,Assessment of health and social care needs (pr...,431.4,,
4,2016-04-10T10:00:45Z,2016-04-10T10:22:03Z,30a6452c-4297-a1ac-977a-6a23237c7b46,0b03e41b-06a6-66fa-b972-acc5a83b134a,http://snomed.info/sct,710841007,Assessment of anxiety (procedure),431.4,,


In [4]:
display(JSON(dict_column_mappings['columns']))
# func_rename_and_cast_columns(df, dict_column_mappings)

<IPython.core.display.JSON object>

# Data Cleaning and Data Quality Checks

1. Standardise column names
2. Apply relevant types
3. Fill null values with relevant values
4. Perform the logical testing (start_time < end_time)

In [5]:
# Standardise column names
df = func_rename_and_cast_columns(df, dict_column_mappings['columns'])

Renamed column: 'START' to 'start_time'.
Column 'start_time' cast to type 'datetimestamp'.
--------------------
Renamed column: 'STOP' to 'stop_time'.
Column 'stop_time' cast to type 'datetimestamp'.
--------------------
Renamed column: 'PATIENT' to 'patient_id'.
Column 'patient_id' cast to type 'str'.
--------------------
Renamed column: 'ENCOUNTER' to 'encounter_id'.
Column 'encounter_id' cast to type 'str'.
--------------------
Renamed column: 'SYSTEM' to 'system'.
Column 'system' cast to type 'str'.
--------------------
Renamed column: 'CODE' to 'snomed_code'.
Column 'snomed_code' cast to type 'str'.
--------------------
Renamed column: 'DESCRIPTION' to 'procedure_description'.
Column 'procedure_description' cast to type 'str'.
--------------------
Renamed column: 'BASE_COST' to 'base_cost'.
Column 'base_cost' cast to type 'numeric'.
--------------------
Renamed column: 'REASONCODE' to 'reason_code_for_procedure'.
Column 'reason_code_for_procedure' cast to type 'str'.
-------------

In [6]:
# Dispplay the dataframe
# df.head(5)
df.columns.tolist()

['start_time',
 'stop_time',
 'patient_id',
 'encounter_id',
 'system',
 'snomed_code',
 'procedure_description',
 'base_cost',
 'reason_code_for_procedure',
 'reason_description_for_procedure']

# Data Quality
1. Remove Duplicates
2. See distinct values

In [7]:
# Check for duplicate rows
print(f" (INEFFICIENT FOR LARGE DATAFRAMES) Total length of dataframe BEFORE removing duplicates: {len(df)}")
df = df.drop_duplicates()
print(f" (INEFFICIENT FOR LARGE DATAFRAMES) Total length of dataframe AFTER removing duplicates: {len(df)}")

# Validate categorical columns
categorical_columns = ['procedure_description', 'reason_description_for_procedure']
for col in categorical_columns:
    distinct_vals = df[col].unique() 
    tot_len = len(distinct_vals)
    if tot_len < 12:
        print(f'Unique values in {col}: {df[col].unique()}')
    else:
        print(f'Unique values too large for display: {col}: has total {tot_len} distinct values')



 (INEFFICIENT FOR LARGE DATAFRAMES) Total length of dataframe BEFORE removing duplicates: 17784
 (INEFFICIENT FOR LARGE DATAFRAMES) Total length of dataframe AFTER removing duplicates: 17784
Unique values too large for display: procedure_description: has total 225 distinct values
Unique values too large for display: reason_description_for_procedure: has total 60 distinct values


# Data Engineering
1. Do the logical test (start_time < stop_time)
2. Impute new columns like length of stay

In [8]:
# Logical Testing
# end_time > start_time
valid_dates = df[df['stop_time'] >= df['start_time']]
invalid_dates = df[df['stop_time'] < df['start_time']]
print(f"total entries in df: {len(df)}, valid_dates: {len(valid_dates)}, invalid_dates:{len(invalid_dates)}  ")

# Length of Stay in hours
df['length_of_procedure_in_hours'] = (df['stop_time'] - df['start_time']).dt.total_seconds() / 3600.0


total entries in df: 17784, valid_dates: 17784, invalid_dates:0  


In [9]:
df.head(5)

Unnamed: 0,start_time,stop_time,patient_id,encounter_id,system,snomed_code,procedure_description,base_cost,reason_code_for_procedure,reason_description_for_procedure,length_of_procedure_in_hours
0,2015-09-28 09:04:48,2015-09-28 09:34:48,30a6452c-4297-a1ac-977a-6a23237c7b46,953c5138-ce17-4084-3432-1ac23f184528,http://snomed.info/sct,713021002,Plain X-ray of pelvis (procedure),431.4,Unknown,Unknown,0.5
1,2015-09-28 09:04:48,2015-09-28 11:02:48,30a6452c-4297-a1ac-977a-6a23237c7b46,953c5138-ce17-4084-3432-1ac23f184528,http://snomed.info/sct,305428000,Admission to orthopedic department (procedure),431.4,359817006.0,Closed fracture of hip (disorder),1.966667
2,2016-04-10 09:04:48,2016-04-10 09:19:48,30a6452c-4297-a1ac-977a-6a23237c7b46,0b03e41b-06a6-66fa-b972-acc5a83b134a,http://snomed.info/sct,430193006,Medication reconciliation (procedure),852.4,Unknown,Unknown,0.25
3,2016-04-10 09:04:48,2016-04-10 10:00:45,30a6452c-4297-a1ac-977a-6a23237c7b46,0b03e41b-06a6-66fa-b972-acc5a83b134a,http://snomed.info/sct,710824005,Assessment of health and social care needs (pr...,431.4,Unknown,Unknown,0.9325
4,2016-04-10 10:00:45,2016-04-10 10:22:03,30a6452c-4297-a1ac-977a-6a23237c7b46,0b03e41b-06a6-66fa-b972-acc5a83b134a,http://snomed.info/sct,710841007,Assessment of anxiety (procedure),431.4,Unknown,Unknown,0.355


# Left Join with patients.csv to include:
1. Demographics of patient
2. Other relevant information of patient
3. Impute age of patient
4. Impute age category of patient

In [10]:
df_patients.head(5)

Unnamed: 0,id,birthdate,deathdate,ssn,drivers,passport,prefix,firstname,middlename,lastname,...,state,county,fips,zip,lat,lon,healthcare_expenses,healthcare_coverage,income,income_category
0,30a6452c-4297-a1ac-977a-6a23237c7b46,1994-02-06,,999-52-8591,S99996852,X47758697X,Mr.,Joshua658,Alvin56,Kunde533,...,Massachusetts,Norfolk County,25021.0,2184,42.211142,-71.045802,56904.96,18019.99,100511,high-income
1,34a4dcc4-35fb-6ad5-ab98-be285c586a4f,1968-08-06,2009-12-11,999-75-3953,S99993577,X28173268X,Mr.,Bennie663,Unknown,Ebert178,...,Massachusetts,Norfolk County,25021.0,2184,42.25542,-70.971016,124024.12,1075.06,49737,medium-income
2,7179458e-d6e3-c723-2530-d4acfe1c2668,2008-12-21,,999-70-1925,Unknown,Unknown,Unknown,Hunter736,Mckinley734,Gerlach374,...,Massachusetts,Plymouth County,Unknown,0,41.648292,-70.850619,45645.06,6154.94,133816,high-income
3,37c177ea-4398-fb7a-29fa-70eb3d673876,1994-01-27,,999-27-9779,S99995100,X83694889X,Mrs.,Carlyn477,Florencia449,Williamson769,...,Massachusetts,Plymouth County,Unknown,0,41.789096,-70.711616,12895.15,659951.61,17382,low-income
4,0fef2411-21f0-a269-82fb-c42b55471405,2019-07-27,,999-50-8977,Unknown,Unknown,Unknown,Robin66,Jeramy610,Gleichner915,...,Massachusetts,Essex County,Unknown,0,42.734183,-70.97641,18500.02,5493.57,52159,medium-income


In [11]:
# Left Join with Patients
patient_columns_needed = ['id', 'birthdate', 'marital', 'race', 'ethnicity', 'gender', 'income', 'income_category']
df_patients = df_patients[patient_columns_needed]
# df_patients.head(5)
print(f"len of df BEFORE left-join: {len(df)}")
df = pd.merge(df, df_patients, how='left', left_on='patient_id', right_on='id')

# remove 'id' from right df
df.drop(['id'], axis=1, inplace=True)

#results.head(5)
print(f"len of df AFTER left-join: {len(df)}")

# Impute Age of Patient
df['birthdate'] = pd.to_datetime(df['birthdate'], errors='coerce')
df['start_time'] = pd.to_datetime(df['start_time'], errors='coerce')
df['age_of_patient'] = (df['start_time'] - df['birthdate']).dt.days / 365.0

# Categorize Age
bins = [0, 17, 65, float("inf")]
labels = ["children", "adult", "senior"]

# Create a new column for income category
df["age_category"] = pd.cut(df["age_of_patient"], bins=bins, labels=labels, right=False)
# df.dtypes
# df.head(5)

len of df BEFORE left-join: 17784
len of df AFTER left-join: 17784


In [12]:
df.head(5)

Unnamed: 0,start_time,stop_time,patient_id,encounter_id,system,snomed_code,procedure_description,base_cost,reason_code_for_procedure,reason_description_for_procedure,length_of_procedure_in_hours,birthdate,marital,race,ethnicity,gender,income,income_category,age_of_patient,age_category
0,2015-09-28 09:04:48,2015-09-28 09:34:48,30a6452c-4297-a1ac-977a-6a23237c7b46,953c5138-ce17-4084-3432-1ac23f184528,http://snomed.info/sct,713021002,Plain X-ray of pelvis (procedure),431.4,Unknown,Unknown,0.5,1994-02-06,M,white,nonhispanic,M,100511,high-income,21.654795,adult
1,2015-09-28 09:04:48,2015-09-28 11:02:48,30a6452c-4297-a1ac-977a-6a23237c7b46,953c5138-ce17-4084-3432-1ac23f184528,http://snomed.info/sct,305428000,Admission to orthopedic department (procedure),431.4,359817006.0,Closed fracture of hip (disorder),1.966667,1994-02-06,M,white,nonhispanic,M,100511,high-income,21.654795,adult
2,2016-04-10 09:04:48,2016-04-10 09:19:48,30a6452c-4297-a1ac-977a-6a23237c7b46,0b03e41b-06a6-66fa-b972-acc5a83b134a,http://snomed.info/sct,430193006,Medication reconciliation (procedure),852.4,Unknown,Unknown,0.25,1994-02-06,M,white,nonhispanic,M,100511,high-income,22.189041,adult
3,2016-04-10 09:04:48,2016-04-10 10:00:45,30a6452c-4297-a1ac-977a-6a23237c7b46,0b03e41b-06a6-66fa-b972-acc5a83b134a,http://snomed.info/sct,710824005,Assessment of health and social care needs (pr...,431.4,Unknown,Unknown,0.9325,1994-02-06,M,white,nonhispanic,M,100511,high-income,22.189041,adult
4,2016-04-10 10:00:45,2016-04-10 10:22:03,30a6452c-4297-a1ac-977a-6a23237c7b46,0b03e41b-06a6-66fa-b972-acc5a83b134a,http://snomed.info/sct,710841007,Assessment of anxiety (procedure),431.4,Unknown,Unknown,0.355,1994-02-06,M,white,nonhispanic,M,100511,high-income,22.189041,adult


In [13]:
# Save the cleaned and transformed dataset
df.to_csv(filepath_output, index=False)
print(f'Processed data saved to {filepath_output}')

Processed data saved to ../../data/processed_data/processed_procedures.csv
