# Data Wrangling for patients.csv
This notebook processes `patients.csv` by performing the following steps:
- Inspecting and cleaning the data
- Handling missing values and inconsistencies
- Transforming the data (e.g., creating new features)
- Saving the processed data to a new file

In [1]:
import pandas as pd
import numpy as np
import yaml
from pprint import pprint
import os
import sys 
from IPython.display import display, JSON

# Make the directory containing utils.py importable. The path is resolved
# relative to the current working directory, and the membership check keeps
# sys.path from collecting a duplicate entry every time this cell is re-run.
utils_dir = os.path.abspath(os.path.join(os.getcwd(), "../../utils"))
if utils_dir not in sys.path:
    sys.path.append(utils_dir)

# import utils
from utils import func_rename_and_cast_columns

In [2]:
# Input/output locations, relative to the notebook's working directory
filepath_csv = '../../data/raw_data/patients.csv'  # raw patients CSV (input)
output_path = '../../data/processed_data/processed_patients.csv'  # processed CSV (output)
filepath_yaml = '../../config/patients.yaml'  # patients.yaml: used to clean column names and apply relevant types

# Load the raw patients table
df_patients = pd.read_csv(filepath_csv)

# Load the YAML column mappings. The encoding is explicit so decoding does not
# depend on the platform's locale default.
with open(filepath_yaml, "r", encoding="utf-8") as file:
    dict_column_mappings = yaml.safe_load(file)

# Fail fast with a clear message: a later cell indexes the 'columns' key, and
# an empty YAML file would otherwise surface there as an opaque error.
if not isinstance(dict_column_mappings, dict) or 'columns' not in dict_column_mappings:
    raise ValueError(f"{filepath_yaml} must define a top-level 'columns' mapping")

# Display initial dataset information
print('Initial Dataset Info:')
df_patients.info()
df_patients.head(5)

Initial Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106 entries, 0 to 105
Data columns (total 28 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Id                   106 non-null    object 
 1   BIRTHDATE            106 non-null    object 
 2   DEATHDATE            6 non-null      object 
 3   SSN                  106 non-null    object 
 4   DRIVERS              84 non-null     object 
 5   PASSPORT             75 non-null     object 
 6   PREFIX               79 non-null     object 
 7   FIRST                106 non-null    object 
 8   MIDDLE               89 non-null     object 
 9   LAST                 106 non-null    object 
 10  SUFFIX               0 non-null      float64
 11  MAIDEN               28 non-null     object 
 12  MARITAL              64 non-null     object 
 13  RACE                 106 non-null    object 
 14  ETHNICITY            106 non-null    object 
 15  GENDER            

Unnamed: 0,Id,BIRTHDATE,DEATHDATE,SSN,DRIVERS,PASSPORT,PREFIX,FIRST,MIDDLE,LAST,...,CITY,STATE,COUNTY,FIPS,ZIP,LAT,LON,HEALTHCARE_EXPENSES,HEALTHCARE_COVERAGE,INCOME
0,30a6452c-4297-a1ac-977a-6a23237c7b46,1994-02-06,,999-52-8591,S99996852,X47758697X,Mr.,Joshua658,Alvin56,Kunde533,...,Braintree,Massachusetts,Norfolk County,25021.0,2184,42.211142,-71.045802,56904.96,18019.99,100511
1,34a4dcc4-35fb-6ad5-ab98-be285c586a4f,1968-08-06,2009-12-11,999-75-3953,S99993577,X28173268X,Mr.,Bennie663,,Ebert178,...,Braintree,Massachusetts,Norfolk County,25021.0,2184,42.25542,-70.971016,124024.12,1075.06,49737
2,7179458e-d6e3-c723-2530-d4acfe1c2668,2008-12-21,,999-70-1925,,,,Hunter736,Mckinley734,Gerlach374,...,Mattapoisett,Massachusetts,Plymouth County,,0,41.648292,-70.850619,45645.06,6154.94,133816
3,37c177ea-4398-fb7a-29fa-70eb3d673876,1994-01-27,,999-27-9779,S99995100,X83694889X,Mrs.,Carlyn477,Florencia449,Williamson769,...,Wareham,Massachusetts,Plymouth County,,0,41.789096,-70.711616,12895.15,659951.61,17382
4,0fef2411-21f0-a269-82fb-c42b55471405,2019-07-27,,999-50-8977,,,,Robin66,Jeramy610,Gleichner915,...,Groveland,Massachusetts,Essex County,,0,42.734183,-70.97641,18500.02,5493.57,52159


In [3]:
# Show the loaded column mappings through IPython's JSON display object
# (plain-text alternative: pprint(dict_column_mappings)).
display(JSON(dict_column_mappings))

<IPython.core.display.JSON object>

# Data Cleaning and Data Quality Checks

1. Standardise column names
2. Apply relevant types
3. Fill null values with relevant values
4. Perform logical consistency checks (for patients: `birthdate` must not be later than `deathdate`)

In [4]:
# Apply the renames and type casts listed under the 'columns' key of the YAML
# config, using the project helper imported from utils.
# NOTE(review): the result replaces the raw frame under the same name; how the
# helper treats already-renamed columns is not visible here, so re-run from the
# load cell rather than re-running this cell on its own.
df_patients = func_rename_and_cast_columns(
    df_patients,
    dict_column_mappings['columns'],
)

# Preview the first five rows (head() defaults to 5)
df_patients.head()

Renamed column: 'Id' to 'id'.
Column 'id' cast to type 'str'.
--------------------
Renamed column: 'BIRTHDATE' to 'birthdate'.
Column 'birthdate' cast to type 'datetimestamp'.
--------------------
Renamed column: 'DEATHDATE' to 'deathdate'.
Column 'deathdate' cast to type 'datetimestamp'.
--------------------
Renamed column: 'SSN' to 'ssn'.
Column 'ssn' cast to type 'str'.
--------------------
Renamed column: 'DRIVERS' to 'drivers'.
Column 'drivers' cast to type 'str'.
--------------------
Renamed column: 'PASSPORT' to 'passport'.
Column 'passport' cast to type 'str'.
--------------------
Renamed column: 'PREFIX' to 'prefix'.
Column 'prefix' cast to type 'str'.
--------------------
Renamed column: 'FIRST' to 'firstname'.
Column 'firstname' cast to type 'str'.
--------------------
Renamed column: 'MIDDLE' to 'middlename'.
Column 'middlename' cast to type 'str'.
--------------------
Renamed column: 'LAST' to 'lastname'.
Column 'lastname' cast to type 'str'.
--------------------
Renamed c

Unnamed: 0,id,birthdate,deathdate,ssn,drivers,passport,prefix,firstname,middlename,lastname,...,city,state,county,fips,zip,lat,lon,healthcare_expenses,healthcare_coverage,income
0,30a6452c-4297-a1ac-977a-6a23237c7b46,1994-02-06,NaT,999-52-8591,S99996852,X47758697X,Mr.,Joshua658,Alvin56,Kunde533,...,Braintree,Massachusetts,Norfolk County,25021.0,2184,42.211142,-71.045802,56904.96,18019.99,100511
1,34a4dcc4-35fb-6ad5-ab98-be285c586a4f,1968-08-06,2009-12-11,999-75-3953,S99993577,X28173268X,Mr.,Bennie663,Unknown,Ebert178,...,Braintree,Massachusetts,Norfolk County,25021.0,2184,42.25542,-70.971016,124024.12,1075.06,49737
2,7179458e-d6e3-c723-2530-d4acfe1c2668,2008-12-21,NaT,999-70-1925,Unknown,Unknown,Unknown,Hunter736,Mckinley734,Gerlach374,...,Mattapoisett,Massachusetts,Plymouth County,Unknown,0,41.648292,-70.850619,45645.06,6154.94,133816
3,37c177ea-4398-fb7a-29fa-70eb3d673876,1994-01-27,NaT,999-27-9779,S99995100,X83694889X,Mrs.,Carlyn477,Florencia449,Williamson769,...,Wareham,Massachusetts,Plymouth County,Unknown,0,41.789096,-70.711616,12895.15,659951.61,17382
4,0fef2411-21f0-a269-82fb-c42b55471405,2019-07-27,NaT,999-50-8977,Unknown,Unknown,Unknown,Robin66,Jeramy610,Gleichner915,...,Groveland,Massachusetts,Essex County,Unknown,0,42.734183,-70.97641,18500.02,5493.57,52159


In [5]:
# Check for duplicate rows (exact duplicates across all columns)
print(f"Total length of dataframe BEFORE removing duplicates: {len(df_patients)}")
df_patients = df_patients.drop_duplicates()
print(f"Total length of dataframe AFTER removing duplicates: {len(df_patients)}")

# Validate categorical columns by listing the distinct values present
categorical_columns = ['marital', 'race', 'ethnicity', 'gender']
for col in categorical_columns:
    print(f'Unique values in {col}: {df_patients[col].unique()}')

# No need to lowercase them

# Logical check: a recorded death date must not precede the birth date.
# Values are coerced to datetimes so unparseable entries become NaT; NaT
# compares as False, so patients without a death date are never flagged.
birth_dates = pd.to_datetime(df_patients['birthdate'], errors='coerce')
death_dates = pd.to_datetime(df_patients['deathdate'], errors='coerce')
n_invalid_dates = int((death_dates < birth_dates).sum())
print(f"Rows with deathdate before birthdate: {n_invalid_dates}")

Total length of dataframe BEFORE removing duplicates: 106
Total length of dataframe AFTER removing duplicates: 106
Unique values in marital: ['M' 'D' 'Unknown' 'W' 'S']
Unique values in race: ['white' 'asian' 'other' 'black' 'native']
Unique values in ethnicity: ['nonhispanic' 'hispanic']
Unique values in gender: ['M' 'F']


# Feature Engineering and Imputation

1. Derive `income_category` by binning `income` into three categories: low (below 45,000), medium (45,000 up to but not including 90,000) and high (90,000 and above)

In [6]:
# Income bands are left-closed because right=False is passed below:
# [0, 45000) -> low, [45000, 90000) -> medium, [90000, inf) -> high
labels = ["low-income", "medium-income", "high-income"]
bins = [0, 45000, 90000, np.inf]

# Attach the band label for each patient's income as a new column
df_patients["income_category"] = pd.cut(
    x=df_patients["income"],
    bins=bins,
    labels=labels,
    right=False,
)

In [7]:
# Preview the first five rows, now including income_category (head() defaults to 5)
df_patients.head()

Unnamed: 0,id,birthdate,deathdate,ssn,drivers,passport,prefix,firstname,middlename,lastname,...,state,county,fips,zip,lat,lon,healthcare_expenses,healthcare_coverage,income,income_category
0,30a6452c-4297-a1ac-977a-6a23237c7b46,1994-02-06,NaT,999-52-8591,S99996852,X47758697X,Mr.,Joshua658,Alvin56,Kunde533,...,Massachusetts,Norfolk County,25021.0,2184,42.211142,-71.045802,56904.96,18019.99,100511,high-income
1,34a4dcc4-35fb-6ad5-ab98-be285c586a4f,1968-08-06,2009-12-11,999-75-3953,S99993577,X28173268X,Mr.,Bennie663,Unknown,Ebert178,...,Massachusetts,Norfolk County,25021.0,2184,42.25542,-70.971016,124024.12,1075.06,49737,medium-income
2,7179458e-d6e3-c723-2530-d4acfe1c2668,2008-12-21,NaT,999-70-1925,Unknown,Unknown,Unknown,Hunter736,Mckinley734,Gerlach374,...,Massachusetts,Plymouth County,Unknown,0,41.648292,-70.850619,45645.06,6154.94,133816,high-income
3,37c177ea-4398-fb7a-29fa-70eb3d673876,1994-01-27,NaT,999-27-9779,S99995100,X83694889X,Mrs.,Carlyn477,Florencia449,Williamson769,...,Massachusetts,Plymouth County,Unknown,0,41.789096,-70.711616,12895.15,659951.61,17382,low-income
4,0fef2411-21f0-a269-82fb-c42b55471405,2019-07-27,NaT,999-50-8977,Unknown,Unknown,Unknown,Robin66,Jeramy610,Gleichner915,...,Massachusetts,Essex County,Unknown,0,42.734183,-70.97641,18500.02,5493.57,52159,medium-income


In [8]:
# Save the cleaned and transformed dataset.
# to_csv does not create missing parent directories, so make sure the target
# directory exists first; a bare filename has no directory part to create.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

df_patients.to_csv(output_path, index=False)  # index=False: omit the row index from the file
print(f'Processed data saved to {output_path}')

Processed data saved to ../../data/processed_data/processed_patients.csv
