In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from scipy import stats

Reading data from the SPARCS dataset into a dataframe

In [2]:
# low_memory=False makes pandas parse the file in one pass instead of in chunks,
# so every column gets a single, consistently inferred dtype.
data = pd.read_csv("../../datasets/electronic-health-record/input/electronic_health_record_dataset.csv", low_memory=False)

In [3]:
# Work on a copy so the freshly loaded `data` frame is not modified by the cleaning steps.
health_data = data.copy()

In [4]:
health_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 33 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Hospital Service Area                24920 non-null  object 
 1   Hospital County                      24920 non-null  object 
 2   Operating Certificate Number         24915 non-null  float64
 3   Permanent Facility Id                24920 non-null  float64
 4   Facility Name                        25000 non-null  object 
 5   Age Group                            25000 non-null  object 
 6   Zip Code - 3 digits                  24460 non-null  object 
 7   Gender                               25000 non-null  object 
 8   Race                                 25000 non-null  object 
 9   Ethnicity                            25000 non-null  object 
 10  Length of Stay                       25000 non-null  object 
 11  Type of Admission           

Attributes and the unique values present in each of them

In [5]:
def print_unique_values(dataframe):
    """Print each column name followed by the array of its distinct values.

    One blank line is emitted after every column so the listing stays readable.
    """
    for column_name in dataframe.columns:
        distinct_values = dataframe[column_name].unique()
        print(f"{column_name}:\n{distinct_values}", end="\n\n")

In [6]:
print_unique_values(health_data)

Hospital Service Area:
['New York City' 'Central NY' 'Finger Lakes' 'Western NY' 'Long Island'
 'Hudson Valley' 'Capital/Adirond' 'Southern Tier' nan]

Hospital County:
['Manhattan' 'Tompkins' 'Kings' 'Chemung' 'Erie' 'Onondaga' 'Wayne'
 'Nassau' 'Westchester' 'Suffolk' 'Richmond' 'Bronx' 'Monroe' 'Albany'
 'Broome' 'Queens' 'Genesee' 'Orange' 'Rockland' 'Sullivan' 'Chautauqua'
 'Otsego' nan 'Oneida' 'Clinton' 'Warren' 'Dutchess' 'Schenectady'
 'Madison' 'Niagara' 'Cayuga' 'Jefferson' 'Steuben' 'St Lawrence' 'Fulton'
 'Saratoga' 'Livingston' 'Ontario' 'Putnam' 'Cattaraugus' 'Franklin'
 'Cortland' 'Oswego' 'Allegany' 'Chenango' 'Wyoming' 'Montgomery' 'Ulster'
 'Columbia' 'Orleans' 'Lewis' 'Schoharie' 'Yates' 'Schuyler' 'Herkimer'
 'Essex' 'Delaware']

Operating Certificate Number:
[7002054. 5401001. 7001021.  701000. 7002053. 1401013. 3301008. 7001020.
 1401014. 5820000. 2908000. 5907002. 5154001. 5154000. 7004003. 7000006.
 2951001. 5151001. 2701005. 5153000. 3301007. 7002024. 7002032.

Checking for exact duplicate records and removing them

In [7]:
# Count rows that are exact copies of an earlier row, report the count,
# then keep only the first occurrence of each record.
duplicate_count = health_data.duplicated().sum()
print(f"Duplicate records: {duplicate_count}")
health_data = health_data.drop_duplicates()

Duplicate records: 1


Treating missing values

Dropping columns having more than 50% missing values 

In [8]:
# Share of missing entries per column, expressed as a percentage.
missing_values_percentage = health_data.isna().mean().mul(100)
print(missing_values_percentage.sort_values(ascending=False))

# Columns that are missing in more than half of the records are removed entirely.
mostly_missing = missing_values_percentage > 50
columns_to_drop = missing_values_percentage[mostly_missing].index

health_data = health_data.drop(columns=columns_to_drop)

Payment Typology 3                     87.019481
Birth Weight                           86.115445
Payment Typology 2                     57.710308
CCSR Procedure Code                    26.281051
CCSR Procedure Description             26.281051
Zip Code - 3 digits                     2.160086
Operating Certificate Number            0.340014
Hospital County                         0.320013
Hospital Service Area                   0.320013
Permanent Facility Id                   0.320013
Total Costs                             0.188008
APR Risk of Mortality                   0.036001
APR Severity of Illness Description     0.036001
Length of Stay                          0.000000
Total Charges                           0.000000
Emergency Department Indicator          0.000000
Facility Name                           0.000000
Age Group                               0.000000
Payment Typology 1                      0.000000
APR Medical Surgical Description        0.000000
APR Severity of Illn

Dropping ['Operating Certificate Number','Discharge Year'] since they don't have any significant importance in the prediction and analysis

In [9]:
# Columns judged irrelevant for the prediction and analysis.
unneeded_columns = ['Operating Certificate Number', 'Discharge Year']
health_data = health_data.drop(columns=unneeded_columns)

Imputing missing values in:
- CCSR Procedure Code, using the most frequent procedure code among records with the same CCSR Diagnosis Code
- CCSR Procedure Description, using the most frequent description among records with the same CCSR Procedure Code

In [10]:
def map_and_fill_missing_values(dataframe, from_col, to_col):
    """Fill gaps in ``to_col`` with the most frequent ``to_col`` value of the row's ``from_col`` group.

    Rows are grouped by ``from_col``; for each group the most common non-missing
    ``to_col`` value is looked up and used wherever ``to_col`` is missing.
    Groups whose ``to_col`` values are all missing contribute nothing, so
    their rows stay missing.

    Returns a new DataFrame; ``dataframe`` itself is left untouched.
    """
    def most_frequent(values):
        # A group with no recorded value has no mode to offer.
        if values.count() > 0:
            return values.value_counts().idxmax()
        return None

    temp_df = dataframe.copy()

    # Lookup table: from_col value -> most frequent to_col value in that group.
    group_to_mode = temp_df.groupby(from_col)[to_col].agg(most_frequent).to_dict()

    replacements = temp_df[from_col].map(group_to_mode)
    temp_df[to_col] = temp_df[to_col].fillna(replacements)

    return temp_df

In [11]:
health_data = map_and_fill_missing_values(health_data, "CCSR Diagnosis Code", "CCSR Procedure Code")

In [12]:
health_data = map_and_fill_missing_values(health_data, "CCSR Procedure Code", "CCSR Procedure Description")

Checking for missing values

In [13]:
health_data.isna().sum()

Hospital Service Area                   80
Hospital County                         80
Permanent Facility Id                   80
Facility Name                            0
Age Group                                0
Zip Code - 3 digits                    540
Gender                                   0
Race                                     0
Ethnicity                                0
Length of Stay                           0
Type of Admission                        0
Patient Disposition                      0
CCSR Diagnosis Code                      0
CCSR Diagnosis Description               0
CCSR Procedure Code                     16
CCSR Procedure Description              16
APR DRG Code                             0
APR DRG Description                      0
APR MDC Code                             0
APR MDC Description                      0
APR Severity of Illness Code             0
APR Severity of Illness Description      9
APR Risk of Mortality                    9
APR Medical

Dropping records having missing values since the count is negligible

In [14]:
# Keep only fully populated records, then confirm that no gaps remain.
health_data = health_data.dropna(axis="index")

health_data.isna().sum()

Hospital Service Area                  0
Hospital County                        0
Permanent Facility Id                  0
Facility Name                          0
Age Group                              0
Zip Code - 3 digits                    0
Gender                                 0
Race                                   0
Ethnicity                              0
Length of Stay                         0
Type of Admission                      0
Patient Disposition                    0
CCSR Diagnosis Code                    0
CCSR Diagnosis Description             0
CCSR Procedure Code                    0
CCSR Procedure Description             0
APR DRG Code                           0
APR DRG Description                    0
APR MDC Code                           0
APR MDC Description                    0
APR Severity of Illness Code           0
APR Severity of Illness Description    0
APR Risk of Mortality                  0
APR Medical Surgical Description       0
Payment Typology

Imputing missing/unknown values of ["Gender", "Type of Admission"] columns with corresponding mode value

In [15]:
def impute_missing_values_with_mode(dataframe, column, missing_value=None):
    """Replace a missing-value placeholder in ``column`` with the column's mode.

    The mode is computed over the genuine values only (the placeholder itself
    is excluded), so the imputation still works when the placeholder happens
    to be the most frequent entry.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the column to clean.
    missing_value : scalar or list-like, optional
        Placeholder value(s) that mark a missing entry (e.g. ``"U"``).
        With the default ``None``, NaN/None entries are treated as missing.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataframe`` with the placeholder entries of ``column``
        replaced. If the column has no genuine value to take a mode from,
        the copy is returned unchanged.
    """
    temp_df = dataframe.copy()
    values = temp_df[column]

    if missing_value is None:
        is_missing = values.isna()
    else:
        if pd.api.types.is_list_like(missing_value):
            placeholders = list(missing_value)
        else:
            placeholders = [missing_value]
        is_missing = values.isin(placeholders)

    # Series.mode() ignores NaN by default; placeholders are filtered out explicitly.
    modes = values[~is_missing].mode()
    if modes.empty:
        return temp_df

    temp_df[column] = values.mask(is_missing, modes.iloc[0])
    return temp_df

In [16]:
health_data = impute_missing_values_with_mode(health_data, "Gender", missing_value="U")

In [17]:
health_data = impute_missing_values_with_mode(health_data, "Type of Admission", missing_value="Not Available")

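Treating unknown ethnicity using race: "Unknown" is replaced by the most common ethnicity of the race that occurs most often among the unknown-ethnicity records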

In [18]:
# Replace "Unknown" ethnicity with the most common *known* ethnicity of the race
# that occurs most often among the unknown-ethnicity records.
# The guards keep the cell safe to re-run: once no "Unknown" is left (or no
# known ethnicity exists for that race) there is nothing to take a mode from.
ethnicity_is_unknown = health_data["Ethnicity"] == "Unknown"

if ethnicity_is_unknown.any():
    mode_race_for_unknown_ethinicity = health_data.loc[ethnicity_is_unknown, "Race"].mode()[0]

    # Exclude "Unknown" itself, otherwise the replacement could be "Unknown" again.
    known_ethnicity_for_race = health_data.loc[
        ~ethnicity_is_unknown & (health_data["Race"] == mode_race_for_unknown_ethinicity),
        "Ethnicity",
    ]

    if not known_ethnicity_for_race.empty:
        mode_ethinicity_for_selection = known_ethnicity_for_race.mode()[0]
        health_data["Ethnicity"] = health_data["Ethnicity"].replace("Unknown", mode_ethinicity_for_selection)

Converting to different data types:
- Limiting "120 +" stay duration to 120 and converting column to integer type
- Replacing "OOS" zip code with largest 3 digit integer and converting values to int type
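- Converting Permanent Facility Id to type int
- Removing the thousands separators from Total Charges and Total Costs and converting both columns to float type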

In [19]:
# Non-regex replace matches whole cell values, so only entries equal to "120 +"
# are rewritten to "120" before the column is cast to int.
health_data["Length of Stay"] = health_data["Length of Stay"].replace("120 +", "120").astype(int)

In [20]:
# "OOS" is swapped for the stand-in value "999" so the whole column can be cast to int.
health_data["Zip Code - 3 digits"] = health_data["Zip Code - 3 digits"].replace("OOS", "999").astype(int)

In [21]:
# The ids were loaded as float64; rows with missing values have been dropped by now,
# so the column can be cast to int.
health_data["Permanent Facility Id"] = health_data["Permanent Facility Id"].astype(int)

In [22]:
# regex=True turns this into a substring replacement, so every "," inside a value
# is stripped before the cast to float.
health_data["Total Charges"] = health_data["Total Charges"].replace(",", "", regex=True).astype(float)

In [23]:
# regex=True turns this into a substring replacement, so every "," inside a value
# is stripped before the cast to float.
health_data["Total Costs"] = health_data["Total Costs"].replace(",", "", regex=True).astype(float)

In [24]:
health_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24391 entries, 0 to 24999
Data columns (total 28 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Hospital Service Area                24391 non-null  object 
 1   Hospital County                      24391 non-null  object 
 2   Permanent Facility Id                24391 non-null  int32  
 3   Facility Name                        24391 non-null  object 
 4   Age Group                            24391 non-null  object 
 5   Zip Code - 3 digits                  24391 non-null  int32  
 6   Gender                               24391 non-null  object 
 7   Race                                 24391 non-null  object 
 8   Ethnicity                            24391 non-null  object 
 9   Length of Stay                       24391 non-null  int32  
 10  Type of Admission                    24391 non-null  object 
 11  Patient Disposition              

Transforming ["Total Charges", "Total Costs"] towards a normal distribution using the Box-Cox power transformation

In [25]:
# stats.boxcox returns the transformed values together with the lambda it fitted.
# Box-Cox requires strictly positive input.
# The lambdas are kept (instead of being discarded) so the transformed values
# can later be mapped back to the original monetary scale.
boxcox_lambdas = {}
for cost_column in ["Total Charges", "Total Costs"]:
    transformed_values, fitted_lambda = stats.boxcox(health_data[cost_column])
    health_data[cost_column] = transformed_values
    boxcox_lambdas[cost_column] = fitted_lambda

boxcox_lambdas

Dealing with Outliers

In [26]:
def detect_and_impute_outliers(dataframe, column):
    """Report the IQR outliers of ``column`` and cap them at the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``. The number of
    values outside the fences is printed; values below the lower fence are set
    to the lower fence and values above the upper fence to the upper fence.

    Returns a new DataFrame; ``dataframe`` itself is left untouched.
    """
    first_quartile = dataframe[column].quantile(0.25)
    third_quartile = dataframe[column].quantile(0.75)
    spread = third_quartile - first_quartile

    lower_bound = first_quartile - 1.5 * spread
    upper_bound = third_quartile + 1.5 * spread

    temp_df = dataframe.copy()
    too_low = temp_df[column] < lower_bound
    too_high = temp_df[column] > upper_bound

    outlier_count = int((too_low | too_high).sum())
    print(f"Number of outliers in {column}: {outlier_count}")

    # Cap the low side first, then the high side.
    capped_low = temp_df[column].mask(too_low, lower_bound)
    temp_df[column] = capped_low.mask(too_high, upper_bound)
    return temp_df

In [27]:
health_data = detect_and_impute_outliers(health_data, "Total Charges")

Number of outliers in Total Charges: 396


In [28]:
health_data = detect_and_impute_outliers(health_data, "Total Costs")

Number of outliers in Total Costs: 443


In [29]:
health_data.head()

Unnamed: 0,Hospital Service Area,Hospital County,Permanent Facility Id,Facility Name,Age Group,Zip Code - 3 digits,Gender,Race,Ethnicity,Length of Stay,...,APR MDC Code,APR MDC Description,APR Severity of Illness Code,APR Severity of Illness Description,APR Risk of Mortality,APR Medical Surgical Description,Payment Typology 1,Emergency Department Indicator,Total Charges,Total Costs
0,New York City,Manhattan,1458,New York-Presbyterian Hospital - New York Weil...,0 to 17,104,F,Other Race,Spanish/Hispanic,10,...,15,NEWBORNS AND OTHER NEONATES WITH CONDITIONS OR...,3,Major,Minor,Medical,Private Health Insurance,N,9.872085,9.072055
1,New York City,Manhattan,1464,New York-Presbyterian Hospital - Columbia Pres...,0 to 17,109,M,White,Not Span/Hispanic,4,...,4,DISEASES AND DISORDERS OF THE RESPIRATORY SYSTEM,3,Major,Moderate,Medical,Medicaid,Y,9.170213,8.330147
2,Central NY,Tompkins,977,Cayuga Medical Center at Ithaca,0 to 17,148,M,Other Race,Not Span/Hispanic,3,...,15,NEWBORNS AND OTHER NEONATES WITH CONDITIONS OR...,2,Moderate,Minor,Medical,Self-Pay,N,7.034587,6.753241
3,New York City,Kings,1306,New York - Presbyterian Brooklyn Methodist Hos...,0 to 17,112,F,Black/African American,Not Span/Hispanic,1,...,9,"DISEASES AND DISORDERS OF THE SKIN, SUBCUTANEO...",1,Minor,Minor,Medical,Medicaid,Y,7.858378,6.952654
4,Finger Lakes,Chemung,116,Arnot Ogden Medical Center,0 to 17,148,F,White,Not Span/Hispanic,1,...,15,NEWBORNS AND OTHER NEONATES WITH CONDITIONS OR...,1,Minor,Minor,Medical,Medicaid,N,6.783993,6.070277


Saving data for analytics

In [30]:
# index=False: the row index (no longer contiguous after rows were dropped) is not written to the file.
health_data.to_csv('../../datasets/electronic-health-record/output/transformed_electronic_health_record_dataset_for_analytics.csv', index=False)

Dropping redundant columns

In [31]:
# Number of distinct values in every column.
for col, distinct_count in health_data.nunique().items():
    print(f"{col}: {distinct_count}")

Hospital Service Area: 8
Hospital County: 56
Permanent Facility Id: 191
Facility Name: 190
Age Group: 5
Zip Code - 3 digits: 50
Gender: 2
Race: 4
Ethnicity: 3
Length of Stay: 106
Type of Admission: 5
Patient Disposition: 19
CCSR Diagnosis Code: 376
CCSR Diagnosis Description: 376
CCSR Procedure Code: 290
CCSR Procedure Description: 290
APR DRG Code: 320
APR DRG Description: 320
APR MDC Code: 24
APR MDC Description: 24
APR Severity of Illness Code: 4
APR Severity of Illness Description: 4
APR Risk of Mortality: 4
APR Medical Surgical Description: 2
Payment Typology 1: 9
Emergency Department Indicator: 2
Total Charges: 23734
Total Costs: 23656


In [32]:
# Name/description columns whose companion id/code column stays in the frame.
redundant_columns = [
    "Facility Name",
    "CCSR Diagnosis Description",
    "CCSR Procedure Description",
    "APR DRG Description",
    "APR MDC Description",
    "APR Severity of Illness Description",
]
health_data = health_data.drop(columns=redundant_columns)

In [33]:
# Number of distinct values in the columns still stored with object dtype.
for col, col_dtype in health_data.dtypes.items():
    if col_dtype != "object":
        continue
    print(f"{col}: {health_data[col].nunique()}")

Hospital Service Area: 8
Hospital County: 56
Age Group: 5
Gender: 2
Race: 4
Ethnicity: 3
Type of Admission: 5
Patient Disposition: 19
CCSR Diagnosis Code: 376
CCSR Procedure Code: 290
APR Risk of Mortality: 4
APR Medical Surgical Description: 2
Payment Typology 1: 9
Emergency Department Indicator: 2


Encoding categorical data:
- Target Encoding: ["Hospital Service Area", "Hospital County", "Patient Disposition", "CCSR Diagnosis Code", "CCSR Procedure Code"]
- Label Encoding: ['Age Group']
- Ordinal mapping: ['APR Risk of Mortality']
- Frequency encoding: ['Payment Typology 1']
- One Hot Encoding: ["Gender", "Race", "Ethnicity", "Type of Admission", "APR Medical Surgical Description", "Emergency Department Indicator"]

In [34]:
target_encoding_columns = [
    "Hospital Service Area",
    "Hospital County",
    "Patient Disposition",
    "CCSR Diagnosis Code",
    "CCSR Procedure Code",
]
# Each category is replaced by the mean Length of Stay of the rows sharing it.
# NOTE(review): the means are computed over the whole frame; if Length of Stay is
# the prediction target and a train/test split happens later, this leaks target
# information into the features. Confirm against the modelling notebook.
for col in target_encoding_columns:
    health_data[col] = health_data.groupby(col)["Length of Stay"].transform("mean")

In [35]:
# LabelEncoder assigns integer codes following the sorted order of the distinct labels.
# NOTE(review): the codes only reflect age order if the labels sort that way as strings.
# Confirm against the unique values of 'Age Group'.
health_data['Age Group'] = LabelEncoder().fit_transform(health_data['Age Group'])

In [36]:
# Ordinal codes for the four risk levels; any other value becomes NaN.
risk_of_mortality_codes = {'Minor': 0, 'Moderate': 1, 'Major': 2, 'Extreme': 3}
health_data['APR Risk of Mortality'] = health_data['APR Risk of Mortality'].map(risk_of_mortality_codes)

In [37]:
# Frequency encoding: each payment category is replaced by its share of all records.
payment_typology = health_data['Payment Typology 1']
temp_dict = (payment_typology.value_counts() / len(health_data)).to_dict()
health_data['Payment Typology 1'] = payment_typology.map(temp_dict)

In [38]:
# Number of distinct values in the columns still stored with object dtype.
for col, col_dtype in health_data.dtypes.items():
    if col_dtype != "object":
        continue
    print(f"{col}: {health_data[col].nunique()}")

Gender: 2
Race: 4
Ethnicity: 3
Type of Admission: 5
APR Medical Surgical Description: 2
Emergency Department Indicator: 2


In [39]:
def one_hot_encoding(dataframe, columns):
    """Return a new frame in which each of ``columns`` is expanded into indicator columns.

    Thin wrapper around ``pandas.get_dummies``; columns not listed in
    ``columns`` are passed through unchanged.
    """
    return pd.get_dummies(data=dataframe, columns=columns)

In [40]:
# Categorical columns to expand into one indicator column per category.
one_hot_encoding_columns = ["Gender", "Race", "Ethnicity", "Type of Admission", "APR Medical Surgical Description", "Emergency Department Indicator"]

health_data = one_hot_encoding(health_data, one_hot_encoding_columns)

In [41]:
health_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24391 entries, 0 to 24999
Data columns (total 34 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Hospital Service Area                      24391 non-null  float64
 1   Hospital County                            24391 non-null  float64
 2   Permanent Facility Id                      24391 non-null  int32  
 3   Age Group                                  24391 non-null  int32  
 4   Zip Code - 3 digits                        24391 non-null  int32  
 5   Length of Stay                             24391 non-null  int32  
 6   Patient Disposition                        24391 non-null  float64
 7   CCSR Diagnosis Code                        24391 non-null  float64
 8   CCSR Procedure Code                        24391 non-null  float64
 9   APR DRG Code                               24391 non-null  int64  
 10  APR MDC Code               

In [42]:
# index=False: the row index (no longer contiguous after rows were dropped) is not written to the file.
health_data.to_csv('../../datasets/electronic-health-record/output/transformed_electronic_health_record_dataset.csv', index=False)