In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, TargetEncoder
from sklearn.feature_selection import mutual_info_regression
from importlib.metadata import version

# report the library versions this notebook was executed with
print(f"Pandas version: {pd.__version__}")
print(f"Numpy version: {np.__version__}")
print(f"Scikit-learn version: {version('scikit-learn')}")

Pandas version: 2.3.3
Numpy version: 2.3.4
Scikit-learn version: 1.7.2


In [2]:
# Load the post-EDA data and its category mappings.
# NOTE: pickle.load can execute arbitrary code while deserialising, so this
# file should only ever be one written by this project's own EDA step.
with open('../data/temp/eda_output.pkl', 'rb') as eda_file:
    eda_data = pickle.load(eda_file)

df, category_mappings = eda_data['data'], eda_data['category_mappings']

In [3]:
# column names, non-null counts and dtypes of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7213 entries, 0 to 7212
Data columns (total 22 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   health_service_area                  7213 non-null   object 
 1   hospital_county                      7213 non-null   object 
 2   permanent_facility_id                7213 non-null   object 
 3   age_group                            7213 non-null   object 
 4   zip_code                             7213 non-null   object 
 5   gender                               7213 non-null   object 
 6   race                                 7213 non-null   object 
 7   ethnicity                            7213 non-null   object 
 8   length_of_stay                       7213 non-null   int64  
 9   type_of_admission                    7213 non-null   object 
 10  patient_disposition                  7213 non-null   object 
 11  ccsr_procedure_description    

In [4]:
# summary statistics for every column (include='all'), shown one row per column
df.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
health_service_area,7213.0,8.0,New York City,3463.0,,,,,,,
hospital_county,7213.0,33.0,New York,977.0,,,,,,,
permanent_facility_id,7213.0,116.0,Other,485.0,,,,,,,
age_group,7213.0,5.0,70 or Older,3470.0,,,,,,,
zip_code,7213.0,49.0,112,970.0,,,,,,,
gender,7213.0,2.0,F/U,4017.0,,,,,,,
race,7213.0,4.0,White,3613.0,,,,,,,
ethnicity,7213.0,3.0,Not Span/Hispanic,5147.0,,,,,,,
length_of_stay,7213.0,,,,4.653681,6.185733,1.0,2.0,3.0,5.0,120.0
type_of_admission,7213.0,3.0,Emergency,6945.0,,,,,,,


Due to the right-skewed distribution of length of stay, a log transformation was applied to the target variable for modeling. Predictions will be transformed back to the original scale for evaluation.

In [5]:
# 80/10/10 split: hold out 10% for test, then carve validation out of the rest
y = df['log_length_of_stay']
X = df.drop(['length_of_stay', 'log_length_of_stay'], axis=1)

X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

# 0.1 / 0.9 = 0.1111111, i.e. 10% of the full data taken from the remaining 90%
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.1111111,
    random_state=42,
)

# sizes of the three splits
X_train.shape[0], X_val.shape[0], X_test.shape[0]

(5769, 722, 722)

In [6]:
# every column of X_train is categorical; list them explicitly, one per line
features = [
    'health_service_area',
    'hospital_county',
    'permanent_facility_id',
    'age_group',
    'zip_code',
    'gender',
    'race',
    'ethnicity',
    'type_of_admission',
    'patient_disposition',
    'ccsr_procedure_description',
    'apr_drg_description',
    'apr_mdc_description',
    'apr_severity_of_illness_description',
    'apr_risk_of_mortality',
    'apr_medical_surgical_description',
    'payment_typology_1',
    'payment_typology_2',
    'payment_typology_3',
    'emergency_department_indicator',
]

In [7]:
# simple differences and ratios analysis (effect sizes analysis)
def categorical_features_summary(X, y, features, decimals=4):
    """Print, for each categorical feature, how the target mean varies by level.

    One table is printed per column in ``features``, with one row per category
    sorted by target mean (highest first) and these columns: ``count``,
    ``target_mean``, ``percent`` (share of the counted rows, in %),
    ``diff_from_global_mean`` and ``ratio_to_global_mean``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing every column named in ``features``.
        It is not modified.
    y : pandas.Series
        Target values; paired with the rows of ``X`` by position, not by index.
    features : iterable of str
        Columns of ``X`` to summarise.
    decimals : int, default 4
        Number of decimals shown for the mean, difference and ratio.

    Returns
    -------
    None
        The tables are written to stdout.
    """
    # Keep full precision for the arithmetic and round only for display:
    # rounding the means first compounds the rounding error in the
    # difference and ratio columns.
    global_mean = float(y.mean())
    frame = X.assign(_target_=y.to_numpy())

    for col in features:
        summary = (
            frame.groupby(col)['_target_']
            .agg(['count', 'mean'])
            .rename(columns={'mean': 'target_mean'})
            .sort_values('target_mean', ascending=False)
        )

        # share of the grouped rows that fall in each category
        summary['percent'] = (summary['count'] / summary['count'].sum() * 100).round(2)

        summary['diff_from_global_mean'] = (summary['target_mean'] - global_mean).round(decimals)
        summary['ratio_to_global_mean'] = (summary['target_mean'] / global_mean).round(decimals)
        # round the displayed mean last so the two columns above use the exact value
        summary['target_mean'] = summary['target_mean'].round(decimals)

        print(f"--- {col} ---")
        print(summary.reset_index().to_string(index=False), "\n")

In [8]:
# effect-size tables for every listed feature, computed on the training split
categorical_features_summary(X_train, y_train, features, decimals=4)

--- health_service_area ---
health_service_area  count  target_mean  percent  diff_from_global_mean  ratio_to_global_mean
      Hudson Valley    623       1.2980    10.80                 0.1238                1.1054
Capital/Adirondacks    275       1.2376     4.77                 0.0634                1.0540
       Finger Lakes    393       1.2258     6.81                 0.0516                1.0439
         Central NY    260       1.1960     4.51                 0.0218                1.0186
         Western NY    388       1.1672     6.73                -0.0070                0.9940
        Long Island    989       1.1499    17.14                -0.0243                0.9793
Southern Tier/Other     67       1.1477     1.16                -0.0265                0.9774
      New York City   2774       1.1411    48.08                -0.0331                0.9718 

--- hospital_county ---
hospital_county  count  target_mean  percent  diff_from_global_mean  ratio_to_global_mean
         O

In [9]:
# MI scores
# Label-encode a copy of the training features so every column is an integer
# code that the estimator can treat as discrete (discrete_features=True).
X_train_encoded = X_train.copy()
for col in features:
    le = LabelEncoder()
    X_train_encoded[col] = le.fit_transform(X_train[col])

# random_state pins the small random noise the estimator adds to a continuous
# target, so the scores are reproducible across kernel restarts.
mi_scores = mutual_info_regression(
    X_train_encoded,
    y_train,
    discrete_features=True,
    random_state=42,
)
mi = pd.Series(mi_scores, index=X_train_encoded.columns).sort_values(ascending=False)
print(mi)

patient_disposition                    0.117137
apr_severity_of_illness_description    0.107915
apr_risk_of_mortality                  0.102504
ccsr_procedure_description             0.052794
permanent_facility_id                  0.041857
age_group                              0.040202
apr_drg_description                    0.030254
payment_typology_1                     0.027595
payment_typology_2                     0.024780
apr_medical_surgical_description       0.017885
race                                   0.016149
hospital_county                        0.014483
apr_mdc_description                    0.010603
gender                                 0.008564
ethnicity                              0.005608
payment_typology_3                     0.005247
health_service_area                    0.005121
emergency_department_indicator         0.004925
zip_code                               0.004727
type_of_admission                      0.001388
dtype: float64


MI scores for `type_of_admission`, `ethnicity`, and `gender` are close to zero (all below 0.01). For `type_of_admission` and `ethnicity`, this reflects large class imbalance (about 96% and 72% of records in the dominant category, respectively) combined with small effect sizes. For `gender`, univariate analysis shows minimal differences in mean log(LoS) across categories.

In [10]:
# number of distinct levels per feature in the training split
level_counts = X_train.loc[:, features].nunique()
print(level_counts)

health_service_area                      8
hospital_county                         33
permanent_facility_id                  116
age_group                                5
zip_code                                49
gender                                   2
race                                     4
ethnicity                                3
type_of_admission                        3
patient_disposition                     11
ccsr_procedure_description              20
apr_drg_description                      4
apr_mdc_description                      3
apr_severity_of_illness_description      4
apr_risk_of_mortality                    4
apr_medical_surgical_description         2
payment_typology_1                       7
payment_typology_2                       9
payment_typology_3                       6
emergency_department_indicator           2
dtype: int64


In [11]:
# number of distinct levels per feature in the validation split
level_counts = X_val.loc[:, features].nunique()
print(level_counts)

health_service_area                      8
hospital_county                         33
permanent_facility_id                  115
age_group                                5
zip_code                                49
gender                                   2
race                                     4
ethnicity                                3
type_of_admission                        3
patient_disposition                     10
ccsr_procedure_description              18
apr_drg_description                      4
apr_mdc_description                      3
apr_severity_of_illness_description      4
apr_risk_of_mortality                    4
apr_medical_surgical_description         2
payment_typology_1                       7
payment_typology_2                       9
payment_typology_3                       6
emergency_department_indicator           2
dtype: int64


In [12]:
# number of distinct levels per feature in the test split
level_counts = X_test.loc[:, features].nunique()
print(level_counts)

health_service_area                      8
hospital_county                         33
permanent_facility_id                  116
age_group                                5
zip_code                                49
gender                                   2
race                                     4
ethnicity                                3
type_of_admission                        3
patient_disposition                     11
ccsr_procedure_description              19
apr_drg_description                      4
apr_mdc_description                      3
apr_severity_of_illness_description      4
apr_risk_of_mortality                    4
apr_medical_surgical_description         2
payment_typology_1                       7
payment_typology_2                       9
payment_typology_3                       6
emergency_department_indicator           2
dtype: int64


In [13]:
# target encoding

# target_type='continuous' because this is a regression target;
# smooth='auto' regularises each category mean by shrinking it toward the
# global target mean.
encoder = TargetEncoder(target_type='continuous', smooth='auto', random_state=42)

# Fit on the training split only. fit_transform cross-fits internally, so the
# training encodings differ from what encoder.transform(X_train) would return.
X_train_encoded = pd.DataFrame(
    encoder.fit_transform(X_train, y_train),
    columns=X_train.columns,
    index=X_train.index,
)

# Validation and test reuse the fitted encodings; categories that were not
# seen during fit are encoded with the training target mean.
X_val_encoded = pd.DataFrame(
    encoder.transform(X_val),
    columns=X_val.columns,
    index=X_val.index,
)
X_test_encoded = pd.DataFrame(
    encoder.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [14]:
# bundle the encoded splits, the original splits, metadata and the fitted encoder
target_encoded_data = dict(
    # data
    X_train=X_train_encoded,
    X_val=X_val_encoded,
    X_test=X_test_encoded,
    y_train=y_train,
    y_val=y_val,
    y_test=y_test,
    # original, non-encoded features
    X_train_orig=X_train,
    X_val_orig=X_val,
    X_test_orig=X_test,
    # metadata
    feature_names=X_train.columns.tolist(),
    category_mappings=category_mappings,
    # encoding-specific information
    encoder=encoder,
    encoding_method='standard_target_encoding',
)

with open('../data/temp/target_encoded_data.pkl', 'wb') as out_file:
    pickle.dump(target_encoded_data, out_file)

In [15]:
# custom target encoding (mean * median)

# fallback for categories without a training statistic: global mean * global median
global_mean = y_train.mean()
global_median = y_train.median()
global_encoding = global_mean * global_median

# per-column mapping {category: mean(y_train) * median(y_train)} learned on the
# training split only
stats = {}
for col in features:
    grouped = y_train.groupby(X_train[col]).agg(['mean', 'median'])
    grouped['mean_median'] = grouped['mean'] * grouped['median']
    stats[col] = grouped['mean_median'].to_dict()


def _apply_custom_encoding(X):
    """Encode the `features` columns of X with the mappings stored in `stats`.

    Categories with no entry in the mapping (unseen in training, or missing)
    receive `global_encoding`. Returns a new DataFrame that shares X's index.
    """
    encoded = pd.DataFrame(index=X.index)
    for col in features:
        encoded[col] = X[col].map(stats[col]).fillna(global_encoding)
    return encoded


# The same helper is applied to all three splits so they share one fallback
# rule; previously only validation and test were filled.
# NOTE(review): unlike TargetEncoder.fit_transform, the training rows here are
# encoded with statistics that include their own target value - confirm that
# this is intended before comparing the two encodings.
X_train_encoded = _apply_custom_encoding(X_train)
X_val_encoded = _apply_custom_encoding(X_val)
X_test_encoded = _apply_custom_encoding(X_test)

In [16]:
# save the custom-encoded splits together with the mappings used to build them
custom_target_encoded_data = dict(
    # data
    X_train=X_train_encoded,
    X_val=X_val_encoded,
    X_test=X_test_encoded,
    y_train=y_train,
    y_val=y_val,
    y_test=y_test,
    # metadata
    feature_names=X_train.columns.tolist(),
    category_mappings=category_mappings,
    # encoding-specific information
    stats=stats,  # custom encoding mappings
    global_encoding=global_encoding,  # fallback for unseen categories
    encoding_method='mean_times_median_encoding',
)

with open('../data/temp/custom_target_encoded_data.pkl', 'wb') as out_file:
    pickle.dump(custom_target_encoded_data, out_file)