In [1]:
import pandas as pd
import janitor
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import TargetEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from importlib.metadata import version

# Report the library versions this run used, for reproducibility.
print(f"Pandas version: {pd.__version__}")
print(f"Janitor version: {janitor.__version__}")
print(f"Numpy version: {np.__version__}")
# fastparquet and scikit-learn are looked up through importlib.metadata
print(f"Fastparquet version: {version('fastparquet')}")
print(f"Scikit-learn version: {version('scikit-learn')}")

Pandas version: 2.3.3
Janitor version: 0.32.1
Numpy version: 2.3.4
Fastparquet version: 2024.11.0
Scikit-learn version: 1.7.2


In [2]:
# Column dtypes used when reading the raw CSV: every column is read as a
# pandas "string" except the two monetary columns, which are float64.
dtype_map = {
    **dict.fromkeys(
        [
            "Health Service Area",
            "Hospital County",
            "Operating Certificate Number",
            "Permanent Facility Id",
            "Facility Name",
            "Age Group",
            "Zip Code",
            "Gender",
            "Race",
            "Ethnicity",
            "Length of Stay",
            "Type of Admission",
            "Patient Disposition",
            "Discharge Year",
            "CCSR Diagnosis Code",
            "CCSR Diagnosis Description",
            "CCSR Procedure Code",
            "CCSR Procedure Description",
            "APR DRG Code",
            "APR DRG Description",
            "APR MDC Code",
            "APR MDC Description",
            "APR Severity of Illness Code",
            "APR Severity of Illness Description",
            "APR Risk of Mortality",
            "APR Medical Surgical Description",
            "Payment Typology 1",
            "Payment Typology 2",
            "Payment Typology 3",
            "Birth Weight",
            "Emergency Department Indicator",
        ],
        "string",
    ),
    "Total Charges": "float64",
    "Total Costs": "float64",
}

In [3]:
# Read the raw discharge file with explicit dtypes and clean the column names
# with janitor's `clean_names` (the cleaned names are what later cells index by).
df_in = pd.read_csv(
    '../data/untouched/Hospital_Inpatient_Discharges_(SPARCS_De-Identified)__2024_20251106.csv',
    dtype = dtype_map,
).clean_names()

# Keep only the influenza discharges and renumber the rows from 0.
df_in = df_in.loc[df_in['ccsr_diagnosis_description'] == 'INFLUENZA'].reset_index(drop = True)

In [4]:
df_clean = df_in.copy()

# rows whose facility identity was redacted; computed once and reused below
is_redacted = df_clean['facility_name'] == 'Redacted for Confidentiality'

# replace missing facility characteristics with "NAA" (i.e., N/A abortion-related)
missing_cols_abortion = ['health_service_area', 'hospital_county', 'operating_certificate_number',
                         'permanent_facility_id', 'zip_code']

df_clean.loc[is_redacted, missing_cols_abortion] = (
    df_clean.loc[is_redacted, missing_cols_abortion].fillna('NAA')
)

# replace other missing zip_code values with "NAS" (i.e., N/A small sample)
df_clean.loc[~is_redacted, 'zip_code'] = df_clean.loc[~is_redacted, 'zip_code'].fillna('NAS')

# length_of_stay is read as a string column because of the top-coded "120+" value.
# Replace it with the *string* "120" (not the int 120) so the column stays
# homogeneous until the single cast to int64 below.
df_clean['length_of_stay'] = df_clean['length_of_stay'].replace('120+', '120').astype('int64')

# encode a missing secondary/tertiary payer or procedure as the explicit category "None"
for col in ['payment_typology_2', 'payment_typology_3', 'ccsr_procedure_description']:
    df_clean[col] = df_clean[col].fillna("None")

In [5]:
"""drop redundent and unusable columns
   - operating_certificate_number and facility_name are redundant with permanent_facility_id and not as granular, keep permanent_facility_id only
   - discharge_year, ccsr_diagnosis_code, and ccsr_diagnosis_description have no variation (only 1 value)
   - ccsr_procedure_code, apr_drg_code, apr_mdc_code, and apr_severity_of_illness_code are redundent
   - birth_weight is 99.5% missing
   - total_charges and total_costs will be dropped because they would not be known during the visit and are partially derived from length of stay
"""
df_clean.drop(['operating_certificate_number', 'facility_name', 'discharge_year', 
               'ccsr_procedure_code', 'apr_drg_code', 'apr_mdc_code', 'apr_severity_of_illness_code', 
               'ccsr_diagnosis_code', 'ccsr_diagnosis_description', 'birth_weight', 
               'total_charges', 'total_costs'], axis = 1, inplace = True)

In [6]:
# natural log of length_of_stay; this column is the regression target used in the split cell
df_clean['log_length_of_stay'] = df_clean['length_of_stay'].pipe(np.log)

In [7]:
def create_small_group_mapping(df, feature, min_count = 20):
    """Build a replacement mapping that folds rare categories into 'Other'.

    Parameters
    ----------
    df : DataFrame containing the column `feature`.
    feature : name of the categorical column to inspect.
    min_count : categories occurring fewer than this many times are remapped.

    Returns
    -------
    dict mapping each rare category to the string 'Other' (empty if none are rare).
    Missing values are not counted by `value_counts`, so they never appear as keys.
    """
    frequency = df[feature].value_counts()

    return {
        category: 'Other'
        for category, occurrences in frequency.items()
        if occurrences < min_count
    }

In [8]:
# Hand-picked merges: each listed level is folded into one combined label.
category_mappings = {
    'health_service_area': dict.fromkeys(['Southern Tier', 'NAA'], 'Southern Tier/Other'),
    'gender': dict.fromkeys(['F', 'U'], 'F/U'),
    'ethnicity': dict.fromkeys(['Multi-ethnic', 'Unknown'], 'Multi-ethnic/Unknown'),
    'type_of_admission': dict.fromkeys(
        ['Elective', 'Trauma', 'Not Available'], 'Elective/Trauma/Other'
    ),
    'payment_typology_1': dict.fromkeys(
        ['Federal/State/Local/VA', 'Department of Corrections'], 'Miscellaneous/Other'
    ),
    'payment_typology_3': dict.fromkeys(
        ['Medicare', 'Federal/State/Local/VA', 'Managed Care, Unspecified'], 'Miscellaneous/Other'
    ),
}

df = df_clean.copy()

# Data-driven merges: for these high-cardinality columns, every level seen
# fewer than 20 times is mapped to 'Other'.
category_mappings.update({
    rare_feature: create_small_group_mapping(df, rare_feature, min_count = 20)
    for rare_feature in [
        'hospital_county',
        'permanent_facility_id',
        'zip_code',
        'patient_disposition',
        'ccsr_procedure_description',
        'apr_drg_description',
    ]
})

In [9]:
# Rewrite each mapped column in one pass; levels not listed in a mapping are left untouched.
df = df.assign(**{
    feature: df[feature].replace(mapping)
    for feature, mapping in category_mappings.items()
})

In [10]:
# 80/10/10 train/validation/test split.
# Predictors are everything except the raw and log-transformed target.
X = df.drop(['length_of_stay', 'log_length_of_stay'], axis = 1)
y = df.loc[:, 'log_length_of_stay']

# First carve off 10% of all rows as the test set ...
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size = 0.1, random_state = 42
)

# ... then take 0.1 / 0.9 = 0.1111111 of the remainder as the validation set,
# which is again ~10% of the full data.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size = 0.1111111, random_state = 42
)

tuple(len(part) for part in (X_train, X_val, X_test))

(5769, 722, 722)

In [11]:
# Target-encode every predictor column against the (log) target.
# target_type = 'continuous' selects regression-style encoding; smooth = 'auto'
# lets the encoder choose how strongly category means are shrunk.
encoder = TargetEncoder(target_type = 'continuous', smooth = 'auto', random_state = 42)

# The encoder is fitted on the training rows only (fit_transform); validation and
# test rows go through transform with the already-fitted encoder.
# NOTE(review): per the scikit-learn docs, fit_transform uses internal cross fitting
# and categories unseen during fit are encoded with the target mean; confirm
# against the installed version.
# Each encoded array is wrapped back into a DataFrame that keeps the source
# frame's column names and index.
X_train_encoded, X_val_encoded, X_test_encoded = (
    pd.DataFrame(encoded_values, columns = source.columns, index = source.index)
    for encoded_values, source in (
        (encoder.fit_transform(X_train, y_train), X_train),
        (encoder.transform(X_val), X_val),
        (encoder.transform(X_test), X_test),
    )
)

In [12]:
"""
# lasso regression - which features can be removed?
# standardize predictors
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_val_scaled = scaler.transform(X_val_encoded)

# lasso model
lasso = Lasso(alpha = 0.001, max_iter = 10000)
lasso.fit(X_train_scaled, y_train)

coef_df = pd.DataFrame({
    "feature": X_train_encoded.columns,
    "coefficient": lasso.coef_
})

# features removed (coefficient exactly 0)
removed_features = coef_df[coef_df["coefficient"] == 0]["feature"].tolist()
print("Features removed by Lasso (coef = 0):")
print(removed_features)

# features retained
retained_features = coef_df[coef_df["coefficient"] != 0]["feature"].tolist()
print(f"\nNumber of retained features: {len(retained_features)}")
print("Some retained features:")
print(retained_features)
"""

'\n# lasso regression - which features can be removed?\n# standardize predictors\nscaler = StandardScaler()\nX_train_scaled = scaler.fit_transform(X_train_encoded)\nX_val_scaled = scaler.transform(X_val_encoded)\n\n# lasso model\nlasso = Lasso(alpha = 0.001, max_iter = 10000)\nlasso.fit(X_train_scaled, y_train)\n\ncoef_df = pd.DataFrame({\n    "feature": X_train_encoded.columns,\n    "coefficient": lasso.coef_\n})\n\n# features removed (coefficient exactly 0)\nremoved_features = coef_df[coef_df["coefficient"] == 0]["feature"].tolist()\nprint("Features removed by Lasso (coef = 0):")\nprint(removed_features)\n\n# features retained\nretained_features = coef_df[coef_df["coefficient"] != 0]["feature"].tolist()\nprint(f"\nNumber of retained features: {len(retained_features)}")\nprint("Some retained features:")\nprint(retained_features)\n'

In [13]:
"""
# subset training and validation sets to retained features
X_train_reduced = X_train_encoded[retained_features]
X_val_reduced = X_val_encoded[retained_features]

# fit OLS on log-transformed target
ols_reduced = LinearRegression()
ols_reduced.fit(X_train_reduced, y_train)

# predict on validation set (log scale)
y_val_pred_log = ols_reduced.predict(X_val_reduced)

# back-transform to original scale
y_val_pred = np.exp(y_val_pred_log)
y_val_orig = np.exp(y_val)

# evaluate
rmse = np.sqrt(mean_squared_error(y_val_orig, y_val_pred))
mae = mean_absolute_error(y_val_orig, y_val_pred)
r2 = r2_score(y_val_orig, y_val_pred)

print("OLS on lasso-selected features (log scale, evaluated on original scale):")
print(f"RMSE: {rmse:.3f}, MAE: {mae:.3f}, R2: {r2:.3f}")
"""

'\n# subset training and validation sets to retained features\nX_train_reduced = X_train_encoded[retained_features]\nX_val_reduced = X_val_encoded[retained_features]\n\n# fit OLS on log-transformed target\nols_reduced = LinearRegression()\nols_reduced.fit(X_train_reduced, y_train)\n\n# predict on validation set (log scale)\ny_val_pred_log = ols_reduced.predict(X_val_reduced)\n\n# back-transform to original scale\ny_val_pred = np.exp(y_val_pred_log)\ny_val_orig = np.exp(y_val)\n\n# evaluate\nrmse = np.sqrt(mean_squared_error(y_val_orig, y_val_pred))\nmae = mean_absolute_error(y_val_orig, y_val_pred)\nr2 = r2_score(y_val_orig, y_val_pred)\n\nprint("OLS on lasso-selected features (log scale, evaluated on original scale):")\nprint(f"RMSE: {rmse:.3f}, MAE: {mae:.3f}, R2: {r2:.3f}")\n'

In [14]:
# Standardize the target-encoded predictors. The scaler is fitted on the training
# rows only and then applied unchanged to the validation rows.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_encoded),
    columns = X_train_encoded.columns,   # keep feature names and row index
    index = X_train_encoded.index,
)
X_val_scaled = pd.DataFrame(
    scaler.transform(X_val_encoded),
    columns = X_val_encoded.columns,
    index = X_val_encoded.index,
)

# Fit OLS on the standardized features; the target is log(length_of_stay).
ols_standardized = LinearRegression()
ols_standardized.fit(X_train_scaled, y_train)

# Predict on the validation set (log scale) ...
y_val_pred_log_std = ols_standardized.predict(X_val_scaled)

# ... and back-transform predictions and targets to the original scale.
# The back-transform is computed once; `y_val_pred` is kept as an alias because
# earlier versions of this cell exposed the same values under both names.
y_val_pred_std = np.exp(y_val_pred_log_std)
y_val_pred = y_val_pred_std
y_val_orig = np.exp(y_val)

# Evaluate on the original scale.
rmse_std = np.sqrt(mean_squared_error(y_val_orig, y_val_pred_std))
mae_std = mean_absolute_error(y_val_orig, y_val_pred_std)
r2_std = r2_score(y_val_orig, y_val_pred_std)

print("\nOLS with standardized features (evaluated on original scale):")
print(f"RMSE: {rmse_std:.3f}, MAE: {mae_std:.3f}, R2: {r2_std:.3f}")


OLS with standardized features (evaluated on original scale):
RMSE: 4.631, MAE: 2.149, R2: 0.442


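The cells above make up the full, cleaned-up pipeline. Next, package its steps (preprocessing, target encoding, scaling, and the fitted model) into a single object and save it to a file.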

In [15]:
import pickle
import pandas as pd
import numpy as np

class LOSPipeline:
    """Bundle the fitted preprocessing objects and model into a single predictor.

    Parameters
    ----------
    encoder : fitted object exposing ``transform(X)`` (the target encoder).
    scaler : fitted object exposing ``transform(X)`` (the standard scaler).
    model : fitted object exposing ``predict(X)``; it predicts log length of stay.
    features : list of str
        Feature column names, in the order the encoder was fitted on.
    category_mappings : dict[str, dict] or None
        Per-column ``{old_level: new_level}`` replacements applied before encoding.
        ``None`` is treated as "no mappings".
    """

    # columns whose missing values become "NAA" on redacted-facility rows
    _REDACTED_FILL_COLUMNS = ('health_service_area', 'hospital_county', 'operating_certificate_number',
                              'permanent_facility_id', 'zip_code')
    # columns whose missing values become the explicit category "None"
    _NONE_FILL_COLUMNS = ('payment_typology_2', 'payment_typology_3', 'ccsr_procedure_description')

    def __init__(self, encoder, scaler, model, features, category_mappings = None):
        self.encoder = encoder
        self.scaler = scaler
        self.model = model
        self.features = features
        self.category_mappings = category_mappings or {}

    def preprocess(self, df):
        """Return a cleaned copy of `df`; the input frame is not modified.

        Applies the same missing-value fills and category merges as the training
        cells. Every step is skipped for columns that are absent from `df`, so a
        payload that omits non-feature columns (e.g. ``operating_certificate_number``)
        no longer raises ``KeyError``.
        """
        df = df.copy()

        # Only touch the fill columns that are actually present in this input.
        redacted_fill_cols = [col for col in self._REDACTED_FILL_COLUMNS if col in df.columns]

        if 'facility_name' in df.columns:
            # Fill NA with empty string so the comparison yields a plain True/False mask
            is_redacted = df['facility_name'].fillna('') == "Redacted for Confidentiality"
            if redacted_fill_cols:
                df.loc[is_redacted, redacted_fill_cols] = (
                    df.loc[is_redacted, redacted_fill_cols].fillna("NAA")
                )
            if 'zip_code' in df.columns:
                df.loc[~is_redacted, 'zip_code'] = df.loc[~is_redacted, 'zip_code'].fillna("NAS")
        elif 'zip_code' in df.columns:
            # No facility_name column - no row can be identified as redacted, so use the default fill
            df['zip_code'] = df['zip_code'].fillna("NAS")

        # Fill other NA columns
        for col in self._NONE_FILL_COLUMNS:
            if col in df.columns:
                df[col] = df[col].fillna("None")

        # Apply category mappings
        for feature, mapping in self.category_mappings.items():
            if feature in df.columns:
                df[feature] = df[feature].replace(mapping)

        return df

    def predict(self, df):
        """Predict length of stay (original scale) for each row of `df`.

        Returns a NumPy array with one value per input row: the model's
        log-scale prediction passed through ``np.exp``.
        """
        df = self.preprocess(df)

        # Any feature missing from the input is filled with the placeholder category "None"
        for col in self.features:
            if col not in df.columns:
                df[col] = "None"

        # Subset to features in the order the encoder expects
        df_features = df[self.features]

        # Target encoding, then back to a DataFrame so feature names are kept
        df_encoded = pd.DataFrame(self.encoder.transform(df_features), columns = self.features)

        # Standardize, again keeping feature names
        df_scaled = pd.DataFrame(self.scaler.transform(df_encoded), columns = self.features)

        # Predict log-length-of-stay and back-transform
        log_pred = self.model.predict(df_scaled)
        return np.exp(log_pred)

# -------------------------
# save pipeline
# -------------------------
# Assemble the fitted encoder, scaler and OLS model into one object. The feature
# list is taken from the encoder so it matches the columns it was fitted on.
pipeline = LOSPipeline(
    encoder,
    scaler,
    ols_standardized,
    features = list(encoder.feature_names_in_),
    category_mappings = category_mappings,
)

# NOTE(review): pickle stores a reference to the LOSPipeline class (module + name),
# not its code. Whatever loads this file must be able to resolve LOSPipeline under
# the same module path it had here - confirm the app defines or imports it accordingly.
with open("../app/pipeline_v1.bin", "wb") as pipeline_file:
    pickle.dump(pipeline, pipeline_file)
print("Pipeline saved!")

Pipeline saved!


In [16]:
# One raw discharge record (cleaned column names) used to smoke-test the pipeline.
data_point = dict(
    health_service_area = 'New York City',
    hospital_county = 'New York',
    operating_certificate_number = '7002032',
    permanent_facility_id = '001469',
    facility_name = 'MOUNT SINAI MORNINGSIDE',
    age_group = '70 or Older',
    zip_code = '100',
    gender = 'F',
    race = 'Other Race',
    ethnicity = 'Spanish/Hispanic',
    length_of_stay = 3,
    type_of_admission = 'Emergency',
    patient_disposition = 'Home w/ Home Health Services',
    discharge_year = 2024,
    ccsr_diagnosis_code = 'RSP003',
    ccsr_diagnosis_description = 'INFLUENZA',
    ccsr_procedure_code = pd.NA,
    ccsr_procedure_description = pd.NA,
    apr_drg_code = '113',
    apr_drg_description = 'INFECTIONS OF UPPER RESPIRATORY TRACT',
    apr_mdc_code = '03',
    apr_mdc_description = 'EAR, NOSE, MOUTH, THROAT AND CRANIOFACIAL DISEASES',
    apr_severity_of_illness_code = '3',
    apr_severity_of_illness_description = 'Major',
    apr_risk_of_mortality = 'Major',
    apr_medical_surgical_description = 'Medical',
    payment_typology_1 = 'Medicare',
    payment_typology_2 = 'Medicare',
    payment_typology_3 = pd.NA,
    birth_weight = pd.NA,
    emergency_department_indicator = 'Y',
    total_charges = 52009.26,
    total_costs = 11007.46,
)

In [17]:
# Wrap the single record in a list so pandas builds a one-row DataFrame
# with one column per key of `data_point`.
df_test = pd.DataFrame([data_point])

# `predict` returns one back-transformed (original-scale) prediction per input row;
# with a one-row frame, element 0 is the estimate for this record.
pred = pipeline.predict(df_test)
print("Predicted length of stay:", pred[0])

Predicted length of stay: 4.360550806716047


In [18]:
# Second smoke-test record: the same values as `data_point` except gender is 'M',
# so the two predictions can be compared on that one field.
data_point2 = dict(
    health_service_area = 'New York City',
    hospital_county = 'New York',
    operating_certificate_number = '7002032',
    permanent_facility_id = '001469',
    facility_name = 'MOUNT SINAI MORNINGSIDE',
    age_group = '70 or Older',
    zip_code = '100',
    gender = 'M',
    race = 'Other Race',
    ethnicity = 'Spanish/Hispanic',
    length_of_stay = 3,
    type_of_admission = 'Emergency',
    patient_disposition = 'Home w/ Home Health Services',
    discharge_year = 2024,
    ccsr_diagnosis_code = 'RSP003',
    ccsr_diagnosis_description = 'INFLUENZA',
    ccsr_procedure_code = pd.NA,
    ccsr_procedure_description = pd.NA,
    apr_drg_code = '113',
    apr_drg_description = 'INFECTIONS OF UPPER RESPIRATORY TRACT',
    apr_mdc_code = '03',
    apr_mdc_description = 'EAR, NOSE, MOUTH, THROAT AND CRANIOFACIAL DISEASES',
    apr_severity_of_illness_code = '3',
    apr_severity_of_illness_description = 'Major',
    apr_risk_of_mortality = 'Major',
    apr_medical_surgical_description = 'Medical',
    payment_typology_1 = 'Medicare',
    payment_typology_2 = 'Medicare',
    payment_typology_3 = pd.NA,
    birth_weight = pd.NA,
    emergency_department_indicator = 'Y',
    total_charges = 52009.26,
    total_costs = 11007.46,
)

In [19]:
# Wrap the second record in a list so pandas builds a one-row DataFrame
# with one column per key of `data_point2`.
df_test2 = pd.DataFrame([data_point2])

# Same call as for the first record; `pred` is overwritten with the new result,
# and element 0 is the original-scale estimate for this record.
pred = pipeline.predict(df_test2)
print("Predicted length of stay:", pred[0])

Predicted length of stay: 4.250121581733489
