In [None]:
import pickle
import pandas as pd
import numpy as np

# define the LOSPipeline class
class LOSPipeline:
    """Inference pipeline: raw records -> cleaned features -> encoded -> scaled -> prediction.

    Args:
        encoder: Object exposing ``transform(DataFrame)``. Its output is wrapped in a
            DataFrame with one column per entry of ``features``, so it must return
            exactly that many columns.
        scaler: Object exposing ``transform(DataFrame)`` with the same column-count
            requirement as ``encoder``.
        model: Object exposing ``predict(DataFrame)``. Its output is treated as being on
            a log scale and is exponentiated by :meth:`predict`.
        features: Ordered collection of column names fed to the encoder.
        category_mappings: Optional ``{column: mapping}`` dict; each mapping is passed to
            ``Series.replace`` for that column during preprocessing. ``None`` means no
            remapping.
    """

    # `facility_name` value that switches a row to the "NAA" fill below.
    _REDACTED_FACILITY_NAME = "Redacted for Confidentiality"

    # Columns whose gaps become "NAA" on rows carrying the redacted facility name.
    _REDACTED_FACILITY_COLS = (
        'health_service_area',
        'hospital_county',
        'operating_certificate_number',
        'permanent_facility_id',
        'zip_code',
    )

    # Columns whose gaps are replaced by the literal string "None".
    _NONE_FILLED_COLS = (
        'payment_typology_2',
        'payment_typology_3',
        'ccsr_procedure_description',
    )

    def __init__(self, encoder, scaler, model, features, category_mappings = None):
        self.encoder = encoder
        self.scaler = scaler
        self.model = model
        self.features = features
        self.category_mappings = category_mappings or {}

    def preprocess(self, df):
        """Fill missing values and apply the category mappings.

        Every column is optional: a rule is applied only when the column it targets is
        present, so partial records never raise ``KeyError``.

        Fill rules:
          * rows whose ``facility_name`` equals "Redacted for Confidentiality": gaps in
            the facility/location columns become ``"NAA"``;
          * every other row: a gap in ``zip_code`` becomes ``"NAS"``;
          * gaps in the payment-typology 2/3 and procedure-description columns become
            ``"None"``.

        Args:
            df: Input records. It is copied first and never modified.

        Returns:
            A new DataFrame with the fills and mappings applied.
        """
        df = df.copy()

        has_zip = 'zip_code' in df.columns

        if 'facility_name' in df.columns:
            # A missing facility name counts as "not redacted".
            redacted = df['facility_name'].fillna('') == self._REDACTED_FACILITY_NAME

            # Series.mask returns a new, upcast-if-needed column, which avoids writing
            # strings in place into a column that may hold another dtype.
            for col in self._REDACTED_FACILITY_COLS:
                if col in df.columns:
                    df[col] = df[col].mask(redacted & df[col].isna(), "NAA")

            if has_zip:
                # Redacted rows were already filled above, so only the rest remain.
                df['zip_code'] = df['zip_code'].mask(
                    ~redacted & df['zip_code'].isna(), "NAS"
                )
        elif has_zip:
            df['zip_code'] = df['zip_code'].fillna("NAS")

        for col in self._NONE_FILLED_COLS:
            if col in df.columns:
                df[col] = df[col].fillna("None")

        for feature, mapping in self.category_mappings.items():
            if feature in df.columns:
                df[feature] = df[feature].replace(mapping)

        return df

    def predict(self, df):
        """Predict for each row of ``df``.

        Args:
            df: Raw records. Feature columns that are absent are created and filled
                with the string ``"None"`` before encoding.

        Returns:
            ``np.exp`` of the model's output, one value per input row.
        """
        df = self.preprocess(df)

        # Materialise once so the same ordered names are used for selection and labels.
        feature_names = list(self.features)

        for col in feature_names:
            if col not in df.columns:
                df[col] = "None"

        df_features = df[feature_names]

        # The encoder and scaler outputs are re-labelled with the feature names before
        # being handed to the next stage.
        df_encoded = pd.DataFrame(self.encoder.transform(df_features), columns=feature_names)
        df_scaled = pd.DataFrame(self.scaler.transform(df_encoded), columns=feature_names)

        log_pred = self.model.predict(df_scaled)
        return np.exp(log_pred)

# now load the pipeline
# Read the serialized artifact fully, then rebuild the object from the bytes.
# NOTE: unpickling can execute code embedded in the file, so only load artifacts
# that come from a trusted source.
# NOTE(review): presumably the file holds a LOSPipeline instance, which would be why
# the class is defined before this point — confirm.
with open("../app/pipeline_v1.bin", "rb") as f_in:
    pipeline = pickle.loads(f_in.read())

In [None]:
# One example record, keyed by column name. Absent values are given as pd.NA.
data_point = dict(
    # facility / location
    health_service_area='New York City',
    hospital_county='New York',
    operating_certificate_number='7002032',
    permanent_facility_id='001469',
    facility_name='MOUNT SINAI MORNINGSIDE',
    # patient
    age_group='70 or Older',
    zip_code='100',
    gender='F',
    race='Other Race',
    ethnicity='Spanish/Hispanic',
    # stay
    length_of_stay=3,
    type_of_admission='Emergency',
    patient_disposition='Home w/ Home Health Services',
    discharge_year=2024,
    # diagnosis / procedure
    ccsr_diagnosis_code='RSP003',
    ccsr_diagnosis_description='INFLUENZA',
    ccsr_procedure_code=pd.NA,
    ccsr_procedure_description=pd.NA,
    # APR grouping
    apr_drg_code='113',
    apr_drg_description='INFECTIONS OF UPPER RESPIRATORY TRACT',
    apr_mdc_code='03',
    apr_mdc_description='EAR, NOSE, MOUTH, THROAT AND CRANIOFACIAL DISEASES',
    apr_severity_of_illness_code='3',
    apr_severity_of_illness_description='Major',
    apr_risk_of_mortality='Major',
    apr_medical_surgical_description='Medical',
    # payment
    payment_typology_1='Medicare',
    payment_typology_2='Medicare',
    payment_typology_3=pd.NA,
    # remaining fields
    birth_weight=pd.NA,
    emergency_department_indicator='Y',
    total_charges=52009.26,
    total_costs=11007.46,
)

# Wrap the single record in a one-row DataFrame and run it through the loaded pipeline.
# NOTE(review): `data_point` also carries `length_of_stay`, the quantity named in the
# printed label below; presumably it is not among the pipeline's feature columns — confirm.
df_test = pd.DataFrame([data_point])
pred = pipeline.predict(df_test)
# Only one row was passed in, so the prediction of interest sits at position 0.
print("Predicted length of stay:", pred[0])

Predicted length of stay: 4.360550806716047
