In [1]:
from sklearn import preprocessing
import matplotlib.pyplot as plt

from sklearn.impute import KNNImputer
import pandas as pd
import numpy as np
import seaborn as sns
import pickle, dill


In [2]:
# Load the persisted regressor together with the preprocessing objects stored next to it.
# NOTE(review): pickle.load executes arbitrary code from the file, so this artifact must
# come from a trusted source.
# The four preprocessing names bound here are bound again by the classifier load in the
# following cell, so only the classifier's versions remain in scope afterwards.
with open('extratrees_regressor.pk', 'rb') as file:
    regressor_bundle = pickle.load(file)
(extratrees_model, knn_imputer, scalers,
 cols_scaled, cols_knn_imp) = regressor_bundle

In [3]:
# Load the persisted classifier, its preprocessing objects and the label encoder.
# NOTE(review): dill.load, like pickle.load, executes arbitrary code from the file, so
# this artifact must come from a trusted source.
# NOTE(review): this rebinds knn_imputer, scalers, cols_scaled and cols_knn_imp that the
# regressor cell bound just before; confirm both models were trained with the same
# preprocessing objects, because only these ones are used from here on.
with open('classifier.pk', 'rb') as file:
    classifier_bundle = dill.load(file)
(classifier, knn_imputer, scalers,
 cols_scaled, cols_knn_imp, labelencoder) = classifier_bundle

In [4]:
def feat_prep(knn_imputer, scalers, cols_scaled, cols_knn_imp,
              data_path='../gtmsa_practicum_datasets/merged_final_data.csv'):
    """Build the feature frame for the rows of the merged dataset that have no target value.

    Steps, in order:
      1. Read ``data_path`` with every column as a string and keep only the rows whose
         'relative price for inpatient and outpatient services' is missing.
      2. Add the indicator columns 'HS Independent ipps', 'HS Independent cah' and
         'HS critical'.
      3. Drop the descriptive hospital columns and strip '%' from the two
         facility-services relative-price columns.
      4. Convert each column in ``cols_scaled`` to numeric and pass it through the
         scaler at the same position in ``scalers``.
      5. Run ``knn_imputer.transform`` on the scaled columns.
      6. Append every remaining column converted with ``pd.to_numeric``.
      7. Use 'medicare provider number' as the index.

    Parameters
    ----------
    knn_imputer : object
        Fitted imputer; only ``transform(frame)`` is called, on the scaled columns.
    scalers : sequence
        One fitted scaler per entry of ``cols_scaled``; only ``transform`` is called,
        with a single-column 2-D array.
    cols_scaled : sequence of str
        Names of the columns to scale and impute.
    cols_knn_imp : sequence of str
        Accepted to keep the call signature stable; this function does not read it.
    data_path : str, optional
        CSV file to read. Defaults to the merged practicum dataset.

    Returns
    -------
    pandas.DataFrame
        One row per unlabelled record, indexed by 'medicare provider number'.

    Raises
    ------
    ValueError
        If ``scalers`` and ``cols_scaled`` differ in length (``zip`` would otherwise
        silently leave the trailing columns unscaled), or if ``pd.to_numeric`` meets a
        value it cannot parse.
    """
    cols_scaled = list(cols_scaled)
    scalers = list(scalers)
    if len(scalers) != len(cols_scaled):
        raise ValueError(
            'scalers and cols_scaled must have the same length, got {} scalers '
            'for {} columns'.format(len(scalers), len(cols_scaled))
        )

    target_col = 'relative price for inpatient and outpatient services'
    system_col = 'hospital system or, if independent, ipps/cah'
    cah_col = 'is hospital a critical access hospital (y/n)?'

    # Read the data and keep only the rows without a target value.
    # reset_index gives a fresh 0..n-1 index, which the column-wise alignment with the
    # imputed frame further down relies on.
    df = pd.read_csv(data_path, dtype=str)
    df = df[df[target_col].isnull()].reset_index(drop=True)
    print(df.shape)

    # Indicator columns derived from the hospital-system and critical-access columns.
    df['HS Independent ipps'] = (df[system_col] == 'Independent (IPPS)').astype(int)
    df['HS Independent cah'] = (df[system_col] == 'Independent (CAH)').astype(int)
    # Values other than 'Y'/'N' (including missing) map to NaN.
    df['HS critical'] = df[cah_col].map({'Y': 1, 'N': 0})

    # Drop the descriptive columns and the two source columns encoded above.
    df = df.drop(columns=['hospital name', 'street address', 'city', 'state', 'zip code',
                          system_col, cah_col])

    # Remove the percent sign so the values can be parsed as numbers.
    for pct_col in ('relative price for inpatient facility services',
                    'relative price for outpatient facility services'):
        df[pct_col] = df[pct_col].str.replace('%', '', regex=False)
    print(df.shape)

    cols_not_scaled = [name for name in df.columns.values if name not in cols_scaled]

    # No call below visibly draws random numbers; the seed is kept so the global NumPy
    # RNG state after this function matches earlier runs.
    np.random.seed(101)

    # Explicit copy: the scaled values are written column by column into this frame.
    dt = df[cols_scaled].copy()
    for col, scaler in zip(cols_scaled, scalers):
        column_values = pd.to_numeric(dt[col]).values.reshape(-1, 1)
        dt[col] = np.asarray(scaler.transform(column_values)).ravel()

    dt = pd.DataFrame(knn_imputer.transform(dt), columns=dt.columns.values)

    # Add the columns that were neither scaled nor imputed, converted to numeric.
    # NOTE(review): this also converts 'medicare provider number', so any leading zeros
    # present in the CSV are lost - confirm downstream consumers expect a numeric id.
    for name in cols_not_scaled:
        dt[name] = pd.to_numeric(df[name])

    # Use the medicare provider number as the index instead of a feature.
    return dt.set_index('medicare provider number')
# Build the feature frame for the unlabelled rows using the preprocessing objects
# currently in scope, then report its dimensions.
data = feat_prep(
    knn_imputer=knn_imputer,
    scalers=scalers,
    cols_scaled=cols_scaled,
    cols_knn_imp=cols_knn_imp,
)
print(data.shape)

(1701, 87)
(1701, 83)
(1701, 82)


In [5]:
# Columns passed to the models, in the order used for prediction.
cols = ['hospital compare 5-star rating (october 2018, na=not available)',
       'number of outpatient services',
       'total private allowed amount for outpatient services ($ millions)',
       'simulated medicare allowed amount for outpatient services ($ millions)',
       'relative price for outpatient services',
       'standardized price per outpatient service',
       'number of inpatient stays',
       'simulated medicare allowed amount for inpatient services ($ millions)',
       'relative price for inpatient services',
       'standardized price per inpatient stay',
       'total private allowed amount for inpatient and outpatient services ($ millions)',
       'simulated medicare allowed amount for inpatient and outpatient services ($ millions)',
       'relative price for professional inpatient and outpatient services',
       'relative price for inpatient facility services',
       'relative price for outpatient facility services',
       'total private allowed amount for facility inpatient and outpatient services ($ millions)',
       'simulated medicare allowed amount for facility inpatient and outpatient services ($ millions)',
       'simulated medicare allowed amount for professional inpatient and outpatient services ($ millions)',
       'Total Discharges.1', 'Average Covered Charges.1','Average Total Payments.1',
       'Average Medicare Payments', 
       'Average Estimated Total Submitted Charges.2',
       'Average Medicare Allowed Amount.2',
       'Average Medicare Payment Amount.2',
       'Outlier Comprehensive APC Services',
       'Outlier Comprehensive APC Services.1',
       'Outlier Comprehensive APC Services.2',
       'Average Medicare Outlier Amount', 'Average Medicare Outlier Amount.1',
       'Average Medicare Outlier Amount.2', 'HS critical',
       'HS Independent ipps', 'HS Independent cah']
# Take an explicit copy of the selection: later cells add columns to `data`, and
# assigning into a frame derived by column selection triggers pandas'
# chained-assignment warning on versions without copy-on-write.
data = data[cols].copy()

In [6]:
# Keep an independent (deep) copy of the selected feature columns; `data` receives
# extra prediction columns later, while `X` stays features-only.
X = data.copy(deep=True)

In [7]:
# Predict from the features-only copy `X`, not from `data`: this cell adds prediction
# columns to `data`, so predicting on `data` would hand the classifier extra columns
# if the cell were executed a second time.
y_pred = classifier.predict(X)
y_reg_pred = extratrees_model.predict(X)

# Rebuild `data` from `X` plus the predictions so the cell yields the same frame on
# every run. Columns are appended in this order: encoded class label, decoded class
# name, regressed relative price.
data = X.assign(**{
    'cost_category_labels': y_pred,
    'cost_category': labelencoder.inverse_transform(y_pred),
    'relative price for inpatient and outpatient services': y_reg_pred,
})

In [8]:
# Display the feature frame together with the prediction columns added in the previous cell.
data

Unnamed: 0_level_0,"hospital compare 5-star rating (october 2018, na=not available)",number of outpatient services,total private allowed amount for outpatient services ($ millions),simulated medicare allowed amount for outpatient services ($ millions),relative price for outpatient services,standardized price per outpatient service,number of inpatient stays,simulated medicare allowed amount for inpatient services ($ millions),relative price for inpatient services,standardized price per inpatient stay,...,Outlier Comprehensive APC Services.2,Average Medicare Outlier Amount,Average Medicare Outlier Amount.1,Average Medicare Outlier Amount.2,HS critical,HS Independent ipps,HS Independent cah,cost_category_labels,cost_category,relative price for inpatient and outpatient services
medicare provider number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,0.197698,0.308809,0.306807,0.436937,0.049049,0.012307,0.241617,0.162412,0.291041,0.192580,...,0.089966,0.311784,0.263736,0.084678,0,0,0,2,medium,156.00
10005,0.000000,0.259760,0.144645,0.156156,0.363864,0.231131,0.214590,0.161445,0.349547,0.247875,...,0.078829,0.000000,0.000000,0.082824,0,0,0,2,medium,201.95
10011,0.197698,0.186186,0.198699,0.284785,0.181181,0.057476,0.213526,0.152277,0.339402,0.309549,...,0.540269,0.216511,0.240752,0.526420,0,0,0,2,medium,175.57
10012,0.197698,0.087087,0.110110,0.156156,0.036537,0.013722,0.471410,0.411333,0.271709,0.158422,...,0.296363,0.360876,0.198637,0.300943,0,0,0,2,medium,149.16
10019,0.471471,0.304805,0.289289,0.405405,0.041542,0.007378,0.386512,0.355953,0.085962,0.214086,...,0.298194,0.237208,0.188816,0.289615,0,0,0,2,medium,121.81
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
670083,0.456894,0.419920,0.289289,0.000000,0.946947,0.728978,0.387467,0.410536,0.469438,0.479555,...,0.927765,0.358811,0.989937,0.987716,0,1,0,2,medium,293.47
670087,0.216717,0.327327,0.289289,0.000000,0.975501,0.841978,0.136699,0.172297,0.668751,0.563804,...,0.314167,0.000000,0.337454,0.327631,0,1,0,2,medium,328.64
670090,0.519207,0.134134,0.000000,0.000000,0.881667,0.584229,0.367734,0.441629,0.404060,0.557154,...,0.000000,0.107453,0.470054,0.000000,0,1,0,2,medium,283.00
670107,0.544294,0.681866,0.561562,0.463964,0.985127,0.892646,0.221284,0.155593,0.494870,0.386364,...,0.000000,0.000000,0.107232,0.000000,0,0,0,2,medium,385.00


In [9]:
# Write the features and predictions to CSV; the index (medicare provider number) is
# written as the first column because to_csv keeps the index by default.
predictions_path = '../gtmsa_practicum_datasets/predictions_on_unlabelled_set.csv'
data.to_csv(predictions_path)