In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from math import sqrt

In [2]:
# Load the merged dataset, reading every column as a string so that no value
# is altered by dtype inference at read time.
data = pd.read_csv('../gtmsa_practicum_datasets/merged_final_data.csv', dtype=str)

# Move the response column to the last position; all other columns keep
# their original relative order.
response_col = 'relative price for inpatient and outpatient services'
feature_cols = [col for col in data.columns if col != response_col]
data = data[feature_cols + [response_col]]
print(data.shape)


(3329, 87)


In [3]:
# Keep only the labeled rows, i.e. those whose response value is present.
# `idx` is True where the response is missing; the surviving rows are
# renumbered from 0.
idx = data['relative price for inpatient and outpatient services'].isna()
dt = data.loc[~idx].reset_index(drop=True)
print(dt.shape)
dt.head()

(1628, 87)


Unnamed: 0,medicare provider number,hospital name,street address,city,state,zip code,"hospital system or, if independent, ipps/cah",is hospital a critical access hospital (y/n)?,"hospital compare 5-star rating (october 2018, na=not available)",number of outpatient services,...,Revenue from rental of living quarters [other_income_livingrental],Revenue from laundry and linen service [other_income_laundry],Parking lot receipts [other_income_parking],Rebates and refunds of expenses [other_income_rebates],Nursing and allied health managed care payment [mdcr_ipps_nurs_mgdcare_pymts],Net organ acquisition cost [mdcr_ipps_net_organ_cost],Cost of teaching physicians [mdcr_ipps_teach_phys_cost],Routine service other pass through costs [mdcr_ipps_routine_passthru_cost],medicare provider number.1,relative price for inpatient and outpatient services
0,10006,Eliza Coffee Memorial Hospital,205 Marengo Street,Florence,AL,35631,Lifepoint Health,N,3,118,...,,,,,,,,,,241
1,10029,East Alabama Medical Center,2000 Pepperell\nParkway,Opelika,AL,36801,East Alabama Medical\nCenter,N,4,603,...,,,,,,,,,,174
2,10033,University Of Alabama Hospital,619 South 19th Street,Birmingham,AL,35233,UAB Health System,N,2,345,...,,,,,,,,,,322
3,10039,Huntsville Hospital,101 Sivley Rd,Huntsville,AL,35801,Huntsville Hospital\nHealth System,N,2,1091,...,,,,,,,,,,245
4,10056,St Vincents Birmingham,810 St Vincent's Drive,Birmingham,AL,35205,Ascension Health,N,4,107,...,,,,,,,,,,174


In [4]:
# Seed reused by the train/test splits and the models below.
random_state = 100
# The relative-price column is the response; every other column of `dt` is
# kept as a candidate feature.
y_data = dt["relative price for inpatient and outpatient services"]
x_data = dt.drop(columns="relative price for inpatient and outpatient services")

# Linear Regression

In [None]:
# Build the numeric design matrix and target for the regression models.
random_state = 100
# Coerce every feature column to numbers: values that cannot be parsed become
# NaN and are then replaced by 0 together with the originally missing values.
x = x_data.apply(pd.to_numeric, errors='coerce').fillna(0)
# Same treatment for the response.
# NOTE(review): an unparseable response value would end up as a target of 0 —
# confirm that this is intended rather than dropping such rows.
y = pd.to_numeric(y_data, errors='coerce').fillna(0)
# Hold out 25% of the rows (shuffled, seeded) as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    shuffle=True,
    random_state=random_state,
)
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

In [None]:
# Fit an ordinary least-squares model on the training split.
lin_regr = LinearRegression()
lin_regr.fit(X=x_train, y=y_train)

In [None]:
# In-sample check: RMSE of the linear model on the rows it was fitted on.
y_pred_train = lin_regr.predict(x_train)
rmse_lin_regr_train = sqrt(mean_squared_error(y_true=y_train, y_pred=y_pred_train))
print(rmse_lin_regr_train)

In [None]:
# Out-of-sample check: RMSE of the linear model on the held-out rows.
y_pred_test = lin_regr.predict(x_test)
rmse_lin_regr_test = sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_test))
print(rmse_lin_regr_test)

In [None]:
# Attach the in-sample predictions to the labeled dataset.
# The predictions are indexed by the training rows' labels, so the left join
# on the index leaves `y_pred` empty for every row outside the training split.
y_hats_df = pd.DataFrame({'y_pred': y_pred_train}, index=x_train.index.copy())
df_out_lin_regr_train = dt.merge(
    y_hats_df,
    how='left',
    left_index=True,
    right_index=True,
)
df_out_lin_regr_train.shape

In [None]:
# Attach the out-of-sample predictions to the labeled dataset.
# The predictions are indexed by the test rows' labels, so the left join on
# the index leaves `y_pred` empty for every row outside the test split.
y_hats_df = pd.DataFrame({'y_pred': y_pred_test}, index=x_test.index.copy())
df_out_lin_regr_test = dt.merge(
    y_hats_df,
    how='left',
    left_index=True,
    right_index=True,
)
df_out_lin_regr_test.shape

In [None]:
# Merge the train and test predictions into one column: take the training
# prediction where present, otherwise fall back to the test prediction.
final_df_out = (
    df_out_lin_regr_train['y_pred']
    .combine_first(df_out_lin_regr_test['y_pred'])
)
# Re-attach the combined predictions to the labeled dataset by row index.
y_hats_df = pd.DataFrame(data=final_df_out, columns=['y_pred'], index=dt.index.copy())
final_df_out = dt.merge(y_hats_df, how='left', left_index=True, right_index=True)
display(final_df_out)

# Random Forest Regression

In [None]:
# Random forest regressor: 100 trees, no depth limit, seeded with the shared
# random_state.
regr = RandomForestRegressor(
    n_estimators=100,
    max_depth=None,
    random_state=random_state,
)
regr.fit(X=x_train, y=y_train)

In [None]:
# In-sample RMSE of the random forest (predictions on the training rows).
y_pred = regr.predict(x_train)
mse_random_forest_train = mean_squared_error(y_true=y_train, y_pred=y_pred)
rmse_random_forest_train = sqrt(mse_random_forest_train)
print(rmse_random_forest_train)

In [None]:
# Attach the predictions currently held in `y_pred` to the labeled dataset,
# aligned on the training rows' index labels; other rows get no prediction.
y_hats_df = pd.DataFrame({'y_pred': y_pred}, index=x_train.index.copy())
df_out_random_forest_train = dt.merge(
    y_hats_df,
    how='left',
    left_index=True,
    right_index=True,
)

In [None]:
# Out-of-sample RMSE of the random forest (predictions on the held-out rows).
y_pred = regr.predict(x_test)
mse_random_forest_test = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse_random_forest_test = sqrt(mse_random_forest_test)
print(rmse_random_forest_test)

In [None]:
# Attach the predictions currently held in `y_pred` to the labeled dataset,
# aligned on the test rows' index labels; other rows get no prediction.
y_hats_df = pd.DataFrame({'y_pred': y_pred}, index=x_test.index.copy())
df_out_random_forest_test = dt.merge(
    y_hats_df,
    how='left',
    left_index=True,
    right_index=True,
)

In [None]:
# Combine the random-forest predictions into a single column.
# Each row received a prediction in only one of the two frames (train or
# test), so combine_first fills the gaps of one with the values of the other.
# Without this step `final_df_out` would still hold the linear-regression
# result and the table below would show the wrong model's predictions.
final_df_out = df_out_random_forest_train['y_pred'].combine_first(df_out_random_forest_test['y_pred'])
# Re-attach the combined predictions to the labeled dataset by row index.
y_hats_df = pd.DataFrame(data = final_df_out, columns = ['y_pred'], index = dt.index.copy())
final_df_out = pd.merge(dt, y_hats_df, how = 'left', left_index = True, right_index = True)
display(final_df_out)

# Classification

In [5]:
# Bucket the response variable into five classes.

def _to_numeric_if_possible(column):
    """Return `column` converted to a numeric dtype, or unchanged if any value cannot be parsed.

    Explicit replacement for `pd.to_numeric(column, errors='ignore')`, whose
    'ignore' option is deprecated in recent pandas releases.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Columns that are entirely numeric become numbers; all others stay as text.
dt = dt.apply(_to_numeric_if_possible)
# Split the response range into 5 equal-width intervals and name them.
dt['cost_category'] = pd.cut(
    dt['relative price for inpatient and outpatient services'],
    5,
    labels=["very low", "low", "medium", "high", "very high"],
)
dt

Unnamed: 0,medicare provider number,hospital name,street address,city,state,zip code,"hospital system or, if independent, ipps/cah",is hospital a critical access hospital (y/n)?,"hospital compare 5-star rating (october 2018, na=not available)",number of outpatient services,...,Revenue from laundry and linen service [other_income_laundry],Parking lot receipts [other_income_parking],Rebates and refunds of expenses [other_income_rebates],Nursing and allied health managed care payment [mdcr_ipps_nurs_mgdcare_pymts],Net organ acquisition cost [mdcr_ipps_net_organ_cost],Cost of teaching physicians [mdcr_ipps_teach_phys_cost],Routine service other pass through costs [mdcr_ipps_routine_passthru_cost],medicare provider number.1,relative price for inpatient and outpatient services,cost_category
0,10006,Eliza Coffee Memorial Hospital,205 Marengo Street,Florence,AL,35631,Lifepoint Health,N,3.0,118,...,,,,,,,,,241,low
1,10029,East Alabama Medical Center,2000 Pepperell\nParkway,Opelika,AL,36801,East Alabama Medical\nCenter,N,4.0,603,...,,,,,,,,,174,very low
2,10033,University Of Alabama Hospital,619 South 19th Street,Birmingham,AL,35233,UAB Health System,N,2.0,345,...,,,,,,,,,322,low
3,10039,Huntsville Hospital,101 Sivley Rd,Huntsville,AL,35801,Huntsville Hospital\nHealth System,N,2.0,1091,...,,,,,,,,,245,low
4,10056,St Vincents Birmingham,810 St Vincent's Drive,Birmingham,AL,35205,Ascension Health,N,4.0,107,...,,,,,,,,,174,very low
5,10083,South Baldwin Regional\nMedical Center,1613 North Mckenzie\nStreet,Foley,AL,36535,Community Health\nSystems,N,3.0,177,...,,,,,,,,,216,low
6,10092,Dch Regional Medical Center,809 University\nBoulevard East,Tuscaloosa,AL,35401,DCH Health System,N,2.0,122,...,,,,,,,,,214,low
7,10104,Grandview Medical Center,3690 Grandview\nParkway,Birmingham,AL,35243,Community Health\nSystems,N,2.0,76,...,,,,,,,,,150,very low
8,10113,Mobile Infirmary Medical\nCenter,5 Mobile Infirmary\nCircle,Mobile,AL,36652,Infirmary Health\nSystem,N,1.0,58,...,,,,,,,,,130,very low
9,10131,Crestwood Medical Center,One Hospital Dr Se,Huntsville,AL,35801,Community Health\nSystems,N,2.0,83,...,,,,,,,,,175,very low


In [6]:
# Split the dataset into train and test sets for classification.

# "cost_category" is produced by binning the relative-price column, so that
# column must be removed from the features together with the label itself;
# keeping it would let the classifier read the answer straight from its input.
x_data = dt.drop(columns=["cost_category", "relative price for inpatient and outpatient services"])
y_data = dt.loc[:, "cost_category"]

# Coerce the features to numbers; unparseable and missing values become 0.
x = x_data.apply(pd.to_numeric, errors='coerce').fillna(0)

# Encode the class labels as integers; the fitted encoder is kept so that
# predictions can be mapped back to the original label names.
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(y_data)

# Hold out 25% of the rows (shuffled, seeded) as the test set.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, shuffle=True, random_state=random_state)


In [7]:
# Random forest classifier: 100 trees, no depth limit, seeded with the shared
# random_state.
rf_class = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    random_state=random_state,
)
rf_class.fit(X=x_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=100,
                       verbose=0, warm_start=False)

In [8]:
# Predict on both splits and map the integer codes back to the original
# category labels with the encoder fitted earlier.
y_pred_train = labelencoder.inverse_transform(rf_class.predict(x_train))
y_pred_test = labelencoder.inverse_transform(rf_class.predict(x_test))



In [10]:
# Attach the out-of-sample class predictions to the dataset, aligned on the
# test rows' index labels; rows outside the test split get no prediction.
# (The variable keeps its `lin_regr` name, but at this point it holds the
# predictions stored in `y_pred_test` by the classification section.)
y_hats_df = pd.DataFrame({'y_pred': y_pred_test}, index=x_test.index.copy())
df_out_lin_regr_test = dt.merge(
    y_hats_df,
    how='left',
    left_index=True,
    right_index=True,
)


In [11]:
# Attach the in-sample class predictions to the dataset, aligned on the
# training rows' index labels; rows outside the training split get no
# prediction. (The variable keeps its `lin_regr` name, but at this point it
# holds the predictions stored in `y_pred_train` by the classification section.)
y_hats_df = pd.DataFrame({'y_pred': y_pred_train}, index=x_train.index.copy())
df_out_lin_regr_train = dt.merge(
    y_hats_df,
    how='left',
    left_index=True,
    right_index=True,
)
df_out_lin_regr_train.shape

NameError: name 'y_pred_train' is not defined

In [None]:
# Merge the train and test predictions into one column: take the training
# prediction where present, otherwise fall back to the test prediction.
final_df_out = (
    df_out_lin_regr_train['y_pred']
    .combine_first(df_out_lin_regr_test['y_pred'])
)
# Re-attach the combined predictions to the dataset by row index.
y_hats_df = pd.DataFrame(data=final_df_out, columns=['y_pred'], index=dt.index.copy())
final_df_out = dt.merge(y_hats_df, how='left', left_index=True, right_index=True)
display(final_df_out)