In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from math import sqrt

In [2]:
data = pd.read_csv('../gtmsa_practicum_datasets/merged_final_data.csv',dtype=str)
# rearrange column (put reponse colkumn to the end)
data = data[[c for c in data if c not in ['relative price for inpatient and outpatient services']] 
       + ['relative price for inpatient and outpatient services']]
# remove % at the end of these 2 columns
data['relative price for inpatient facility services'] = data['relative price for inpatient facility services'].str.rstrip('%').astype('float')
data['relative price for outpatient facility services'] = data['relative price for outpatient facility services'].str.rstrip('%').astype('float')
# convert all to numeric
data = data.apply(pd.to_numeric, errors='ignore')
# convert object data type column to dummy variables
labelencoder = LabelEncoder()
cat_features=[x for x in data.columns if data[x].dtype=="object"]
for col in cat_features:
    if col in data.columns:
        i = data.columns.get_loc(col)
        data.iloc[:,i] = data.apply(lambda i:labelencoder.fit_transform(i.astype(str)), axis=0, result_type='expand')

print(data.shape)


(3329, 87)


In [3]:
# filter to only labeled data
idx = data['relative price for inpatient and outpatient services'].isnull()
dt = data[~idx].reset_index(drop=True)
# fill NaN with imputation
dt.fillna(method="ffill", inplace=True)
dt.fillna(method="bfill", inplace=True)
print(dt.shape)
dt.head()

(1628, 87)


Unnamed: 0,medicare provider number,hospital name,street address,city,state,zip code,"hospital system or, if independent, ipps/cah",is hospital a critical access hospital (y/n)?,"hospital compare 5-star rating (october 2018, na=not available)",number of outpatient services,...,Revenue from rental of living quarters [other_income_livingrental],Revenue from laundry and linen service [other_income_laundry],Parking lot receipts [other_income_parking],Rebates and refunds of expenses [other_income_rebates],Nursing and allied health managed care payment [mdcr_ipps_nurs_mgdcare_pymts],Net organ acquisition cost [mdcr_ipps_net_organ_cost],Cost of teaching physicians [mdcr_ipps_teach_phys_cost],Routine service other pass through costs [mdcr_ipps_routine_passthru_cost],medicare provider number.1,relative price for inpatient and outpatient services
0,10006,732,1231,587,1,35631,289,0,3.0,118.0,...,759030.638055,47940.640042,538219.675038,144220.931231,161604.634219,7292181.0,52477.976199,293090.969362,100002.0,241.0
1,10029,704,1177,1330,1,36801,152,0,4.0,603.0,...,759030.638055,47940.640042,538219.675038,144220.931231,161604.634219,7292181.0,52477.976199,293090.969362,100002.0,174.0
2,10033,3003,2643,152,1,35233,540,0,2.0,345.0,...,759030.638055,47940.640042,538219.675038,144220.931231,161604.634219,7292181.0,52477.976199,293090.969362,100002.0,322.0
3,10039,1124,163,817,1,35801,244,0,2.0,1091.0,...,759030.638055,47940.640042,538219.675038,144220.931231,161604.634219,7292181.0,52477.976199,293090.969362,100002.0,245.0
4,10056,2587,3003,152,1,35205,23,0,4.0,107.0,...,759030.638055,47940.640042,538219.675038,144220.931231,161604.634219,7292181.0,52477.976199,293090.969362,100002.0,174.0


In [4]:
# split dataset to x and y
random_state = 100
x_data = dt.loc[:, dt.columns != "relative price for inpatient and outpatient services"]
y_data = dt.loc[:, "relative price for inpatient and outpatient services"]

# Linear Regression

In [5]:
#split dataset
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.25, shuffle=True, random_state=random_state)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((1221, 86), (407, 86), (1221,), (407,))

In [6]:
# train linear regression model
lin_regr = LinearRegression()
lin_regr.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [7]:
# testing in-sample
y_pred_train = lin_regr.predict(x_train)
print(sqrt(mean_squared_error(y_train, y_pred_train)))

18.533430187428763


In [8]:
# testing out-of-sample
y_pred_test = lin_regr.predict(x_test)
print(sqrt(mean_squared_error(y_test, y_pred_test)))

22.781566031047852


In [9]:
# merge in-sample predicted value to original dataset
y_hats_df = pd.DataFrame(data = y_pred_train, columns = ['y_pred'], index = x_train.index.copy())
df_out_lin_regr_train = pd.merge(dt, y_hats_df, how = 'left', left_index = True, right_index = True)
#display(df_out_lin_regr_train)
df_out_lin_regr_train.shape

(1628, 88)

In [10]:
# merge out-of-sample predicted value to original dataset
y_hats_df = pd.DataFrame(data = y_pred_test, columns = ['y_pred'], index = x_test.index.copy())
df_out_lin_regr_test = pd.merge(dt, y_hats_df, how = 'left', left_index = True, right_index = True)

#display(df_out_lin_regr_test)
df_out_lin_regr_test.shape

(1628, 88)

In [11]:
# combine
final_df_out = df_out_lin_regr_train['y_pred'].combine_first(df_out_lin_regr_test['y_pred'])
y_hats_df = pd.DataFrame(data = final_df_out, columns = ['y_pred'], index = dt.index.copy())
final_df_out = pd.merge(dt, y_hats_df, how = 'left', left_index = True, right_index = True)
display(final_df_out.head())

Unnamed: 0,medicare provider number,hospital name,street address,city,state,zip code,"hospital system or, if independent, ipps/cah",is hospital a critical access hospital (y/n)?,"hospital compare 5-star rating (october 2018, na=not available)",number of outpatient services,...,Revenue from laundry and linen service [other_income_laundry],Parking lot receipts [other_income_parking],Rebates and refunds of expenses [other_income_rebates],Nursing and allied health managed care payment [mdcr_ipps_nurs_mgdcare_pymts],Net organ acquisition cost [mdcr_ipps_net_organ_cost],Cost of teaching physicians [mdcr_ipps_teach_phys_cost],Routine service other pass through costs [mdcr_ipps_routine_passthru_cost],medicare provider number.1,relative price for inpatient and outpatient services,y_pred
0,10006,732,1231,587,1,35631,289,0,3.0,118.0,...,47940.640042,538219.675038,144220.931231,161604.634219,7292181.0,52477.976199,293090.969362,100002.0,241.0,235.614781
1,10029,704,1177,1330,1,36801,152,0,4.0,603.0,...,47940.640042,538219.675038,144220.931231,161604.634219,7292181.0,52477.976199,293090.969362,100002.0,174.0,197.357568
2,10033,3003,2643,152,1,35233,540,0,2.0,345.0,...,47940.640042,538219.675038,144220.931231,161604.634219,7292181.0,52477.976199,293090.969362,100002.0,322.0,313.088106
3,10039,1124,163,817,1,35801,244,0,2.0,1091.0,...,47940.640042,538219.675038,144220.931231,161604.634219,7292181.0,52477.976199,293090.969362,100002.0,245.0,244.016399
4,10056,2587,3003,152,1,35205,23,0,4.0,107.0,...,47940.640042,538219.675038,144220.931231,161604.634219,7292181.0,52477.976199,293090.969362,100002.0,174.0,191.943683


# Random Forest Regression

In [12]:
regr = RandomForestRegressor(max_depth=None, random_state= random_state, n_estimators=100)
regr.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=100, verbose=0,
                      warm_start=False)

In [13]:
y_pred = regr.predict(x_train)
rmse_random_forest_train = sqrt(mean_squared_error(y_train, y_pred))
print(rmse_random_forest_train) 

7.1128840488383815


In [14]:
y_hats_df = pd.DataFrame(data = y_pred, columns = ['y_pred'], index = x_train.index.copy())
df_out_random_forest_train = pd.merge(dt, y_hats_df, how = 'left', left_index = True, right_index = True)
#display(df_out_random_forest_train)

In [15]:
y_pred = regr.predict(x_test)
rmse_random_forest_test = sqrt(mean_squared_error(y_test, y_pred))
print(rmse_random_forest_test)

23.671516680719932


In [16]:
y_hats_df = pd.DataFrame(data = y_pred, columns = ['y_pred'], index = x_test.index.copy())
df_out_random_forest_test = pd.merge(dt, y_hats_df, how = 'left', left_index = True, right_index = True)
#display(df_out_random_forest_test)

In [17]:
y_hats_df = pd.DataFrame(data = final_df_out, columns = ['y_pred'], index = dt.index.copy())
final_df_out = pd.merge(dt, y_hats_df, how = 'left', left_index = True, right_index = True)
display(final_df_out.head())

Unnamed: 0,medicare provider number,hospital name,street address,city,state,zip code,"hospital system or, if independent, ipps/cah",is hospital a critical access hospital (y/n)?,"hospital compare 5-star rating (october 2018, na=not available)",number of outpatient services,...,Revenue from laundry and linen service [other_income_laundry],Parking lot receipts [other_income_parking],Rebates and refunds of expenses [other_income_rebates],Nursing and allied health managed care payment [mdcr_ipps_nurs_mgdcare_pymts],Net organ acquisition cost [mdcr_ipps_net_organ_cost],Cost of teaching physicians [mdcr_ipps_teach_phys_cost],Routine service other pass through costs [mdcr_ipps_routine_passthru_cost],medicare provider number.1,relative price for inpatient and outpatient services,y_pred
0,10006,732,1231,587,1,35631,289,0,3.0,118.0,...,47940.640042,538219.675038,144220.931231,161604.634219,7292181.0,52477.976199,293090.969362,100002.0,241.0,235.614781
1,10029,704,1177,1330,1,36801,152,0,4.0,603.0,...,47940.640042,538219.675038,144220.931231,161604.634219,7292181.0,52477.976199,293090.969362,100002.0,174.0,197.357568
2,10033,3003,2643,152,1,35233,540,0,2.0,345.0,...,47940.640042,538219.675038,144220.931231,161604.634219,7292181.0,52477.976199,293090.969362,100002.0,322.0,313.088106
3,10039,1124,163,817,1,35801,244,0,2.0,1091.0,...,47940.640042,538219.675038,144220.931231,161604.634219,7292181.0,52477.976199,293090.969362,100002.0,245.0,244.016399
4,10056,2587,3003,152,1,35205,23,0,4.0,107.0,...,47940.640042,538219.675038,144220.931231,161604.634219,7292181.0,52477.976199,293090.969362,100002.0,174.0,191.943683


# Classification (knn)

In [35]:
# bucket reponse variable into different classes
bucket = np.array([0, 100, 400, np.inf])
dt['cost_category'] = pd.cut(dt['relative price for inpatient and outpatient services'], bucket, labels = ["low", "medium", "high"])
#dt.select_dtypes(exclude=["number"])

In [36]:
# split dataset for train and test
x = dt.loc[:, dt.columns != "cost_category"]
y_data = dt.loc[:, "cost_category"]
# factorize y
y = labelencoder.fit_transform(y_data)

# split dataset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, shuffle=True, random_state=random_state)

In [37]:
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [38]:
y_pred_train = knn.predict(x_train)
y_pred_test = knn.predict(x_test)

In [39]:
print('Confusion matrix\n' ,confusion_matrix(y_test, y_pred_test))
print('Accuracy', accuracy_score(y_test, y_pred_test))
print('Precision', precision_score(y_test, y_pred_test, average='weighted'))
print('f1-score', f1_score(y_test, y_pred_test, average='weighted'))

Confusion matrix
 [[  0   0  13]
 [  0   0   3]
 [  0   2 389]]
Accuracy 0.9557739557739557
Precision 0.9227348560681894
f1-score 0.9389638610744138


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [40]:
# convert back to original label coding
y_pred_train = labelencoder.inverse_transform(y_pred_train)
y_pred_test = labelencoder.inverse_transform(y_pred_test)

In [41]:
# merge out-of-sample predicted value to original dataset
y_hats_df = pd.DataFrame(data = y_pred_test, columns = ['y_pred'], index = x_test.index.copy())
df_out_knn_test = pd.merge(dt, y_hats_df, how = 'left', left_index = True, right_index = True)
#display(df_out_knn_test)


In [42]:
# merge in-sample predicted value to original dataset
y_hats_df = pd.DataFrame(data = y_pred_train, columns = ['y_pred'], index = x_train.index.copy())
df_out_knn_train = pd.merge(dt, y_hats_df, how = 'left', left_index = True, right_index = True)
#display(df_out_knn_train)

In [43]:
# combine
final_df_out = df_out_knn_train['y_pred'].combine_first(df_out_knn_test['y_pred'])
y_hats_df = pd.DataFrame(data = final_df_out, columns = ['y_pred'], index = dt.index.copy())
final_df_out = pd.merge(dt, y_hats_df, how = 'left', left_index = True, right_index = True)
display(final_df_out.head())

Unnamed: 0,medicare provider number,hospital name,street address,city,state,zip code,"hospital system or, if independent, ipps/cah",is hospital a critical access hospital (y/n)?,"hospital compare 5-star rating (october 2018, na=not available)",number of outpatient services,...,Parking lot receipts [other_income_parking],Rebates and refunds of expenses [other_income_rebates],Nursing and allied health managed care payment [mdcr_ipps_nurs_mgdcare_pymts],Net organ acquisition cost [mdcr_ipps_net_organ_cost],Cost of teaching physicians [mdcr_ipps_teach_phys_cost],Routine service other pass through costs [mdcr_ipps_routine_passthru_cost],medicare provider number.1,relative price for inpatient and outpatient services,cost_category,y_pred
0,10006,732,1231,587,1,35631,289,0,3.0,118.0,...,538219.675038,144220.931231,161604.634219,7292181.0,52477.976199,293090.969362,100002.0,241.0,medium,medium
1,10029,704,1177,1330,1,36801,152,0,4.0,603.0,...,538219.675038,144220.931231,161604.634219,7292181.0,52477.976199,293090.969362,100002.0,174.0,medium,medium
2,10033,3003,2643,152,1,35233,540,0,2.0,345.0,...,538219.675038,144220.931231,161604.634219,7292181.0,52477.976199,293090.969362,100002.0,322.0,medium,medium
3,10039,1124,163,817,1,35801,244,0,2.0,1091.0,...,538219.675038,144220.931231,161604.634219,7292181.0,52477.976199,293090.969362,100002.0,245.0,medium,medium
4,10056,2587,3003,152,1,35205,23,0,4.0,107.0,...,538219.675038,144220.931231,161604.634219,7292181.0,52477.976199,293090.969362,100002.0,174.0,medium,medium


# Classification (Random Forest)

In [44]:
rf_class = RandomForestClassifier(max_depth=None, random_state= random_state, n_estimators=100)
rf_class.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=100,
                       verbose=0, warm_start=False)

In [45]:
y_pred_train = rf_class.predict(x_train)
y_pred_test = rf_class.predict(x_test)

In [46]:
print('Confusion matrix\n' ,confusion_matrix(y_test, y_pred_test))
print('Accuracy', accuracy_score(y_test, y_pred_test))
print('Precision', precision_score(y_test, y_pred_test, average='weighted'))
print('f1-score', f1_score(y_test, y_pred_test, average='weighted'))

Confusion matrix
 [[ 13   0   0]
 [  0   1   2]
 [  0   0 391]]
Accuracy 0.995085995085995
Precision 0.9951110027445905
f1-score 0.9938637617209045


In [47]:
# convert back to original label coding
y_pred_train = labelencoder.inverse_transform(y_pred_train)
y_pred_test = labelencoder.inverse_transform(y_pred_test)

In [48]:
# merge out-of-sample predicted value to original dataset
y_hats_df = pd.DataFrame(data = y_pred_test, columns = ['y_pred'], index = x_test.index.copy())
df_out_rf_class_test = pd.merge(dt, y_hats_df, how = 'left', left_index = True, right_index = True)
#display(df_out_rf_class_test)


In [49]:
# merge in-sample predicted value to original dataset
y_hats_df = pd.DataFrame(data = y_pred_train, columns = ['y_pred'], index = x_train.index.copy())
df_out_rf_class_train = pd.merge(dt, y_hats_df, how = 'left', left_index = True, right_index = True)
#display(df_out_rf_class_train)

In [50]:
# combine
final_df_out = df_out_rf_class_train['y_pred'].combine_first(df_out_rf_class_test['y_pred'])
y_hats_df = pd.DataFrame(data = final_df_out, columns = ['y_pred'], index = dt.index.copy())
final_df_out = pd.merge(dt, y_hats_df, how = 'left', left_index = True, right_index = True)
display(final_df_out.head())

Unnamed: 0,medicare provider number,hospital name,street address,city,state,zip code,"hospital system or, if independent, ipps/cah",is hospital a critical access hospital (y/n)?,"hospital compare 5-star rating (october 2018, na=not available)",number of outpatient services,...,Parking lot receipts [other_income_parking],Rebates and refunds of expenses [other_income_rebates],Nursing and allied health managed care payment [mdcr_ipps_nurs_mgdcare_pymts],Net organ acquisition cost [mdcr_ipps_net_organ_cost],Cost of teaching physicians [mdcr_ipps_teach_phys_cost],Routine service other pass through costs [mdcr_ipps_routine_passthru_cost],medicare provider number.1,relative price for inpatient and outpatient services,cost_category,y_pred
0,10006,732,1231,587,1,35631,289,0,3.0,118.0,...,538219.675038,144220.931231,161604.634219,7292181.0,52477.976199,293090.969362,100002.0,241.0,medium,medium
1,10029,704,1177,1330,1,36801,152,0,4.0,603.0,...,538219.675038,144220.931231,161604.634219,7292181.0,52477.976199,293090.969362,100002.0,174.0,medium,medium
2,10033,3003,2643,152,1,35233,540,0,2.0,345.0,...,538219.675038,144220.931231,161604.634219,7292181.0,52477.976199,293090.969362,100002.0,322.0,medium,medium
3,10039,1124,163,817,1,35801,244,0,2.0,1091.0,...,538219.675038,144220.931231,161604.634219,7292181.0,52477.976199,293090.969362,100002.0,245.0,medium,medium
4,10056,2587,3003,152,1,35205,23,0,4.0,107.0,...,538219.675038,144220.931231,161604.634219,7292181.0,52477.976199,293090.969362,100002.0,174.0,medium,medium
