In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import scipy
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
# Load the RAND HCRIS hospital file and rename its provider-number column
# to the shorter label 'Medicare Provider Number'.
file = '../gtmsa_practicum_datasets/rand_hcris_cy_hosp_a_2020_08_01.csv'
provider_col_rename = {'Medicare provider number [prvdr_num]': 'Medicare Provider Number'}

# low_memory=False makes pandas read the file in one pass instead of
# chunk-wise, which avoids mixed-dtype inference on this wide CSV.
rand = pd.read_csv(file, encoding='ISO-8859-1', low_memory=False)
rand = rand.rename(columns=provider_col_rename)

In [4]:
# Keep only the columns whose dtype is not 'object', then collapse the
# table to one row per provider by averaging every remaining column.
non_object_cols = [name for name in rand.columns.values if rand[name].dtype != 'object']
filtered_rand = rand[non_object_cols]
print(filtered_rand.shape)

provider_means = filtered_rand.groupby(['Medicare Provider Number']).mean()
# groupby moved the provider number into the index; copy it back as a
# regular (last) column before discarding the index.
provider_means['Medicare Provider Number'] = provider_means.index
filtered_rand = provider_means.reset_index(drop=True)
print(filtered_rand.shape)

(115757, 1191)
(7475, 1191)


In [5]:
# Load the 'Table 1. Hospitals' sheet of the detailed-data workbook,
# skipping the 16 rows that precede the header row.
main_data = pd.read_excel(
    '../gtmsa_practicum_datasets/Detailed_Data.xlsx',
    sheet_name='Table 1. Hospitals',
    skiprows=16,
)

# Header cells contain embedded line breaks; flatten them to spaces so the
# columns can be addressed by a single-line name.
main_data.columns = main_data.columns.str.replace('\n', ' ')

# Drop every row that has no relative price value.
price_col = 'Relative Price for Inpatient and Outpatient Services'
idx = main_data[price_col].isnull()
main_data = main_data.loc[~idx]

In [6]:
# Attach the per-provider RAND averages to each hospital's relative price.
# The join key is stated explicitly: without `on=`, pandas joins on every
# column name the two frames happen to share, so an accidental name overlap
# would silently change the join.
join_key = 'Medicare Provider Number'
target_col = 'Relative Price for Inpatient and Outpatient Services'

merged_data = pd.merge(
    main_data[[join_key, target_col]],
    filtered_rand,
    how='inner',  # keep only providers present in both datasets
    on=join_key,
)
print(merged_data.shape)

(726, 1192)


In [10]:
# Correlation of every candidate column with the relative price, stored by
# column position in merged_data. Positions that are skipped stay at 0.
target_name = 'Relative Price for Inpatient and Outpatient Services'
cols = ['Medicare Provider Number', target_name]
null_limit = 400  # columns with 400 or more missing values are skipped

corr_coef = np.zeros(merged_data.shape[1])
for col_pos, col_name in enumerate(merged_data.columns.values):
    # The join key and the target itself are not candidate features.
    if col_name in cols:
        continue
    # Too many missing values for the correlation to be considered.
    if not (np.sum(merged_data.iloc[:, col_pos].isnull()) < null_limit):
        continue
    corr_coef[col_pos] = merged_data[col_name].corr(merged_data[target_name])

# An undefined correlation (NaN) is treated as 0.
corr_coef = np.nan_to_num(corr_coef, nan=0)
abs_corr_coef = np.abs(corr_coef)

In [16]:
# Column positions ordered from the strongest to the weakest absolute
# correlation (argsort is ascending, so the result is reversed).
ascending_order = np.argsort(abs_corr_coef)
rank = ascending_order[::-1]
print(rank)

# The matching absolute correlations, also from high to low.
sorted_abs_corr = np.sort(abs_corr_coef)
print(sorted_abs_corr[::-1])

[ 836 1150  834 ...  617  616    0]
[0.39364046 0.39111401 0.33103277 ... 0.         0.         0.        ]


In [25]:
# Keep the 20 columns with the largest absolute correlation; `rank` holds
# column positions in merged_data, so the selection is positional.
n_top_features = 20
feature_select_ind = rank[:n_top_features]
print(feature_select_ind)

final_data = merged_data.iloc[:, feature_select_ind]

[ 836 1150  834 1159  833  829  707  796  611  798  837  818  762 1149
  607  608 1112  835 1111  711]
