In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import scipy
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the RAND HCRIS hospital extract and give the provider-number column
# the short name used as the join key in the rest of the notebook.
# NOTE(review): this path climbs four directory levels, whereas the other
# cells use '../gtmsa_practicum_datasets' -- confirm which location is right.
file = '../../../../gtmsa_practicum_datasets/rand_hcris_cy_hosp_a_2020_08_01.csv'
rand = pd.read_csv(
    file,
    encoding='ISO-8859-1',
    low_memory=False,
).rename(
    columns={'Medicare provider number [prvdr_num]': 'medicare provider number'}
)

In [3]:
# Drop every object-typed column, then collapse the remaining columns to one
# row per provider by averaging all rows that share a provider number.
filtered_rand = rand.loc[:, rand.dtypes != 'object']
print(filtered_rand.shape)

filtered_rand = filtered_rand.groupby(['medicare provider number']).mean()
# groupby moved the provider number into the index; copy it back as a regular
# (last) column and fall back to a plain positional index.
filtered_rand['medicare provider number'] = filtered_rand.index
filtered_rand = filtered_rand.reset_index(drop=True)
print(filtered_rand.shape)

(115757, 1191)
(7475, 1191)


In [4]:
# Load the detailed dataset, flatten newlines embedded in the column headers,
# and keep only the rows that have a relative price.
main_data = pd.read_csv('../gtmsa_practicum_datasets/detailed_final_data.csv', sep=',')
main_data = main_data.rename(columns=lambda header: header.replace('\n', ' '))

# Boolean mask of rows whose relative price is missing.
idx = main_data['relative price for inpatient and outpatient services'].isna()
main_data = main_data.loc[~idx]

In [5]:
# Identifier and target columns taken from the detailed dataset; every other
# column of the merged frame comes from the aggregated RAND data.
cols = ['medicare provider number','relative price for inpatient and outpatient services']

# Name the join key explicitly. Without `on=`, pandas joins on every column
# name the two frames happen to share, so the key would change silently if a
# shared column name ever appeared.
merged_data = pd.merge(
    main_data[cols],
    filtered_rand,
    how='inner',
    on='medicare provider number',
)

In [6]:
# Correlate each RAND feature with the relative price. Entries are stored at
# the column's position in merged_data; skipped columns keep a value of 0.
corr_coef = np.zeros(merged_data.shape[1])
for position, column_name in enumerate(merged_data.columns.values):
    if column_name in cols:
        # Identifier / target columns are not candidate features.
        continue
    if merged_data.iloc[:, position].isnull().sum() >= 400:
        # Too sparse: 400 or more missing values (the original author's note
        # cites 726 rows in total).
        continue
    corr_coef[position] = merged_data[column_name].corr(
        merged_data['relative price for inpatient and outpatient services']
    )

# An undefined correlation (NaN) is treated as "no correlation".
corr_coef = np.nan_to_num(corr_coef, nan=0)
abs_corr_coef = np.abs(corr_coef)

In [7]:
# Column positions ordered from the strongest to the weakest absolute
# correlation (argsort is ascending, hence the reversal).
rank = np.argsort(abs_corr_coef)[::-1]

# The matching absolute correlations, also strongest first.
sorted_abs_corr = np.sort(abs_corr_coef)[::-1]

print(rank)
print(sorted_abs_corr)
print(rank[::-1])  # same positions, weakest first
print(sorted_abs_corr)
print(sorted_abs_corr[:30])

[1159  836  798 ...  654  653    0]
[0.29315973 0.2753966  0.25549565 ... 0.         0.         0.        ]
[   0  653  654 ...  798  836 1159]
[0.29315973 0.2753966  0.25549565 ... 0.         0.         0.        ]
[0.29315973 0.2753966  0.25549565 0.24787882 0.24781429 0.23320999
 0.21788403 0.20631478 0.20011599 0.19932239 0.19874098 0.19841476
 0.19717729 0.19280343 0.19002009 0.18509575 0.18307256 0.18202043
 0.17324757 0.1689091  0.16861938 0.16628637 0.16542381 0.16242966
 0.16127567 0.15909582 0.15773177 0.15386933 0.15297601 0.15250968]


In [8]:
# Export the 30 features most strongly correlated with the relative price,
# together with the provider number used as the join key.
#
# `rank` is already ordered from the highest to the lowest absolute
# correlation, so its first 30 entries are the strongest features. Reversing
# it again before slicing would select the 30 weakest (zero-correlation)
# columns instead.
top_positions = rank[:30]
final_cols = list(merged_data.columns[top_positions])

# Add the identifier only if it is not already present, so the exported file
# never contains the same column twice.
if 'medicare provider number' not in final_cols:
    final_cols.append('medicare provider number')

merged_data[final_cols].to_csv('../gtmsa_practicum_datasets/rand_final_data.csv',index=False)