In [1]:
import numpy as np
import pandas as pd

In [2]:
# read in IPPS dataset from 2015-2017
def load_IPPS_Provider_Data(p):
    """Load the FY2015-FY2017 IPPS provider charge CSVs into one DataFrame.

    Parameters
    ----------
    p : str
        Directory prefix of the CSV files. It is concatenated directly with
        each file name, so it must end with a path separator.

    Returns
    -------
    pandas.DataFrame
        Rows of all three files stacked in year order, with an added integer
        ``year`` column, ``Provider Id`` left-padded with zeros to six
        characters, and the discharge/charge/payment columns converted to
        float. The index is NOT reset: each file keeps its own row numbers,
        so index labels repeat across years.
    """
    files = ['Medicare_Provider_Charge_Inpatient_DRGALL_FY2015.csv',
             'Medicare_Provider_Charge_Inpatient_DRGALL_FY2016.csv',
             'Medicare_Provider_Charge_Inpatient_DRGALL_FY2017.csv']
    cols = ['Total Discharges', 'Average Covered Charges',
            'Average Total Payments', 'Average Medicare Payments']

    frames = []
    for index, file in enumerate(files):
        # Read everything as text so the zero padding of ids and the
        # "$1,234.56" formatting survive until they are handled explicitly.
        db = pd.read_csv(p + file, dtype=str)
        db['year'] = 2015 + index
        # Some header cells contain embedded line breaks; flatten them.
        db.columns = db.columns.str.replace('\n', ' ')
        frames.append(db)

    # Single concat instead of DataFrame.append in a loop: append was removed
    # in pandas 2.0 and copied the accumulated frame on every iteration.
    df = pd.concat(frames)

    # .str.zfill leaves missing ids as NaN instead of raising on them.
    df['Provider Id'] = df['Provider Id'].str.zfill(6)

    for col in cols:
        # Strip "$" and thousands separators, then parse as float.
        df[col] = df[col].replace(r'[\$,]', '', regex=True).astype(float)
    return df
# Directory holding the yearly IPPS CSV exports. The loader concatenates this
# prefix with each file name, so the trailing slash is required.
path2 = '../../../../gtmsa_practicum_datasets/Medicare_Provider_Charge_Inpatient_DRG/'
# Combined 2015-2017 frame used by the rest of the notebook.
ipps = load_IPPS_Provider_Data(p=path2)

In [3]:
def convert_ipps_grouped_data(dt):
    """Average the numeric IPPS columns per provider and year, in wide form.

    Parameters
    ----------
    dt : pandas.DataFrame
        Frame containing at least 'Provider Id', 'year' and the four numeric
        columns listed below; any other columns are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per 'Provider Id'. Columns are a (measure, year) MultiIndex
        holding the mean of each measure for that provider and year, plus a
        'medicare provider number' column that copies the index.
    """
    keys = ['Provider Id', 'year']
    measures = ['Total Discharges', 'Average Covered Charges',
                'Average Total Payments', 'Average Medicare Payments']

    # Mean of every measure for each (provider, year) pair.
    yearly_means = dt[keys + measures].groupby(by=keys).mean()

    # Pivot the year level out of the index into the columns.
    wide = yearly_means.unstack('year')

    # Keep the provider id as a regular column as well as the index.
    wide['medicare provider number'] = wide.index
    return wide
# Wide per-provider table: one row per provider, one column per (measure, year).
data = convert_ipps_grouped_data(dt=ipps)

In [4]:
# Persist the wide table. index=False omits the 'Provider Id' index; the id is
# still written through the 'medicare provider number' column.
data.to_csv(
    path_or_buf='../gtmsa_practicum_datasets/IPPS_final_data.csv',
    index=False,
)

In [5]:
# Show the column names of the combined frame as a NumPy array.
ipps.columns.values

array(['DRG Definition', 'Provider Id', 'Provider Name',
       'Provider Street Address', 'Provider City', 'Provider State',
       'Provider Zip Code', 'Hospital Referral Region (HRR) Description',
       'Total Discharges', 'Average Covered Charges',
       'Average Total Payments', 'Average Medicare Payments', 'year'],
      dtype=object)

In [6]:
# IPPS data manipulation: strip "$" / thousands separators and cast to float
# (not integer -- the payment columns carry cents).
# NOTE(review): load_IPPS_Provider_Data already converts both columns to float,
# so on its output this cell should be a no-op; confirm before removing it.
# Raw strings avoid the invalid "\$" escape warning of a plain string literal.
ipps['Average Total Payments'] = ipps['Average Total Payments'].replace(r'[\$,]', '', regex=True).astype(float)
ipps['Total Discharges'] = ipps['Total Discharges'].replace(r',', '', regex=True).astype(float)



In [7]:
# Number of rows recorded under each provider name.
Provide_name_counts = ipps['Provider Name'].value_counts()
# Boolean mask: names that occur more than 100 times.
idx = Provide_name_counts > 100
# Fraction of distinct provider names above that threshold.
print(idx.mean())
Provide_name_counts.loc[idx]

0.5253348864298194


GOOD SAMARITAN HOSPITAL                1563
METHODIST HOSPITAL                     1436
ST JOSEPH MEDICAL CENTER               1263
FLORIDA HOSPITAL                       1202
COVENANT MEDICAL CENTER                1158
                                       ... 
LECONTE MEDICAL CENTER                  101
CAPITAL MEDICAL CENTER                  101
LEE'S SUMMIT MEDICAL CENTER             101
BANNER DEL E WEBB MEDICAL CENTER        101
SAN ANGELO COMMUNITY MEDICAL CENTER     101
Name: Provider Name, Length: 1804, dtype: int64

In [8]:
# Investigate the cardinality of the categorical features.
# Only the last expression of the cell is displayed; the numbers in the
# comments are presumably the values noted from earlier runs.
# 633
ipps['DRG Definition'].nunique()
# 3434
ipps['Provider Name'].nunique()
# 307
ipps['Hospital Referral Region (HRR) Description'].nunique()

307

In [9]:
# Drop the categorical features that have too many distinct values.
# Rebinding `ipps` means this cell cannot be re-run without reloading the data:
# a second run raises KeyError because the columns are already gone.
ipps = ipps.drop(
    labels=[
        'DRG Definition',
        'Provider Name',
        'Provider Street Address',
        'Provider City',
        'Provider State',
        'Provider Zip Code',
        'Hospital Referral Region (HRR) Description',
    ],
    axis='columns',
)





In [10]:
# Display the frame that remains after the categorical columns were dropped.
ipps

Unnamed: 0,Provider Id,Total Discharges,Average Covered Charges,Average Total Payments,Average Medicare Payments,year
0,010033,11.0,1.014783e+06,171081.180000,141193.910000,2015
1,030103,28.0,3.820510e+05,194081.390000,167511.960000,2015
2,050025,21.0,9.718815e+05,324050.710000,294419.620000,2015
3,050100,18.0,1.517858e+06,257177.330000,233532.830000,2015
4,050108,17.0,1.470341e+06,346150.120000,338472.000000,2015
...,...,...,...,...,...,...
196320,670120,14.0,5.052814e+04,4924.071429,3647.500000,2017
196321,670120,23.0,4.739665e+04,4251.260870,3450.217391,2017
196322,670120,22.0,1.473422e+05,18504.954550,13261.090910,2017
196323,670122,25.0,1.181230e+05,11911.400000,11016.520000,2017


In [11]:
# Coerce the remaining columns to numeric; values that cannot be parsed become
# NaN (errors='coerce') instead of raising.
# NOTE(review): casting 'Provider Id' to a number drops the zero padding added
# by the loader (e.g. '010033' -> 10033) and would turn any non-numeric id into
# NaN -- confirm that downstream joins expect numeric ids.
for _col in ('Average Covered Charges', 'Average Medicare Payments', 'Provider Id'):
    ipps[_col] = pd.to_numeric(ipps[_col], errors='coerce')

In [12]:
# Aggregate: per-provider mean of each numeric feature over all 2015-2017 rows.
ipps_uniq = ipps.groupby('Provider Id')[[
    'Total Discharges',
    'Average Covered Charges',
    'Average Total Payments',
    'Average Medicare Payments',
]].mean()

ipps_uniq

Unnamed: 0_level_0,Total Discharges,Average Covered Charges,Average Total Payments,Average Medicare Payments
Provider Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10001,44.413580,47368.595346,10036.619396,8804.755886
10005,38.671111,18247.278187,7384.554970,6097.033073
10006,40.125698,46338.491969,9535.423762,8286.482090
10007,21.946429,12564.707055,5494.060902,4594.803892
10008,18.625000,15143.466109,6163.267981,5098.147950
...,...,...,...,...
670112,41.000000,117879.271627,28541.372983,26828.692337
670116,29.000000,145493.983335,25640.225000,18517.209520
670119,16.000000,141627.875000,14726.437500,13357.000000
670120,15.222222,71595.689113,7312.766134,5788.862531


In [13]:
# Export the per-provider averages to CSV (comma-separated and UTF-8 encoded,
# which are the to_csv defaults). 'Provider Id' is written as the index column.
ipps_uniq.to_csv("ipps_selected.csv")