In [26]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score

In [72]:
# Read in the IPPS provider charge datasets for fiscal years 2015-2017.
def load_IPPS_Provider_Data(p):
    """Load the FY2015-FY2017 IPPS inpatient charge CSVs into one DataFrame.

    Parameters
    ----------
    p : str
        Directory prefix. It is concatenated directly with each file name,
        so it must end with a path separator.

    Returns
    -------
    pandas.DataFrame
        The rows of the three files stacked in year order. Every source
        column is read as ``str``; an integer ``year`` column (2015, 2016 or
        2017, by file position) is added. Each file keeps its own row index,
        so index labels repeat across years.
    """
    files = ['Medicare_Provider_Charge_Inpatient_DRGALL_FY2015.csv',
             'Medicare_Provider_Charge_Inpatient_DRGALL_FY2016.csv',
             'Medicare_Provider_Charge_Inpatient_DRGALL_FY2017.csv']
    frames = []
    for index, file in enumerate(files):
        db = pd.read_csv(p + file, dtype=str)
        db['year'] = 2015 + index
        # Flatten any line breaks embedded in header names so that column
        # labels match across the yearly files.
        db.columns = db.columns.str.replace('\n', ' ', regex=False)
        frames.append(db)
    # DataFrame.append was removed in pandas 2.0 and re-copied the growing
    # frame on every iteration; concatenate once instead. The per-file index
    # is kept (no ignore_index) to match the previous result.
    return pd.concat(frames)
# Directory holding the three yearly CSV files. The trailing slash is
# required because the loader joins directory and file name with plain
# string concatenation.
path2 = './gtmsa_practicum_datasets/Medicare_Provider_Charge_Inpatient_DRG/'
# Combined 2015-2017 frame; all source columns are still strings here.
ipps = load_IPPS_Provider_Data(path2)
# Echo the frame as the cell output.
ipps

Unnamed: 0,DRG Definition,Provider Id,Provider Name,Provider Street Address,Provider City,Provider State,Provider Zip Code,Hospital Referral Region (HRR) Description,Total Discharges,Average Covered Charges,Average Total Payments,Average Medicare Payments,year
0,001 - HEART TRANSPLANT OR IMPLANT OF HEART ASS...,10033,UNIVERSITY OF ALABAMA HOSPITAL,619 SOUTH 19TH STREET,BIRMINGHAM,AL,35233,AL - Birmingham,11,1014783.45,171081.18,141193.91,2015
1,001 - HEART TRANSPLANT OR IMPLANT OF HEART ASS...,30103,MAYO CLINIC HOSPITAL,5777 EAST MAYO BOULEVARD,PHOENIX,AZ,85054,AZ - Phoenix,28,382051,194081.39,167511.96,2015
2,001 - HEART TRANSPLANT OR IMPLANT OF HEART ASS...,50025,UNIVERSITY OF CALIFORNIA SAN DIEGO MEDICAL CENTER,200 WEST ARBOR DRIVE,SAN DIEGO,CA,92103,CA - San Diego,21,971881.48,324050.71,294419.62,2015
3,001 - HEART TRANSPLANT OR IMPLANT OF HEART ASS...,50100,SHARP MEMORIAL HOSPITAL,7901 FROST ST,SAN DIEGO,CA,92123,CA - San Diego,18,1517857.61,257177.33,233532.83,2015
4,001 - HEART TRANSPLANT OR IMPLANT OF HEART ASS...,50108,"SUTTER MEDICAL CENTER, SACRAMENTO",2825 CAPITOL AVENUE,SACRAMENTO,CA,95816,CA - Sacramento,17,1470341.06,346150.12,338472,2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...
196320,683 - RENAL FAILURE W CC,670120,THE HOSPITALS OF PROVIDENCE TRANSMOUNTAIN CAMPUS,2000 TRANSMOUNTAIN RD,EL PASO,TX,79911,TX - El Paso,14,50528.14286,4924.071429,3647.5,2017
196321,690 - KIDNEY & URINARY TRACT INFECTIONS W/O MCC,670120,THE HOSPITALS OF PROVIDENCE TRANSMOUNTAIN CAMPUS,2000 TRANSMOUNTAIN RD,EL PASO,TX,79911,TX - El Paso,23,47396.65217,4251.26087,3450.217391,2017
196322,871 - SEPTICEMIA OR SEVERE SEPSIS W/O MV >96 H...,670120,THE HOSPITALS OF PROVIDENCE TRANSMOUNTAIN CAMPUS,2000 TRANSMOUNTAIN RD,EL PASO,TX,79911,TX - El Paso,22,147342.1818,18504.95455,13261.09091,2017
196323,470 - MAJOR JOINT REPLACEMENT OR REATTACHMENT ...,670122,HOUSTON METHODIST THE WOODLANDS HOSPITAL,17201 INTERSTATE 45 SOUTH,THE WOODLANDS,TX,77385,TX - Houston,25,118123,11911.4,11016.52,2017


In [52]:
# IPPS data manipulation: strip '$' signs and thousands separators from the
# string columns, then convert them to float (not integer).
# The pattern is a raw string so that '\$' is not treated as an (invalid)
# Python string escape.
ipps['Average Total Payments'] = ipps['Average Total Payments'].replace(r'[\$,]', '', regex=True).astype(float)
ipps['Total Discharges'] = ipps['Total Discharges'].replace(',', '', regex=True).astype(float)



In [46]:
# Investigate the cardinality of the categorical features.
# Only the value of the last expression is echoed as the cell output; the
# counts in the comments were noted by the author (presumably from earlier
# runs of each line).
# DRG Definition: 633 distinct values
ipps['DRG Definition'].nunique()
# Provider Name: 3434 distinct values
ipps['Provider Name'].nunique()
# Hospital Referral Region (HRR) Description: 307 distinct values
ipps['Hospital Referral Region (HRR) Description'].nunique()

307

In [47]:
# Drop the categorical (text) columns that are not used in the numeric
# aggregation; several of them have too many distinct values to encode.
# errors='ignore' keeps this cell re-runnable: ipps is reassigned in place,
# so on a second run the columns are already gone and a plain drop would
# raise KeyError.
ipps = ipps.drop(
    columns=[
        'DRG Definition',
        'Provider Name',
        'Provider Street Address',
        'Provider City',
        'Provider State',
        'Provider Zip Code',
        'Hospital Referral Region (HRR) Description',
    ],
    errors='ignore',
)





In [48]:
# Echo the frame to inspect the remaining columns.
ipps

Unnamed: 0,Provider Id,Total Discharges,Average Covered Charges,Average Total Payments,Average Medicare Payments,year
0,10033,11.0,1014783.45,171081.180000,141193.91,2015
1,30103,28.0,382051,194081.390000,167511.96,2015
2,50025,21.0,971881.48,324050.710000,294419.62,2015
3,50100,18.0,1517857.61,257177.330000,233532.83,2015
4,50108,17.0,1470341.06,346150.120000,338472,2015
...,...,...,...,...,...,...
196320,670120,14.0,50528.14286,4924.071429,3647.5,2017
196321,670120,23.0,47396.65217,4251.260870,3450.217391,2017
196322,670120,22.0,147342.1818,18504.954550,13261.09091,2017
196323,670122,25.0,118123,11911.400000,11016.52,2017


In [69]:
# Convert the remaining string columns to numbers.
# '$' signs and thousands separators are stripped first, the same way as for
# 'Average Total Payments'; without this, errors='coerce' would silently turn
# any value formatted like "$1,234.50" into NaN and the later per-provider
# mean would skip it.
# NOTE(review): whether the source files actually contain such formatting in
# these two columns is not visible here - confirm against the raw CSVs.
ipps['Average Covered Charges'] = pd.to_numeric(
    ipps['Average Covered Charges'].replace(r'[\$,]', '', regex=True),
    errors='coerce',
)
ipps['Average Medicare Payments'] = pd.to_numeric(
    ipps['Average Medicare Payments'].replace(r'[\$,]', '', regex=True),
    errors='coerce',
)
# Any non-numeric provider id becomes NaN here and is therefore left out of
# a groupby on 'Provider Id'.
ipps['Provider Id'] = pd.to_numeric(ipps['Provider Id'], errors='coerce')

In [70]:
# Aggregate the numeric features to one row per provider: the plain
# (unweighted) mean of each column over all of that provider's rows,
# i.e. across DRGs and across 2015-2017.
ipps_uniq = ipps.groupby('Provider Id').agg(
    dict.fromkeys(
        [
            'Total Discharges',
            'Average Covered Charges',
            'Average Total Payments',
            'Average Medicare Payments',
        ],
        'mean',
    )
)

ipps_uniq

Unnamed: 0_level_0,Total Discharges,Average Covered Charges,Average Total Payments,Average Medicare Payments
Provider Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10001,44.413580,46542.879396,10036.619396,8745.075506
10005,38.671111,18542.219135,7384.554970,6177.151901
10006,40.125698,45627.314107,9535.423762,8057.019397
10007,21.946429,12823.581219,5494.060902,4558.554809
10008,18.625000,15053.301478,6163.267981,5535.033934
...,...,...,...,...
670112,41.000000,117879.271627,28541.372983,26828.692337
670116,29.000000,145493.983335,25640.225000,18517.209520
670119,16.000000,141627.875000,14726.437500,13357.000000
670120,15.222222,71595.689113,7312.766134,5788.862531


In [71]:
# Export the per-provider averages to a comma-separated, UTF-8 encoded CSV
# file (not an Excel workbook). The 'Provider Id' index is written as the
# first column.
ipps_uniq.to_csv(
    "ipps_selected.csv",
    sep=',',
    encoding='utf-8',
)