## Setup and Data Import

In [1]:
import sys
sys.path.insert(0, '..')

from joblib import load

import Functions as fxns
from Functions import np, pd

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from datetime import timedelta

## Claims DF

In [2]:
# !python ../Preprocessing.py

In [3]:
claims = load('../claims.pkl')

### New Columns

In [4]:
claims['AgeAtService'] = ((claims.ClaimStartDt - claims.DOB)
                              / timedelta(days=365)).astype(int)
claims['HasDied'] = claims.DOD.notna()

physician_cols = \
    claims.columns[claims.columns.str.contains('Physician')].to_list()
claims['HasAnyPhysician'] = claims[physician_cols].notna().any(axis=1)
claims['HasAllPhysicians'] = claims[physician_cols].notna().all(axis=1)

claims['ClaimDuration'] = \
    (claims.ClaimEndDt  - claims.ClaimStartDt).dt.days + 1
claims['IPDuration'] = \
    (claims.DischargeDt - claims.AdmissionDt).dt.days + 1

outpatient_claims = claims.loc[claims.IsOutpatient == 1]
inpatient_claims  = claims.loc[claims.IsOutpatient == 0]
claims['IPClaimDuration'] = \
    (inpatient_claims.ClaimEndDt
     - inpatient_claims.ClaimStartDt).dt.days + 1
claims['OPClaimDuration'] = \
    (outpatient_claims.ClaimEndDt
     - outpatient_claims.ClaimStartDt).dt.days + 1

claims['ClaimCost'] = \
    claims.InscClaimAmtReimbursed + claims.DeductibleAmtPaid
claims['PercInsCovered'] = \
    round((claims.InscClaimAmtReimbursed / claims.ClaimCost) * 100)
claims['DailyClaimCost'] = \
    round(claims.ClaimCost / claims.ClaimDuration)

In [5]:
# add date cols containing only day, week, year for each column
fxns.split_date(claims, ['ClaimStartDt', 'ClaimEndDt',
                         'AdmissionDt', 'DischargeDt'])

In [6]:
# number of claims per beneficiary
claims = fxns.add_count_per_col(
    claims, 'BeneID', 'ClaimID', 'ClaimsPerBene')

# number of claims per attending
claims = fxns.add_count_per_col(
    claims, 'AttendingPhysician', 'ClaimID', 'ClaimsPerAttendingPhysician')

# number of claims per operating
claims = fxns.add_count_per_col(
    claims, 'OperatingPhysician', 'ClaimID', 'ClaimsPerOperatingPhysician')

# number of claims per otherphys
claims = fxns.add_count_per_col(
    claims, 'OtherPhysician', 'ClaimID', 'ClaimsPerOtherPhysician')

In [7]:
outpatient_claims = claims.loc[claims.IsOutpatient == 1]
inpatient_claims  = claims.loc[claims.IsOutpatient == 0]

# Mean amount of reimbursed claims
inpatient_claims = fxns.add_mean_per_col(
    inpatient_claims, 'BeneID', 'InscClaimAmtReimbursed', 'ReimbPerIP')
outpatient_claims = fxns.add_mean_per_col(
    outpatient_claims, 'BeneID', 'InscClaimAmtReimbursed', 'ReimbPerOP')
claims = pd.concat([inpatient_claims, outpatient_claims])

# Mean deductible paid for inpatients/outpatients
inpatient_claims = fxns.add_mean_per_col(
    inpatient_claims, 'BeneID', 'DeductibleAmtPaid', 'DeductiblePerIP')
outpatient_claims = fxns.add_mean_per_col(
    outpatient_claims, 'BeneID', 'DeductibleAmtPaid', 'DeductiblePerOP')
claims = pd.concat([inpatient_claims, outpatient_claims])

# Mean chronic conditions for inpatients/outpatients

# avg Insurance covered Ratio for inpatients (PercInsCovered)
# avg Insurance covered Ratio for outpatients (PercInsCovered)

# Avg revenue per day for inpatients
# Avg revenue per day for outpatients

# Number of states for inpatient per provider
# Number of states for outpatient per provider

### Pre-processing

In [8]:
# encode bool colums to 0/1
fxns.re_encode_bool(claims,
    ['HasDied', 'HasAnyPhysician', 'HasAllPhysicians'])

In [9]:
# drop ChronicCond_ prefix from applicable cols
fxns.drop_chronic_prefix(claims)

In [10]:
# change applicable cols to dtype category
fxns.to_category_dtype(claims)

### Variables

In [11]:
numeric_cols     = fxns.cols_by_dtype(claims)[0]
categorical_cols = fxns.cols_by_dtype(claims)[1]
date_cols        = fxns.cols_by_dtype(claims)[2]

physician_cols = \
    claims.columns[claims.columns.str.contains('Physician')].to_list()
chronic_cols   = \
    claims.columns[claims.columns.str.contains('Chronic')].to_list()
diagnosis_cols = \
    claims.columns[claims.columns.str.contains('Diagnosis')].to_list()
procedure_cols = \
    claims.columns[claims.columns.str.contains('Procedure')].to_list()

In [12]:
# print(claims.columns, '\n')

# Can't use, but have IPDuration count/mean/sum:
#     'AdmissionDt', 'AdmissionDt_Week',
#     'DischargeDt', 'DischargeDt_Week'

# Can't use, but have ClaimDuration count/mean/sum:
#     'ClaimEndDt', 'ClaimEndDt_Week', 'ClaimID'
#     'ClaimStartDt', 'ClaimStartDt_Week',


# Can't use, but have AgeAtService mean/sum and HasDied sum/ratio:
#     'DOB', 'DOD'

### Summary Data

In [13]:
outpatient_claims = claims.loc[claims.IsOutpatient == 1]
inpatient_claims  = claims.loc[claims.IsOutpatient == 0]

In [14]:
groupby_providers   = claims.groupby('Provider')

In [15]:
mean_by_provider    = \
    groupby_providers.mean().round(2).add_suffix('_mean').reset_index()

In [16]:
# need conditions by ratio, not count
chronic_by_provider = claims[['Provider'] + chronic_cols
    ].groupby('Provider').sum().add_suffix('_count').reset_index()

In [17]:
providers = pd.merge(mean_by_provider, chronic_by_provider, on='Provider')
providers.sort_index(axis=1, inplace=True)

In [18]:
print(providers.shape, '\n')
print(providers.columns)
providers.sample(5)

(5410, 36) 

Index(['AgeAtService_mean', 'Alzheimers_Chronic_count', 'Cancer_Chronic_count',
       'ClaimCost_mean', 'ClaimDuration_mean',
       'ClaimsPerAttendingPhysician_mean', 'ClaimsPerBene_mean',
       'ClaimsPerOperatingPhysician_mean', 'ClaimsPerOtherPhysician_mean',
       'DailyClaimCost_mean', 'DeductibleAmtPaid_mean', 'DeductiblePerIP_mean',
       'DeductiblePerOP_mean', 'Depression_Chronic_count',
       'Diabetes_Chronic_count', 'HeartFailure_Chronic_count',
       'IPAnnualDeductibleAmt_mean', 'IPAnnualReimbursementAmt_mean',
       'IPClaimDuration_mean', 'IPDuration_mean',
       'InscClaimAmtReimbursed_mean', 'IschemicHeart_Chronic_count',
       'KidneyDisease_Chronic_count', 'NoOfMonths_PartACov_mean',
       'NoOfMonths_PartBCov_mean', 'OPAnnualDeductibleAmt_mean',
       'OPAnnualReimbursementAmt_mean', 'OPClaimDuration_mean',
       'ObstrPulmonary_Chronic_count', 'Osteoporosis_Chronic_count',
       'PercInsCovered_mean', 'Provider', 'ReimbPerIP_mean', 'Rei

Unnamed: 0,AgeAtService_mean,Alzheimers_Chronic_count,Cancer_Chronic_count,ClaimCost_mean,ClaimDuration_mean,ClaimsPerAttendingPhysician_mean,ClaimsPerBene_mean,ClaimsPerOperatingPhysician_mean,ClaimsPerOtherPhysician_mean,DailyClaimCost_mean,DeductibleAmtPaid_mean,DeductiblePerIP_mean,DeductiblePerOP_mean,Depression_Chronic_count,Diabetes_Chronic_count,HeartFailure_Chronic_count,IPAnnualDeductibleAmt_mean,IPAnnualReimbursementAmt_mean,IPClaimDuration_mean,IPDuration_mean,InscClaimAmtReimbursed_mean,IschemicHeart_Chronic_count,KidneyDisease_Chronic_count,NoOfMonths_PartACov_mean,NoOfMonths_PartBCov_mean,OPAnnualDeductibleAmt_mean,OPAnnualReimbursementAmt_mean,OPClaimDuration_mean,ObstrPulmonary_Chronic_count,Osteoporosis_Chronic_count,PercInsCovered_mean,Provider,ReimbPerIP_mean,ReimbPerOP_mean,RheumatoidArthritis_Chronic_count,Stroke_Chronic_count
1590,75.5,2,0,8318.0,3.75,5.0,7.25,1.0,,2444.0,1068.0,1068.0,,1,1,3,1335.0,9535.0,3.75,3.75,7250.0,3,2,12.0,12.0,647.5,1455.0,,2,1,84.0,PRV52976,6500.0,,3,2
742,75.5,0,2,490.0,1.0,4.0,2.0,1.0,3.0,490.0,0.0,,0.0,2,2,2,534.0,2000.0,,,490.0,4,0,12.0,9.0,85.0,592.5,1.0,2,2,100.0,PRV51925,,485.83,3,0
4622,72.29,4,2,125.71,1.0,2.43,8.0,1.0,2.0,125.71,0.0,,0.95,2,5,4,915.43,13611.43,,,125.71,5,4,12.0,12.0,1597.14,4298.57,1.0,2,5,100.0,PRV56780,,319.95,1,1
3611,74.5,1,0,85.0,1.0,1.0,4.5,,1.0,85.0,0.0,,0.0,1,2,1,0.0,0.0,,,85.0,1,0,12.0,12.0,585.0,575.0,1.0,0,1,100.0,PRV55527,,67.5,0,0
5085,80.18,3,0,740.91,2.91,3.18,3.64,1.0,1.67,688.09,7.27,,7.27,2,7,7,679.64,3698.18,,,733.64,8,6,10.91,12.0,644.55,2811.82,2.91,4,6,99.82,PRV57367,,735.45,1,4


## New Columns

### Beneficiaries

In [19]:
# providers['IPClaims_count'] = \
#     inpatient_claims.groupby('Provider').IsOutpatient.count().values
# providers['OPClaims_count'] = \
#     outpatient_claims.groupby('Provider').IsOutpatient.count().values

# Ratio of inpatient claims can be determined from the above - do we need a separate col for it?

In [20]:
def binary_percentages(df, col, group1, group2):
    '''
    Adds a new column to a df that lists the ratio of one value from a
    binary column in the same df. (The ratio of the other value in the
    binary column is the inverse.)
    
    Arguments: A single dataframe.
    
    Output: None.
    
    Returns: Dataframe altered in place.
    '''
    by_provider = df.groupby(['Provider', col]
                            ).ClaimID.count().reset_index()

    colname = pd.DataFrame()
    colname[group1] = by_provider[by_provider[col] == 1].ClaimID.to_list()
    colname[group2] = by_provider[by_provider[col] == 0].ClaimID.to_list()
    
    newcolname1 = f'{group1}_perc'
    
    providers[newcolname1] = round(
        (colname[group1] / (colname[group1] + colname[group2])) * 100, 2)

In [21]:
binary_percentages(claims, 'Gender', 'GenderZero', 'GenderOne')
binary_percentages(claims, 'RenalDisease', 'HasRenalDisease', 'NotRenalDisease')
binary_percentages(claims, 'HasDied', 'HasDied', 'NotDead')

In [22]:
race_by_provider = \
    claims.groupby(['Provider', 'Race']).ClaimID.count().reset_index()

race = pd.DataFrame()
race['race1'] = race_by_provider[race_by_provider.Race == 1
                                ].ClaimID.to_list()
race['race0'] = race_by_provider[race_by_provider.Race == 0
                                ].ClaimID.to_list()
race['race3'] = race_by_provider[race_by_provider.Race == 3
                                ].ClaimID.to_list()
race['race5'] = race_by_provider[race_by_provider.Race == 5
                                ].ClaimID.to_list()

providers['RaceZero_perc']  = round(
    (race.race0 / np.sum(race, axis=1) * 100), 2)
providers['RaceThree_perc'] = round(
    (race.race3 / np.sum(race, axis=1) * 100), 2)
providers['RaceFive_perc']  = round(
    (race.race5 / np.sum(race, axis=1) * 100), 2)

In [23]:
# DO WE WANT CHRONIC CONDITION RATIOS? WHY/WHY NOT?
# Ratio of inpatients with top 5 frequent chronic disease (from PotentialFraud)
# Ratio of outpatients with top 5 frequent chronic disease (from PotentialFraud)

In [24]:
ratio_cols = providers.columns[providers.columns.str.contains('_ratio')
                              ].to_list()
providers[ratio_cols].sample(5).iloc[:,-5:]

4539
1408
4836
3577
2857


### Inpatients/Outpatiens

In [25]:
binary_percentages(claims, 'IsOutpatient', 'IsOutpatient', 'Inpatient')

In [26]:
# Whether the provider serves both in/out patients
# Based on providers.IsOutpatient_ratio, do we need this?

### Doctors

In [27]:
binary_percentages(claims, 'HasAnyPhysician', 'HasAnyPhysician', 'HasNoPhysician')
binary_percentages(claims, 'HasAllPhysicians', 'HasAllPhysicians', 'HasSomePhysicians')

### Codes

### Money

### Time

### Location

### Size

## MISC/NOTES

In [28]:
providers.columns[providers.columns.str.contains('Chronic')].to_list()

['Alzheimers_Chronic_count',
 'Cancer_Chronic_count',
 'Depression_Chronic_count',
 'Diabetes_Chronic_count',
 'HeartFailure_Chronic_count',
 'IschemicHeart_Chronic_count',
 'KidneyDisease_Chronic_count',
 'ObstrPulmonary_Chronic_count',
 'Osteoporosis_Chronic_count',
 'RheumatoidArthritis_Chronic_count',
 'Stroke_Chronic_count']

In [29]:
# Ratio of attending physicians serving for different hospitals (75% threshold)
# Ratio of operating physicians serving for different hospitals (75% threshold)
# Ratio of other physicians serving for different hospitals (75% threshold)
# Ratio of inpatients going to different hospitals (75% threshold)
# Ratio of outpatients going to different hospitals (75% threshold)


# MBA, imbalanced data sets, classification modeling