## Setup and Data Import

In [1]:
import sys
sys.path.insert(0, '..')

from joblib import load

import Sita_Functions as fxns
from Sita_Functions import np, pd

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from datetime import timedelta

## Claims DF

In [2]:
# !python Sita_Preprocessing.py

In [3]:
# Load the pre-processed claims table from the pickled file 'claims.pkl'
# (presumably written by Sita_Preprocessing.py -- confirm).
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code; only load 'claims.pkl' when it comes from a trusted source.
claims = load('claims.pkl')

### New Columns

In [4]:
# Names of every column in `claims` that mentions 'Physician'. This list is
# built before the Has*Physician flags below exist, so it does not contain them.
physician_cols = list(claims.columns[claims.columns.str.contains('Physician')])

# --- Derived claim-level columns -------------------------------------------

# Age at the start of the claim, in whole years.
# NOTE(review): a fixed 365-day year is used (leap days ignored) and
# astype(int) truncates; astype(int) also fails if either date is missing.
elapsed_since_birth = claims['ClaimStartDt'] - claims['DOB']
claims['AgeAtService'] = (elapsed_since_birth / timedelta(days=365)).astype(int)

# True when a date of death is recorded.
claims['HasDied'] = claims['DOD'].notna()

# Physician presence flags: at least one / every physician column is filled.
physician_present = claims[physician_cols].notna()
claims['HasAnyPhysician'] = physician_present.any(axis=1)
claims['HasAllPhysicians'] = physician_present.all(axis=1)

# Total claim cost = reimbursed amount + deductible paid.
claims['ClaimCost'] = (
    claims['InscClaimAmtReimbursed'] + claims['DeductibleAmtPaid']
)

# Durations in days, counting both the first and the last day (hence the + 1).
claim_span = claims['ClaimEndDt'] - claims['ClaimStartDt']
claims['ClaimDuration'] = claim_span.dt.days + 1

inpatient_span = claims['DischargeDt'] - claims['AdmissionDt']
claims['IPDuration'] = inpatient_span.dt.days + 1

### Pre-processing

In [5]:
# Encode the boolean flag columns as 0/1.
# NOTE(review): the return value is discarded, so re_encode_bool presumably
# modifies `claims` in place -- confirm in Sita_Functions.
fxns.re_encode_bool(claims, ['HasDied', 'HasAnyPhysician', 'HasAllPhysicians'])

In [6]:
# Add date-part columns (day, week, year) derived from each of the four claim
# date columns, e.g. the 'AdmissionDt_Week' column mentioned later in this
# notebook.
# NOTE(review): the return value is discarded, so split_date presumably
# modifies `claims` in place -- confirm in Sita_Functions.
fxns.split_date(claims, ['ClaimStartDt', 'ClaimEndDt', 'AdmissionDt', 'DischargeDt'])

In [7]:
# Drop the 'ChronicCond_' prefix from the chronic-condition column names.
# NOTE(review): the provider summary printed later shows names such as
# 'Alzheimers_Chronic', so the helper appears to leave a '_Chronic' suffix
# that the 'Chronic' column lookup relies on -- confirm in Sita_Functions.
fxns.drop_chronic_prefix(claims)

In [8]:
# Convert object-dtype columns to the pandas 'category' dtype.
# NOTE(review): the return value is discarded, so to_category_dtype presumably
# modifies `claims` in place -- confirm in Sita_Functions.
fxns.to_category_dtype(claims)

### Variables

In [9]:
# Column-name groups by dtype. cols_by_dtype is evaluated once and indexed,
# rather than re-scanning `claims` separately for each of the three groups.
dtype_groups     = fxns.cols_by_dtype(claims)
numeric_cols     = dtype_groups[0]
categorical_cols = dtype_groups[1]
date_cols        = dtype_groups[2]

# Column-name groups by substring match on the column name.
# Note: physician_cols is rebuilt after the HasAnyPhysician / HasAllPhysicians
# flags were added to `claims`, so it now includes those two columns as well.
physician_cols = \
    claims.columns[claims.columns.str.contains('Physician')].to_list()
chronic_cols   = \
    claims.columns[claims.columns.str.contains('Chronic')].to_list()
# 'Diagnosis' also matches ClmAdmitDiagnosisCode and DiagnosisGroupCode.
diagnosis_cols = \
    claims.columns[claims.columns.str.contains('Diagnosis')].to_list()
procedure_cols = \
    claims.columns[claims.columns.str.contains('Procedure')].to_list()

### Summary Data

In [10]:
# Split the claims by the IsOutpatient flag: 1 -> outpatient, 0 -> inpatient.
outpatient_claims = claims[claims['IsOutpatient'] == 1]
inpatient_claims = claims[claims['IsOutpatient'] == 0]

In [11]:
# Claims grouped by Provider; this GroupBy object is reused by the per-provider
# mean, sum and count aggregations in the following cells.
groupby_providers   = claims.groupby('Provider')

In [12]:
# Per-provider mean of every numeric column, each suffixed with '_mean'.
# numeric_only=True is passed explicitly: pandas >= 2.0 no longer drops
# non-numeric columns silently and raises TypeError for them instead.
mean_by_provider    = \
    groupby_providers.mean(numeric_only=True).add_suffix('_mean').reset_index()

In [13]:
# Per-provider sum of every numeric column, each suffixed with '_sum'.
# numeric_only=True is passed explicitly: pandas >= 2.0 no longer drops
# non-numeric columns silently (categoricals raise TypeError, strings would be
# concatenated).
sum_by_provider     = \
    groupby_providers.sum(numeric_only=True).add_suffix('_sum').reset_index()

In [14]:
# Per-provider totals of the chronic-condition columns (names are left
# unsuffixed, unlike the '_sum' columns).
chronic_by_provider = (
    claims[['Provider', *chronic_cols]]
    .groupby('Provider')
    .sum()
    .reset_index()
)

In [15]:
# Per-provider counts of non-null values, each suffixed with '_count'.
# TODO: .count() tallies non-null rows, not distinct values; unique counts
# are still needed.
count_by_provider = (
    groupby_providers[
        ['BeneID', 'ClaimID', 'County', 'State', 'IPDuration',
         *physician_cols, *diagnosis_cols, *procedure_cols]
    ]
    .count()
    .add_suffix('_count')
    .reset_index()
)

# TODO: ADD PHYSICIAN COLS (NONE/MULT), RATIOS, SUMS OF EACH
# TODO: CLAIM ADMIT CODE, DIAGNOSIS GROUP CODE?

In [16]:
# Assemble the provider-level table: join the mean, sum, count and chronic
# summaries on Provider, then order the columns alphabetically.
providers = (
    mean_by_provider
    .merge(sum_by_provider, on='Provider')
    .merge(count_by_provider, on='Provider')
    .merge(chronic_by_provider, on='Provider')
    .sort_index(axis=1)
)

In [17]:
# print(claims.columns, '\n')

# Can't use, but have IPDuration count/mean/sum:
#     'AdmissionDt', 'AdmissionDt_Week',
#     'DischargeDt', 'DischargeDt_Week'

# Can't use, but have ClaimDuration count/mean/sum:
#     'ClaimEndDt', 'ClaimEndDt_Week', 'ClaimID',
#     'ClaimStartDt', 'ClaimStartDt_Week'


# Can't use, but have AgeAtService mean/sum and HasDied sum/ratio:
#     'DOB', 'DOD'

In [18]:
# Sanity check of the assembled provider table: its (rows, columns) shape
# followed by the full list of column names.
print(providers.shape, '\n')
print(providers.columns)

(5410, 64) 

Index(['AgeAtService_mean', 'AgeAtService_sum', 'Alzheimers_Chronic',
       'AttendingPhysician_count', 'BeneID_count', 'Cancer_Chronic',
       'ClaimCost_mean', 'ClaimCost_sum', 'ClaimDuration_mean',
       'ClaimDuration_sum', 'ClaimID_count', 'ClmAdmitDiagnosisCode_count',
       'ClmDiagnosisCode_10_count', 'ClmDiagnosisCode_1_count',
       'ClmDiagnosisCode_2_count', 'ClmDiagnosisCode_3_count',
       'ClmDiagnosisCode_4_count', 'ClmDiagnosisCode_5_count',
       'ClmDiagnosisCode_6_count', 'ClmDiagnosisCode_7_count',
       'ClmDiagnosisCode_8_count', 'ClmDiagnosisCode_9_count',
       'ClmProcedureCode_1_count', 'ClmProcedureCode_2_count',
       'ClmProcedureCode_3_count', 'ClmProcedureCode_4_count',
       'ClmProcedureCode_5_count', 'ClmProcedureCode_6_count', 'County_count',
       'DeductibleAmtPaid_mean', 'DeductibleAmtPaid_sum', 'Depression_Chronic',
       'Diabetes_Chronic', 'DiagnosisGroupCode_count',
       'HasAllPhysicians_count', 'HasAnyPhysician_co

## New Columns

### Beneficiaries

In [19]:
def binary_ratios(df, col, group1, group2, target=None):
    """Add a per-provider percentage column for a 0/1 indicator.

    For every provider, computes the share of claims whose ``col`` equals 1
    among the claims whose ``col`` is 0 or 1, as a percentage rounded to two
    decimals, and stores it in ``target`` as ``f'{group1}_ratio'``.

    Counts are matched to ``target`` rows by the ``Provider`` key rather than
    by row position, so a provider that lacks one of the two values (or a
    ``target`` that is ordered differently) still receives the right ratio.

    Parameters
    ----------
    df : pandas.DataFrame
        Claim-level rows with 'Provider', 'ClaimID' and ``col`` columns.
    col : str
        Name of the 0/1 indicator column.
    group1 : str
        Label for the ``col == 1`` group; used as the new column's prefix.
    group2 : str
        Label for the ``col == 0`` group. Kept for backward compatibility;
        it does not appear in the output.
    target : pandas.DataFrame, optional
        Frame with a 'Provider' column that receives the new column.
        Defaults to the notebook-level ``providers`` frame.

    Returns
    -------
    None
        ``target`` is modified in place. Providers without any 0/1 claim
        get NaN.
    """
    if target is None:
        target = providers  # notebook-level provider summary frame

    # Only rows that carry a ClaimID are counted (same as ClaimID.count()).
    has_claim = df['ClaimID'].notna().to_numpy(dtype=bool)
    indicator = df[col]
    is_one = (indicator == 1).fillna(False).to_numpy(dtype=bool) & has_claim
    is_zero = (indicator == 0).fillna(False).to_numpy(dtype=bool) & has_claim

    # Plain (non-categorical) Provider keys keep the groupby result limited
    # to providers that actually occur, whatever the column's dtype.
    tallies = (
        pd.DataFrame({
            'Provider': df['Provider'].to_numpy(),
            'ones': is_one,
            'zeros': is_zero,
        })
        .groupby('Provider')[['ones', 'zeros']]
        .sum()
    )

    ratio = (tallies['ones'] / (tallies['ones'] + tallies['zeros']) * 100).round(2)

    # Align on the Provider key; providers absent from `df` become NaN.
    target[f'{group1}_ratio'] = (
        ratio.reindex(target['Provider'].to_numpy()).to_numpy()
    )

# One '<label>_ratio' column per 0/1 indicator.
# Each entry: (indicator column, label for value 1, label for value 0).
# NOTE(review): binary_ratios attaches the first label to rows where the
# indicator equals 1, so 'GenderZero_ratio' holds the share of Gender == 1
# claims -- confirm the intended Gender encoding / label.
for indicator_col, one_label, zero_label in [
    ('Gender', 'GenderZero', 'GenderOne'),
    ('IsOutpatient', 'IsOutpatient', 'Inpatient'),
    ('RenalDisease', 'HasRenalDisease', 'NotRenalDisease'),
    ('HasDied', 'HasDied', 'NotDead'),
]:
    binary_ratios(claims, indicator_col, one_label, zero_label)

In [20]:
# Per-provider share (%) of claims for Race codes 0, 3 and 5.
# Race code 1 only contributes to the denominator and gets no ratio column.
race_by_provider = claims.groupby(['Provider', 'Race']).ClaimID.count().reset_index()

# One row per provider, one column per race code. Counts are keyed by Provider
# instead of being paired by list position, so a provider that lacks some race
# codes cannot shift values onto the wrong rows. Keys are cast to plain objects
# so the pivot does not depend on categorical dtypes.
race = (
    race_by_provider
    .astype({'Provider': object, 'Race': object})
    .pivot(index='Provider', columns='Race', values='ClaimID')
    .reindex(columns=[1, 0, 3, 5])
    .fillna(0)
)
race.columns = ['race1', 'race0', 'race3', 'race5']

# Denominator: claims with any of the four race codes above.
race_total = race.sum(axis=1)
provider_keys = providers['Provider'].to_numpy()

for race_col, ratio_col in [
    ('race0', 'RaceZero_ratio'),
    ('race3', 'RaceThree_ratio'),
    ('race5', 'RaceFive_ratio'),
]:
    race_share = (race[race_col] / race_total * 100).round(2)
    # Align on the Provider key; providers without race counts become NaN.
    providers[ratio_col] = race_share.reindex(provider_keys).to_numpy()

In [21]:
# DO WE WANT CHRONIC CONDITION RATIOS? WHY/WHY NOT?
# DO WE WANT DIAGNOSIS CODE RATIOS? WHY/WHY NOT?
# DO WE WANT PROCEDURE CODE RATIOS? WHY/WHY NOT?

In [22]:
# Preview five random providers with every '*_ratio' column built above.
ratio_cols = [name for name in providers.columns if '_ratio' in name]
providers[['Provider', *ratio_cols]].sample(5)

Unnamed: 0,Provider,GenderZero_ratio,IsOutpatient_ratio,HasRenalDisease_ratio,HasDied_ratio,RaceZero_ratio,RaceThree_ratio,RaceFive_ratio
4771,PRV56978,50.0,100.0,13.33,0.83,7.5,5.83,6.67
5367,PRV57717,39.47,86.84,19.3,0.0,7.89,0.88,9.65
1722,PRV53148,33.33,100.0,26.67,0.0,0.0,0.0,0.0
3518,PRV55416,43.84,97.26,16.44,2.74,0.0,5.48,0.0
4684,PRV56867,44.44,100.0,22.22,0.0,11.11,0.0,0.0


### Doctors

### Codes

### Money

### Time

### Location