## Setup and Data Import

In [1]:
import sys
sys.path.insert(0, '..')

from joblib import dump, load

import Functions as fxns
from Functions import np, pd

# Lift pandas' display limits so wide/long frames render without truncation.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

from datetime import timedelta

In [2]:
# !python ../Preprocessing.py

In [3]:
# Load the claims table from a joblib pickle one directory up.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load files
# produced by a trusted run (presumably ../Preprocessing.py; confirm).
claims = load('../claims.pkl')

In [4]:
# Sanity check: (rows, columns) of the freshly loaded claims table.
claims.shape

(558211, 56)

In [5]:
# List the column labels available before any features are derived.
claims.columns

Index(['BeneID', 'ClaimID', 'ClaimStartDt', 'ClaimEndDt', 'Provider',
       'InscClaimAmtReimbursed', 'AttendingPhysician', 'OperatingPhysician',
       'OtherPhysician', 'AdmissionDt', 'ClmAdmitDiagnosisCode',
       'DeductibleAmtPaid', 'DischargeDt', 'DiagnosisGroupCode',
       'ClmDiagnosisCode_1', 'ClmDiagnosisCode_2', 'ClmDiagnosisCode_3',
       'ClmDiagnosisCode_4', 'ClmDiagnosisCode_5', 'ClmDiagnosisCode_6',
       'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8', 'ClmDiagnosisCode_9',
       'ClmDiagnosisCode_10', 'ClmProcedureCode_1', 'ClmProcedureCode_2',
       'ClmProcedureCode_3', 'ClmProcedureCode_4', 'ClmProcedureCode_5',
       'ClmProcedureCode_6', 'IsOutpatient', 'DOB', 'DOD', 'Gender', 'Race',
       'RenalDisease', 'State', 'County', 'NoOfMonths_PartACov',
       'NoOfMonths_PartBCov', 'Alzheimers_Chronic', 'HeartFailure_Chronic',
       'KidneyDisease_Chronic', 'Cancer_Chronic', 'ObstrPulmonary_Chronic',
       'Depression_Chronic', 'Diabetes_Chronic', 'IschemicHeart

## Claims DF

### New Columns

In [5]:
# Derive date-part columns for the four claim/admission date fields.
# NOTE(review): split_date is defined in Functions.py; the original note says
# it adds a week-only column per date field and the return value is unused,
# so it presumably mutates `claims` in place -- confirm.
date_fields = ['ClaimStartDt', 'ClaimEndDt', 'AdmissionDt', 'DischargeDt']
fxns.split_date(claims, date_fields)

In [6]:
# Age at claim start in whole years (the int cast truncates the fraction).
# NOTE(review): a year is taken as exactly 365 days, so leap days are ignored.
days_since_birth = claims['ClaimStartDt'] - claims['DOB']
claims['AgeAtClaim'] = (days_since_birth / timedelta(days=365)).astype(int)

# A beneficiary counts as deceased when a date of death is recorded.
claims['HasDied'] = claims['DOD'].notna()
fxns.re_encode_bool(claims, ['HasDied'])

In [7]:
# Every column whose label mentions 'Chronic' is a chronic-condition column.
chronic_cols = [label for label in claims.columns if 'Chronic' in label]

# Row-wise total across those columns.
# NOTE(review): reads as a condition count only if each column is 0/1 -- confirm.
claims['NumConds'] = claims.loc[:, chronic_cols].sum(axis=1)

In [8]:
# Physician identifier columns. Match the full word 'Physician' rather than
# 'Phys': the flag columns created below (HasAttPhys, HasNoPhys,
# AttPhysIsOperPhys) also contain 'Phys', so with the shorter pattern a re-run
# of this cell would include those never-null flags and HasNoPhys would come
# out False for every claim.
physician_cols = \
    claims.columns[claims.columns.str.contains('Physician')].to_list()
phys_count_cols = ['HasAttPhys', 'HasNoPhys', 'AttPhysIsOperPhys']

# Claim names an attending physician.
claims['HasAttPhys'] = claims['AttendingPhysician'].notna()
# Claim names no physician in any of the physician identifier columns.
claims['HasNoPhys']  = claims[physician_cols].isna().all(axis=1)
# Attending and operating physician are the same person. Missing values never
# compare equal, so claims with both fields empty are False here.
claims['AttPhysIsOperPhys'] = \
    claims.AttendingPhysician == claims.OperatingPhysician
fxns.re_encode_bool(claims, phys_count_cols)

In [9]:
# Inclusive day counts: the +1 makes a same-day start/end count as one day.
duration_spans = {
    'ClaimDuration': ('ClaimStartDt', 'ClaimEndDt'),
    'AdmitDuration': ('AdmissionDt', 'DischargeDt'),
}
for duration_col, (start_col, end_col) in duration_spans.items():
    claims[duration_col] = (claims[end_col] - claims[start_col]).dt.days + 1

In [10]:
reimbursed = claims['InscClaimAmtReimbursed']
deductible = claims['DeductibleAmtPaid']

# Total cost of a claim = insurer reimbursement + beneficiary deductible.
claims['ClaimCost'] = reimbursed + deductible
# Cost spread over the inclusive claim duration.
claims['DailyClaimCost'] = claims['ClaimCost'] / claims['ClaimDuration']
# Share of the total cost that the insurer reimbursed.
# NOTE(review): a claim with zero total cost yields NaN here (0 / 0).
claims['InsReimbursementRatio'] = reimbursed / claims['ClaimCost']

In [11]:
# Sanity check: the row count should be unchanged; only columns were added.
claims.shape

(558211, 71)

### Variables

In [12]:
# lists:
# Classify the columns once and reuse the result instead of re-running the
# helper for every list.
# NOTE(review): assumes cols_by_dtype returns (numeric, categorical, date)
# in that order, as the original indexing implied -- confirm in Functions.py.
cols_by_kind     = fxns.cols_by_dtype(claims)
numeric_cols     = cols_by_kind[0]
categorical_cols = cols_by_kind[1]
date_cols        = cols_by_kind[2]

# Match 'Physician', not 'Phys': by this point claims also holds the derived
# flags HasAttPhys / HasNoPhys / AttPhysIsOperPhys, which the shorter pattern
# would wrongly add to the list of physician identifier columns.
physician_cols = \
    claims.columns[claims.columns.str.contains('Physician')].to_list()
chronic_cols     = \
    claims.columns[claims.columns.str.contains('Chronic')].to_list()
diagnosis_cols   = \
    claims.columns[claims.columns.str.contains('Diagnosis')].to_list()
procedure_cols   = \
    claims.columns[claims.columns.str.contains('Procedure')].to_list()

# dataframes:
# Split the claims by care setting using the IsOutpatient flag.
outpatient_claims = claims.loc[claims.IsOutpatient == 1]
inpatient_claims  = claims.loc[claims.IsOutpatient == 0]

## Providers DF

In [13]:
# One row per (Provider, PotentialFraud, IsOutpatient) combination present in
# claims, sorted by those keys (groupby's default ordering). The size column
# is only a vehicle for getting the keys and is discarded.
# NOTE(review): later cells assign groupby(['Provider', 'IsOutpatient'])
# results via `.values`, which lines up with these rows only if PotentialFraud
# is constant within each provider -- confirm.
provider_group_cols = ['Provider', 'PotentialFraud', 'IsOutpatient']
providers = (claims.groupby(provider_group_cols)
                   .size()
                   .reset_index()[provider_group_cols])

### Beneficiaries

In [14]:
# Per (Provider, IsOutpatient) mean of each beneficiary attribute, assigned
# positionally onto the matching providers rows.
# NOTE(review): Perc_GenderZero is the mean of Gender; whether that is the
# share of "gender zero" depends on how Gender was encoded -- confirm.
by_provider_setting = claims.groupby(['Provider', 'IsOutpatient'])
beneficiary_means = {
    'Perc_GenderZero':      'Gender',
    'Perc_HasRenalDisease': 'RenalDisease',
    'Perc_HasDied':         'HasDied',
    'Mean_NumChronicConds': 'NumConds',
}
for feature, source_col in beneficiary_means.items():
    providers[feature] = by_provider_setting[source_col].mean().values

In [15]:
# Mean and spread (max - min) of beneficiary age per provider and setting.
age_by_group = claims.groupby(['Provider', 'IsOutpatient'])['AgeAtClaim']
providers['Mean_AgeAtClaim'] = age_by_group.mean().values
providers['AgeRange'] = (age_by_group.max() - age_by_group.min()).values

In [16]:
# "Mean of means" features: average a value within each entity (patient or
# physician) at a provider, then average those entity-level means per
# (Provider, IsOutpatient).
#
# Fix: Mean_AdmitDurationPerAttPhys used to group by AdmitDuration itself
# (and in a different key order), so the inner "mean" was just each distinct
# duration value instead of a per-physician mean. It now uses the same
# (Provider, entity, IsOutpatient) keys as the cost features.
group_mean_specs = {
    # feature name:                 (entity column,        value column)
    'Mean_ClaimCostPerPatient':     ('BeneID',             'ClaimCost'),
    'Mean_ClaimCostPerAttPhys':     ('AttendingPhysician', 'ClaimCost'),
    'Mean_ClaimCostPerOperPhys':    ('OperatingPhysician', 'ClaimCost'),
    'Mean_ClaimCostPerOtherPhys':   ('OtherPhysician',     'ClaimCost'),
    'Mean_AdmitDurationPerAttPhys': ('AttendingPhysician', 'AdmitDuration'),
}

other_groupbys = pd.DataFrame()
for feature, (entity_col, value_col) in group_mean_specs.items():
    per_entity_mean = claims.groupby(
        ['Provider', entity_col, 'IsOutpatient'])[value_col].mean()
    # The first feature fixes the frame's index; later ones align to it, so
    # groups with no such entity (e.g. no operating physician) become NaN.
    other_groupbys[feature] = \
        per_entity_mean.groupby(['Provider', 'IsOutpatient']).mean()

other_groupbys = other_groupbys.reset_index()

# Drop any copies left by a previous run of this cell so the merge does not
# create _x/_y suffixed duplicates; a no-op on the first run.
providers = providers.drop(columns=list(group_mean_specs), errors='ignore')
providers = providers.merge(other_groupbys, how='left', on=['Provider', 'IsOutpatient'])

In [17]:
# Share of each provider/setting's claims by beneficiary race code.
#
# Fix: the previous version filtered the long-format counts per race and glued
# the resulting lists together by position. Whenever a (Provider,
# IsOutpatient) group has no claims for some race, those lists differ in
# length (ValueError) or shift relative to `providers` (silently wrong rows).
# Counts are now built per group with an explicit zero for absent races and
# attached to `providers` by key instead of by row position.
race_codes = {'race1': 1, 'race2': 2, 'race3': 3, 'race5': 5}

# Count only rows with a ClaimID, matching the original ClaimID.count().
has_claim_id = claims.ClaimID.notna()
race_flags = claims[['Provider', 'IsOutpatient']].assign(
    **{label: (claims.Race == code) & has_claim_id
       for label, code in race_codes.items()})

# Wide table of claim counts: one row per group, one column per race code.
race = race_flags.groupby(['Provider', 'IsOutpatient']).sum()
# Denominator is the total over the four race codes, as before.
race_share = race.div(race.sum(axis=1), axis=0)

# Look up each providers row by its (Provider, IsOutpatient) key.
provider_rows = pd.MultiIndex.from_frame(providers[['Provider', 'IsOutpatient']])
race_features = {'Perc_RaceOne': 'race1',
                 'Perc_RaceTwo': 'race2',
                 'Perc_RaceThree': 'race3'}
for feature, label in race_features.items():
    providers[feature] = race_share[label].reindex(provider_rows).to_numpy()

In [18]:
# Mean of every chronic-condition column per provider and setting, stored as
# Perc_<column>. Values are assigned positionally onto the providers rows.
claims_by_provider_setting = claims.groupby(['Provider', 'IsOutpatient'])
for chronic_col in chronic_cols:
    providers[f'Perc_{chronic_col}'] = \
        claims_by_provider_setting[chronic_col].mean().values

### Doctors

In [19]:
# Mean of each physician flag per provider and setting, stored as
# Perc_<flag>. Values are assigned positionally onto the providers rows.
claims_by_provider_setting = claims.groupby(['Provider', 'IsOutpatient'])
for flag_col in phys_count_cols:
    providers[f'Perc_{flag_col}'] = \
        claims_by_provider_setting[flag_col].mean().values

### Money

In [20]:
# Total reimbursed and total deductible amounts per provider and setting,
# stored as Sum_<column>.
for amount_col in ('InscClaimAmtReimbursed', 'DeductibleAmtPaid'):
    providers[f'Sum_{amount_col}'] = \
        claims.groupby(['Provider', 'IsOutpatient'])[amount_col].sum().values

In [21]:
# Monetary columns averaged per provider and setting, stored as Mean_<column>.
money_cols = [
    'InsReimbursementRatio',
    'ClaimCost',
    'DailyClaimCost',
    'IPAnnualReimbursementAmt',
    'IPAnnualDeductibleAmt',
    'OPAnnualReimbursementAmt',
    'OPAnnualDeductibleAmt',
]

claims_by_provider_setting = claims.groupby(['Provider', 'IsOutpatient'])
for money_col in money_cols:
    providers[f'Mean_{money_col}'] = \
        claims_by_provider_setting[money_col].mean().values

### Time

In [22]:
# Coverage and duration columns averaged per provider and setting, stored as
# Mean_<column>.
time_cols = ['NoOfMonths_PartACov', 'NoOfMonths_PartBCov',
             'ClaimDuration', 'AdmitDuration']

claims_by_provider_setting = claims.groupby(['Provider', 'IsOutpatient'])
for time_col in time_cols:
    providers[f'Mean_{time_col}'] = \
        claims_by_provider_setting[time_col].mean().values

## Pivot

In [23]:
# These annual amounts already carry an IP/OP split in their names, so they
# are removed before the IP_/OP_ prefixing below and re-added per provider in
# a later cell.
already_ip_op_cols = [
    'Mean_IPAnnualReimbursementAmt',
    'Mean_IPAnnualDeductibleAmt',
    'Mean_OPAnnualReimbursementAmt',
    'Mean_OPAnnualDeductibleAmt',
]
providers = providers.drop(columns=already_ip_op_cols)

In [24]:
# Split providers by setting and prefix the feature columns with IP_/OP_.
# The three key columns keep their plain names so the frames can be merged
# back together on Provider.
shared_key_cols = ['Provider', 'PotentialFraud', 'IsOutpatient']

ip = (providers.loc[providers.IsOutpatient == 0]
               .add_prefix('IP_')
               .rename(columns={'IP_' + key: key for key in shared_key_cols}))

op = (providers.loc[providers.IsOutpatient == 1]
               .add_prefix('OP_')
               .rename(columns={'OP_' + key: key for key in shared_key_cols}))

In [25]:
# Provider-level means (not split by setting) of the annual amounts whose
# names already distinguish inpatient from outpatient.
already_ip_op_cols = [
    'IPAnnualReimbursementAmt',
    'IPAnnualDeductibleAmt',
    'OPAnnualReimbursementAmt',
    'OPAnnualDeductibleAmt',
]
already_ip_op = claims.groupby('Provider')[already_ip_op_cols].agg('mean')

In [26]:
# Rebuild providers as one row per Provider, then attach the outpatient (OP_)
# features, the inpatient (IP_) features and the annual-amount means side by
# side. Left joins keep providers that have only one of the two settings.
provider_ids = claims.groupby('Provider').size().reset_index()[['Provider']]
providers = (
    provider_ids
    .merge(op, on='Provider', how='left')
    .merge(ip, on='Provider', how='left')
    .merge(already_ip_op, on='Provider', how='left')
    # The key columns shared by op and ip come back suffixed _x/_y and are
    # discarded; OP_Mean_AdmitDuration is dropped with them.
    .drop(columns=['PotentialFraud_x', 'IsOutpatient_x',
                   'PotentialFraud_y', 'IsOutpatient_y',
                   'OP_Mean_AdmitDuration'])
)

# Re-attach a single PotentialFraud label per provider.
pf = (claims.groupby(['Provider', 'PotentialFraud'])
            .size()
            .reset_index()[['Provider', 'PotentialFraud']])

providers = providers.merge(pf, on='Provider')

### Data Import

In [27]:
# Provider-level features from Lucas_Providers.csv: drop two columns that are
# not carried forward and align the physician column names with this
# notebook's naming.
lucas = (
    pd.read_csv('./data/Lucas_Providers.csv')
    .drop(columns=['percentage_InOutpatients', 'percentage_noPhysician'])
    .rename(columns={
        'IP_Mean_AttPhys':  'IP_Mean_PatientsPerAttPhys',
        'IP_Mean_OperPhys': 'IP_Mean_PatientsPerOperPhys',
        'IP_Mean_OthPhys':  'IP_Mean_PatientsPerOtherPhys',
        'OP_Mean_AttPhys':  'OP_Mean_PatientsPerAttPhys',
        'OP_Mean_OperPhys': 'OP_Mean_PatientsPerOperPhys',
        'OP_Mean_OthPhys':  'OP_Mean_PatientsPerOtherPhys',
    })
)

In [28]:
# Provider-level features from Ryan_Providers.csv, minus the saved index
# column and the columns listed below.
# NOTE(review): the dropped names look like duplicates of features computed
# above in this notebook -- confirm before relying on that.
ryan_unused_cols = [
    'Unnamed: 0', 'AllPhy_mean_IP', 'AllPhy_mean_OP',
    'NoPhy_mean_IP', 'NoPhy_mean_OP',
    'ClaimDuration_mean_IP', 'ClaimDuration_mean_OP',
    'InscClaimAmtReimbursed_mean_IP', 'InscClaimAmtReimbursed_mean_OP',
    'AdmisDuration_mean_IP', 'AdmisDuration_mean_OP',
    'AgeAtClm_mean_IP', 'AgeAtClm_mean_OP',
    'Chronic_Sum_mean_IP', 'Chronic_Sum_mean_OP',
    'DeductibleAmtPaid_mean_IP', 'DeductibleAmtPaid_mean_OP',
    'InsCovRatio_mean_IP', 'InsCovRatio_mean_OP',
    'RevPerDay_mean_IP', 'RevPerDay_mean_OP',
]
ryan = pd.read_csv('./data/Ryan_Providers.csv').drop(columns=ryan_unused_cols)

In [29]:
# Attach both external feature sets.
# NOTE(review): no `on=` is given, so each merge is an inner join on every
# column name the two frames share -- confirm that is only the intended key(s).
providers = providers.merge(ryan)
providers = providers.merge(lucas)

### Post-processing

In [30]:
# Replace every missing value with 0 (e.g. IP_ features of providers that
# have no inpatient rows after the left joins).
providers = providers.fillna(0)

In [31]:
# Map the remaining source column names onto this notebook's naming scheme.
final_column_names = {
    'IPAnnualReimbursementAmt': 'IP_Mean_AnnualReimbursementAmt',
    'IPAnnualDeductibleAmt': 'IP_Mean_AnnualDeductibleAmt',
    'OPAnnualReimbursementAmt': 'OP_Mean_AnnualReimbursementAmt',
    'OPAnnualDeductibleAmt': 'OP_Mean_AnnualDeductibleAmt',
    'isTop5admtcode': 'Perc_HasTop5AdmitCode',
    'Claim_Patient_Ratio': 'Ratio_ClaimsPerPatient',
    'Patient_Attphy_Ratio': 'PatientsPerAttPhys',
    'Patient_Operphy_Ratio': 'PatientsPerOperPhys',
    'Patient_Otherphy_Ratio': 'PatientsPerOthPhys',
    'Claim_AttPhy_Ratio': 'Ratio_ClaimsPerAttPhys',
    'IsOutpatient_Perc': 'Perc_Outpatient',
    'BeneID_Nunique_IP': 'IP_Count_UniquePatients',
    'State_Nunique_IP': 'IP_Count_UniqueState',
    'BeneID_Nunique_OP': 'OP_Count_UniquePatients',
    'State_Nunique_OP': 'OP_Count_UniqueState',
    'Att_Phy_Mult_Prec': 'Perc_MultHospAttPhys',
    'Oper_Phy_Mult_Prec': 'Perc_MultHospOperPhys',
    'Other_Phy_Mult_Prec': 'Perc_MultHospOtherPhys',
    'IP_Multiple_Hospital_Prec': 'IP_Perc_MultHosp',
    'OP_Multiple_Hospital_Prec': 'OP_Perc_MultHosp',
    'Provider_Serve_BothIO': 'DualPatientProvider',
    'Bene_Receive_Both_IO_Perc': 'Perc_DualPatientType',
    'IP_Dup_Perc': 'IP_Perc_Duplicates',
    'OP_Dup_Perc': 'OP_Perc_Duplicates',
    'IP_No_Proc_Perc': 'IP_Perc_No_ProcCode',
    'OP_No_Diag_Perc': 'OP_Perc_No_DiagCode',
    'In_Top5_State_Perc': 'Perc_ClaimsPerTopFraudState',
}
providers = providers.rename(columns=final_column_names)

In [32]:
# Final feature set and column order: identifiers and label first, then
# provider-wide features, then the inpatient (IP_) block, then the
# outpatient (OP_) block.
final_columns = [
    'Provider',
    'PotentialFraud',
    'Perc_Outpatient',
    'DualPatientProvider',
    'Perc_DualPatientType',
    'Ratio_ClaimsPerAttPhys',
    'Ratio_ClaimsPerPatient',
    'PatientsPerAttPhys',
    'PatientsPerOperPhys',
    'PatientsPerOthPhys',
    'Perc_MultHospAttPhys',
    'Perc_MultHospOperPhys',
    'Perc_MultHospOtherPhys',
    'Perc_HasTop5AdmitCode',
    'Perc_ClaimsPerTopFraudState',
    'Mean_StatePerAttPhys',
    'Mean_StatePerOperPhys',
    'Mean_StatePerOthPhys',
    'IP_Count_UniquePatients',
    'IP_Perc_MultHosp',
    'IP_Perc_Duplicates',
    'IP_Mean_Duplicate_per_AttPhy',
    'IP_Mean_Duplicate_per_Patient',
    'IP_Perc_Dup_Diff_Provider',
    'IP_Perc_Dup_Diff_State',
    'IP_Count_UniqueState',
    'IP_Mean_PatientsPerAttPhys',
    'IP_Mean_PatientsPerOperPhys',
    'IP_Mean_PatientsPerOtherPhys',
    'IP_Perc_HasAttPhys',
    'IP_Perc_HasNoPhys',
    'IP_Perc_AttPhysIsOperPhys',
    'IP_Mean_ClaimCost',
    'IP_Mean_DailyClaimCost',
    'IP_Mean_ClaimCostPerAttPhys',
    'IP_Mean_ClaimCostPerOperPhys',
    'IP_Mean_ClaimCostPerOtherPhys',
    'IP_Mean_ClaimCostPerPatient',
    'IP_Perc_No_ProcCode',
    'IP_Sum_DeductibleAmtPaid',
    'IP_Mean_AnnualDeductibleAmt',
    'IP_Sum_InscClaimAmtReimbursed',
    'IP_Mean_InsReimbursementRatio',
    'IP_Mean_AnnualReimbursementAmt',
    'IP_Mean_NoOfMonths_PartACov',
    'IP_Mean_NoOfMonths_PartBCov',
    'IP_Mean_ClaimDuration',
    'IP_Mean_AdmitDuration',
    'IP_Mean_AdmitDurationPerAttPhys',
    'IP_AgeRange',
    'IP_Perc_HasDied',
    'IP_Perc_GenderZero',
    'IP_Perc_RaceOne',
    'IP_Perc_RaceThree',
    'IP_Perc_RaceTwo',
    'IP_Perc_HasRenalDisease',
    'IP_Mean_NumChronicConds',
    'IP_Perc_Alzheimers_Chronic',
    'IP_Perc_Cancer_Chronic',
    'IP_Perc_Depression_Chronic',
    'IP_Perc_Diabetes_Chronic',
    'IP_Perc_HeartFailure_Chronic',
    'IP_Perc_IschemicHeart_Chronic',
    'IP_Perc_KidneyDisease_Chronic',
    'IP_Perc_ObstrPulmonary_Chronic',
    'IP_Perc_Osteoporosis_Chronic',
    'IP_Perc_RheumatoidArthritis_Chronic',
    'IP_Perc_Stroke_Chronic',
    'OP_Count_UniquePatients',
    'OP_Perc_MultHosp',
    'OP_Perc_Duplicates',
    'OP_Mean_Duplicate_per_AttPhy',
    'OP_Mean_Duplicate_per_Patient',
    'OP_Perc_Dup_Diff_Provider',
    'OP_Perc_Dup_Diff_State',
    'OP_Count_UniqueState',
    'OP_Mean_PatientsPerAttPhys',
    'OP_Mean_PatientsPerOperPhys',
    'OP_Mean_PatientsPerOtherPhys',
    'OP_Perc_HasNoPhys',
    'OP_Perc_AttPhysIsOperPhys',
    'OP_Perc_HasAttPhys',
    'OP_Mean_ClaimCost',
    'OP_Mean_DailyClaimCost',
    'OP_Mean_ClaimCostPerAttPhys',
    'OP_Mean_ClaimCostPerOperPhys',
    'OP_Mean_ClaimCostPerOtherPhys',
    'OP_Mean_ClaimCostPerPatient',
    'OP_Perc_No_DiagCode',
    'OP_Sum_DeductibleAmtPaid',
    'OP_Mean_AnnualDeductibleAmt',
    'OP_Sum_InscClaimAmtReimbursed',
    'OP_Mean_InsReimbursementRatio',
    'OP_Mean_AnnualReimbursementAmt',
    'OP_Mean_NoOfMonths_PartACov',
    'OP_Mean_NoOfMonths_PartBCov',
    'OP_Mean_ClaimDuration',
    'OP_AgeRange',
    'OP_Perc_HasDied',
    'OP_Perc_GenderZero',
    'OP_Perc_RaceOne',
    'OP_Perc_RaceThree',
    'OP_Perc_RaceTwo',
    'OP_Perc_HasRenalDisease',
    'OP_Mean_NumChronicConds',
    'OP_Perc_Alzheimers_Chronic',
    'OP_Perc_Cancer_Chronic',
    'OP_Perc_Depression_Chronic',
    'OP_Perc_Diabetes_Chronic',
    'OP_Perc_HeartFailure_Chronic',
    'OP_Perc_IschemicHeart_Chronic',
    'OP_Perc_KidneyDisease_Chronic',
    'OP_Perc_ObstrPulmonary_Chronic',
    'OP_Perc_Osteoporosis_Chronic',
    'OP_Perc_RheumatoidArthritis_Chronic',
    'OP_Perc_Stroke_Chronic',
]
providers = providers[final_columns]

## Export

In [33]:
# dump(providers, './data/Providers_Final.pkl')

In [34]:
# Sanity check: (providers, columns) of the final feature table.
providers.shape

(5410, 116)