## Setup and Data Import

In [1]:
import sys
sys.path.insert(0, '..')

from joblib import dump, load

import Functions as fxns
from Functions import np, pd

# Lift pandas' display limits so rendered DataFrames are never truncated
# (None = no cap on the number of columns / rows shown).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from datetime import timedelta

In [2]:
# Disabled shell step. NOTE(review): presumably regenerates ../claims.pkl
# from the raw data -- confirm before relying on it.
# !python ../Preprocessing.py

In [3]:
# Load the persisted claims table. joblib.load unpickles its input, which can
# execute arbitrary code, so only point it at files you produced or trust.
claims = load('../claims.pkl')

## Claims DF

### New Columns

In [4]:
# Age at claim start in whole years; a year is approximated as 365 days and
# the fractional part is truncated by the integer cast.
year_length = timedelta(days=365)
age_in_years = (claims.ClaimStartDt - claims.DOB) / year_length
claims['AgeAtClaim'] = age_in_years.astype(int)

# A recorded date of death (DOD) flags the row as belonging to a deceased
# beneficiary; the boolean is then re-encoded by the project helper.
claims['HasDied'] = claims.DOD.notna()
fxns.re_encode_bool(claims, ['HasDied'])

# Row-wise total over every column whose name contains 'Chronic'.
chronic_cols = [name for name in claims.columns if 'Chronic' in name]
claims['NumConds'] = claims[chronic_cols].sum(axis=1)

In [5]:
# Names of the two derived flags created below.
phys_count_cols = ['HasAllPhys', 'HasNoPhys']

# Physician columns are found by the 'Phys' substring. The derived flags also
# contain 'Phys', so they are excluded explicitly: otherwise re-running this
# cell would feed the (never-null) flags back into the checks below and turn
# HasNoPhys into all-False.
physician_cols = [name for name in claims.columns
                  if 'Phys' in name and name not in phys_count_cols]

# True where a physician value is present, per row and physician column.
physician_present = claims[physician_cols].notna()

claims['HasAllPhys'] = physician_present.all(axis=1)     # every slot filled
claims['HasNoPhys'] = (~physician_present).all(axis=1)   # every slot empty

fxns.re_encode_bool(claims, phys_count_cols)

In [6]:
# Durations are inclusive of both endpoints, hence the "+ 1" day.
claim_span = claims.ClaimEndDt - claims.ClaimStartDt
admit_span = claims.DischargeDt - claims.AdmissionDt

claims['ClaimDuration'] = claim_span.dt.days + 1
# Rows with a missing admission or discharge date yield NaN here.
claims['AdmitDuration'] = admit_span.dt.days + 1

In [7]:
reimbursed = claims.InscClaimAmtReimbursed
deductible = claims.DeductibleAmtPaid

# Total cost of a claim = insurer reimbursement + deductible paid.
claims['ClaimCost'] = reimbursed + deductible
# Cost spread evenly over the (inclusive) claim duration.
claims['DailyClaimCost'] = claims['ClaimCost'] / claims['ClaimDuration']
# Fraction of the total cost covered by the insurer; a zero-cost claim
# gives 0/0 = NaN.
claims['InsReimbursementRatio'] = reimbursed / claims['ClaimCost']

### Variables

In [8]:
# lists:
# The helper is called once and indexed, instead of re-scanning the frame
# for each of the three groups.
dtype_groups     = fxns.cols_by_dtype(claims)
numeric_cols     = dtype_groups[0]
categorical_cols = dtype_groups[1]
date_cols        = dtype_groups[2]

# Column families selected by a substring of the column name.
# The derived flags in phys_count_cols also contain 'Phys' and already exist
# at this point, so they are excluded to keep physician_cols limited to the
# physician columns themselves.
physician_cols = [name for name in claims.columns
                  if 'Phys' in name and name not in phys_count_cols]
chronic_cols   = [name for name in claims.columns if 'Chronic' in name]
diagnosis_cols = [name for name in claims.columns if 'Diagnosis' in name]
procedure_cols = [name for name in claims.columns if 'Procedure' in name]

# dataframes:
# outpatient_claims = claims.loc[claims.IsOutpatient == 1]
# inpatient_claims  = claims.loc[claims.IsOutpatient == 0]

## Providers DF

In [9]:
# One row per observed (Provider, PotentialFraud, IsOutpatient) combination.
# The group sizes are only a vehicle for materialising the keys, so the size
# column (labelled 0 by reset_index) is discarded straight away.
provider_keys = ['Provider', 'PotentialFraud', 'IsOutpatient']
group_sizes = claims.groupby(provider_keys).size()
providers = group_sizes.reset_index().drop(columns=0)

### Beneficiaries

In [10]:
# (new provider feature, source claims column) -- each feature is the mean of
# its source column within a (Provider, IsOutpatient) group.
beneficiary_means = [
    ('Perc_GenderZero', 'Gender'),
    ('Perc_HasRenalDisease', 'RenalDisease'),
    ('Perc_Mean_AgeAtClaim', 'AgeAtClaim'),
    ('Perc_HasDied', 'HasDied'),
    ('Perc_Mean_NumConds', 'NumConds'),
    ('Perc_InsReimbursementRatio', 'InsReimbursementRatio'),
]

by_provider_setting = claims.groupby(['Provider', 'IsOutpatient'])

# The means are written by position (.values), so this relies on the rows of
# `providers` being in the same order as the grouped result.
for feature, source in beneficiary_means:
    providers[feature] = by_provider_setting[source].mean().values

In [11]:
# Share of each (Provider, IsOutpatient) group's claims carrying race code
# 1, 2 or 3. The denominator is the number of claims coded 1, 2, 3 or 5.
#
# Counts are built per key and joined back to `providers` on the key columns.
# The previous list-based version only lined up when every group contained a
# row for every race code, and it was assigned by integer index label, so any
# difference in row count or order would misplace values silently.
race_group_keys = ['Provider', 'IsOutpatient']
race_labels = [(1, 'race1'), (2, 'race2'), (3, 'race3'), (5, 'race5')]

# Only rows with a ClaimID are counted.
has_claim_id = claims.ClaimID.notna()

# One 0/1 indicator column per race code, next to the grouping keys.
race_flags = claims[race_group_keys].copy()
for code, label in race_labels:
    race_flags[label] = ((claims.Race == code) & has_claim_id).astype(int)

# Claim counts per group and race code, then each code's share of the total.
race = race_flags.groupby(race_group_keys).sum()
race_share = race.div(race.sum(axis=1), axis=0)

# A left join on the keys keeps the row order and row count of `providers`
# (the right-hand keys are unique), so .values lines up row for row.
race_aligned = providers[race_group_keys].merge(
    race_share.reset_index(), on=race_group_keys, how='left')

providers['Perc_RaceOne']   = race_aligned['race1'].values
providers['Perc_RaceTwo']   = race_aligned['race2'].values
providers['Perc_RaceThree'] = race_aligned['race3'].values

In [12]:
# Mean of every chronic-condition column per (Provider, IsOutpatient) group,
# stored as 'Perc_<column>'. Values are assigned by position.
by_provider_setting = claims.groupby(['Provider', 'IsOutpatient'])

for chronic_col in chronic_cols:
    providers[f'Perc_{chronic_col}'] = \
        by_provider_setting[chronic_col].mean().values

### Doctors

In [13]:
# Mean of each physician-count flag per (Provider, IsOutpatient) group,
# stored as 'Perc_<flag>'. Values are assigned by position.
by_provider_setting = claims.groupby(['Provider', 'IsOutpatient'])

for flag_col in phys_count_cols:
    providers[f'Perc_{flag_col}'] = \
        by_provider_setting[flag_col].mean().values

### Money

In [14]:
# Monetary claim columns averaged per (Provider, IsOutpatient) group.
money_cols = [
    'InscClaimAmtReimbursed',
    'DeductibleAmtPaid',
    'ClaimCost',
    'DailyClaimCost',
    'IPAnnualReimbursementAmt',
    'IPAnnualDeductibleAmt',
    'OPAnnualReimbursementAmt',
    'OPAnnualDeductibleAmt',
]

by_provider_setting = claims.groupby(['Provider', 'IsOutpatient'])

# Each average lands in 'Mean_<column>'; values are assigned by position.
for money_col in money_cols:
    providers[f'Mean_{money_col}'] = \
        by_provider_setting[money_col].mean().values

### Misc

In [15]:
# Coverage and duration columns averaged per (Provider, IsOutpatient) group.
mean_cols = [
    'NoOfMonths_PartACov',
    'NoOfMonths_PartBCov',
    'ClaimDuration',
    'AdmitDuration',
]

by_provider_setting = claims.groupby(['Provider', 'IsOutpatient'])

# Each average lands in 'Mean_<column>'; values are assigned by position.
for source_col in mean_cols:
    providers[f'Mean_{source_col}'] = \
        by_provider_setting[source_col].mean().values

## Pivot

In [16]:
# The annual amounts already carry an IP/OP split in their names, so their
# per-setting means are removed before the IP/OP pivot below.
already_ip_op_cols = [
    'Mean_IPAnnualReimbursementAmt',
    'Mean_IPAnnualDeductibleAmt',
    'Mean_OPAnnualReimbursementAmt',
    'Mean_OPAnnualDeductibleAmt',
]
providers.drop(columns=already_ip_op_cols, inplace=True)

In [17]:
def _prefix_features(frame, prefix):
    """Return a copy of `frame` with `prefix` prepended to every column name
    except the key columns Provider, PotentialFraud and IsOutpatient."""
    key_cols = ('Provider', 'PotentialFraud', 'IsOutpatient')
    return frame.rename(
        columns=lambda name: name if name in key_cols else f'{prefix}{name}')


# Inpatient rows (IsOutpatient == 0) get 'IP_' feature names,
# outpatient rows (IsOutpatient == 1) get 'OP_' feature names.
ip = _prefix_features(providers[providers.IsOutpatient == 0], 'IP_')
op = _prefix_features(providers[providers.IsOutpatient == 1], 'OP_')

In [18]:
# Annual amounts are averaged per provider over all of its claims (no
# inpatient/outpatient split); the result is indexed by Provider.
already_ip_op_cols = [
    'IPAnnualReimbursementAmt',
    'IPAnnualDeductibleAmt',
    'OPAnnualReimbursementAmt',
    'OPAnnualDeductibleAmt',
]
claims_by_provider = claims.groupby('Provider')
already_ip_op = claims_by_provider[already_ip_op_cols].agg('mean')

In [19]:
# Key columns carried by both halves; dropping them before the merges avoids
# the _x/_y suffixed duplicates that would otherwise need cleaning up.
redundant_keys = ['PotentialFraud', 'IsOutpatient']
# NOTE(review): OP_Mean_AdmitDuration is discarded -- presumably because
# outpatient claims have no admission dates; confirm.
op_features = op.drop(columns=redundant_keys + ['OP_Mean_AdmitDuration'])
ip_features = ip.drop(columns=redundant_keys)

# One row per provider, widened with outpatient features, then inpatient
# features, then the provider-level annual means. Left joins keep providers
# that have only one of the two claim types (their other half is NaN).
providers = (
    claims.groupby('Provider').size().reset_index().drop(columns=0)
    .merge(op_features, on='Provider', how='left')
    .merge(ip_features, on='Provider', how='left')
    .merge(already_ip_op, on='Provider', how='left')
)

# Re-attach the fraud label from the observed (Provider, PotentialFraud) pairs.
pf = claims.groupby(['Provider', 'PotentialFraud']).size() \
           .reset_index().drop(columns=0)

providers = providers.merge(pf, on='Provider')

### Data Import

In [20]:
# Externally prepared provider features. After discarding
# 'percentage_noPhysician', the remaining columns are relabelled by position,
# so the file must contain exactly three other columns in this order.
lucas = pd.read_csv('./data/Lucas_Providers.csv') \
          .drop(columns='percentage_noPhysician')
lucas.columns = ['Provider', 'Perc_DualPatientType', 'HasTop5AdmtCode']

In [21]:
# Columns of the external file that are not carried forward.
# NOTE(review): most look like duplicates of features computed above -- confirm.
ryan_unused_cols = [
    'Unnamed: 0',
    'AllPhy_mean_IP', 'AllPhy_mean_OP',
    'NoPhy_mean_IP', 'NoPhy_mean_OP',
    'ClaimDuration_mean_IP', 'ClaimDuration_mean_OP',
    'InscClaimAmtReimbursed_mean_IP', 'InscClaimAmtReimbursed_mean_OP',
    'AdmisDuration_mean_IP', 'AdmisDuration_mean_OP',
    'AgeAtClm_mean_IP', 'AgeAtClm_mean_OP',
    'DeductibleAmtPaid_mean_IP', 'DeductibleAmtPaid_mean_OP',
    'InsCovRatio_mean_IP', 'InsCovRatio_mean_OP',
    'RevPerDay_mean_IP', 'RevPerDay_mean_OP',
    'Bene_Receive_Both_IO_Perc',
]

# Patient-per-physician ratios get their final names right away.
ryan_renames = {
    'Patient_Attphy_Ratio': 'PatientsPerAttPhys',
    'Patient_Operphy_Ratio': 'PatientsPerOperPhys',
    'Patient_Otherphy_Ratio': 'PatientsPerOthPhys',
}

ryan = (
    pd.read_csv('./data/Ryan_Providers.csv')
    .drop(columns=ryan_unused_cols)
    .rename(columns=ryan_renames)
)

In [22]:
# No join keys are given, so each merge is an inner join on whatever column
# names the two frames share. Providers absent from either external file are
# therefore dropped.
providers = providers.merge(ryan)
providers = providers.merge(lucas)

### Post-processing

In [23]:
# Every remaining missing value becomes 0 (e.g. the inpatient features of a
# provider that has no inpatient rows after the left joins above).
providers = providers.fillna(0)

In [24]:
# Old name -> harmonised name. Keys that are not present among the columns
# are ignored by DataFrame.rename.
provider_renames = {
    'IPAnnualReimbursementAmt': 'IP_AnnualReimbursementAmt',
    'IPAnnualDeductibleAmt': 'IP_AnnualDeductibleAmt',
    'OPAnnualReimbursementAmt': 'OP_AnnualReimbursementAmt',
    'OPAnnualDeductibleAmt': 'OP_AnnualDeductibleAmt',
    'Claim_Patient_Ratio': 'Ratio_ClaimsPerPatient',
    'Claim_AttPhy_Ratio': 'Ratio_ClaimsPerAttPhys',
    'IsOutpatient_Perc': 'Perc_Outpatient',
    'BeneID_Nunique_IP': 'IP_Count_UniquePatients',
    'State_Nunique_IP': 'IP_Count_UniqueState',
    'BeneID_Nunique_OP': 'OP_Count_UniquePatients',
    'State_Nunique_OP': 'OP_Count_UniqueState',
    'Chronic_Sum_mean_IP': 'IP_Mean_SumChronicConds',
    'Chronic_Sum_mean_OP': 'OP_Mean_SumChronicConds',
    'Att_Phy_Mult_Prec': 'Perc_AttPhysMultHosp',
    'Oper_Phy_Mult_Prec': 'Perc_OperPhysMultHosp',
    'Other_Phy_Mult_Prec': 'Perc_OtherPhysMultHosp',
    'IP_Multiple_Hospital_Prec': 'IP_Perc_MultHosp',
    'OP_Multiple_Hospital_Prec': 'OP_Perc_MultHosp',
    'Provider_Serve_BothIO': 'Perc_DualPatientProvider',
    'IP_Dup_Perc': 'IP_Perc_Duplicates',
    'OP_Dup_Perc': 'OP_Perc_Duplicates',
    'IP_No_Proc_Perc': 'IP_Perc_No_ProcCode',
    'OP_No_Diag_Perc': 'OP_Perc_No_DiagCode',
    'In_Top5_State_Perc': 'Perc_ClaimsPerTopFraudState',
    'DualPatientType': 'Perc_DualPatientType',
}
providers.rename(columns=provider_renames, inplace=True)

In [25]:
# Final column names, in the order they appear in the exported table.
final_cols = [
    'Provider',
    'PotentialFraud',
    'Perc_Outpatient',
    'HasTop5AdmtCode',
    'PatientsPerAttPhys',
    'PatientsPerOperPhys',
    'PatientsPerOthPhys',
    'Perc_MultHospAttPhys',
    'Perc_ClaimsPerTopFraudState',
    'Perc_DualPatientProvider',
    'Perc_DualPatientType',
    'Perc_MultHospOperPhys',
    'Perc_MultHospOtherPhys',
    'Ratio_ClaimsPerAttPhys',
    'Ratio_ClaimsPerPatient',
    'IP_AnnualDeductibleAmt',
    'IP_AnnualReimbursementAmt',
    'IP_Count_UniquePatients',
    'IP_Count_UniqueState',
    'IP_Mean_AdmitDuration',
    'IP_Mean_ClaimCost',
    'IP_Mean_ClaimDuration',
    'IP_Mean_DailyClaimCost',
    'IP_Mean_DeductibleAmtPaid',
    'IP_Mean_InscClaimAmtReimbursed',
    'IP_Mean_NoOfMonths_PartACov',
    'IP_Mean_NoOfMonths_PartBCov',
    'IP_Mean_SumChronicConds',
    'IP_Perc_Alzheimers_Chronic',
    'IP_Perc_Cancer_Chronic',
    'IP_Perc_Depression_Chronic',
    'IP_Perc_Diabetes_Chronic',
    'IP_Perc_Dup',
    'IP_Perc_GenderZero',
    'IP_Perc_HasAllPhys',
    'IP_Perc_HasDied',
    'IP_Perc_HasNoPhys',
    'IP_Perc_HasRenalDisease',
    'IP_Perc_HeartFailure_Chronic',
    'IP_Perc_InsReimbursementRatio',
    'IP_Perc_IschemicHeart_Chronic',
    'IP_Perc_KidneyDisease_Chronic',
    'IP_Perc_Mean_AgeAtClaim',
    'IP_Perc_Mean_NumConds',
    'IP_Perc_MultHosp',
    'IP_Perc_No_ProcCode',
    'IP_Perc_ObstrPulmonary_Chronic',
    'IP_Perc_Osteoporosis_Chronic',
    'IP_Perc_RaceOne',
    'IP_Perc_RaceThree',
    'IP_Perc_RaceTwo',
    'IP_Perc_RheumatoidArthritis_Chronic',
    'IP_Perc_Stroke_Chronic',
    'OP_AnnualDeductibleAmt',
    'OP_AnnualReimbursementAmt',
    'OP_Count_UniquePatients',
    'OP_Count_UniqueState',
    'OP_Mean_ClaimCost',
    'OP_Mean_ClaimDuration',
    'OP_Mean_DailyClaimCost',
    'OP_Mean_DeductibleAmtPaid',
    'OP_Mean_InscClaimAmtReimbursed',
    'OP_Mean_NoOfMonths_PartACov',
    'OP_Mean_NoOfMonths_PartBCov',
    'OP_Mean_SumChronicConds',
    'OP_Perc_Alzheimers_Chronic',
    'OP_Perc_Cancer_Chronic',
    'OP_Perc_Depression_Chronic',
    'OP_Perc_Diabetes_Chronic',
    'OP_Perc_Dup',
    'OP_Perc_GenderZero',
    'OP_Perc_HasAllPhys',
    'OP_Perc_HasDied',
    'OP_Perc_HasNoPhys',
    'OP_Perc_HasRenalDisease',
    'OP_Perc_HeartFailure_Chronic',
    'OP_Perc_InsReimbursementRatio',
    'OP_Perc_IschemicHeart_Chronic',
    'OP_Perc_KidneyDisease_Chronic',
    'OP_Perc_Mean_AgeAtClaim',
    'OP_Perc_Mean_NumConds',
    'OP_Perc_MultHosp',
    'OP_Perc_No_DiagCode',
    'OP_Perc_ObstrPulmonary_Chronic',
    'OP_Perc_Osteoporosis_Chronic',
    'OP_Perc_RaceOne',
    'OP_Perc_RaceThree',
    'OP_Perc_RaceTwo',
    'OP_Perc_RheumatoidArthritis_Chronic',
    'OP_Perc_Stroke_Chronic']

# Assigning this list to `providers.columns` relabels by POSITION. The frame
# built above is ordered Provider, OP_*, IP_*, annual amounts, PotentialFraud,
# external features -- not in the order of the list -- so a positional
# assignment attaches every label to the wrong data without raising.
# Instead: rename by key the few columns whose final spelling differs from the
# name produced by the earlier rename step, then select columns by name.
final_name_fixes = {
    'Perc_AttPhysMultHosp': 'Perc_MultHospAttPhys',
    'Perc_OperPhysMultHosp': 'Perc_MultHospOperPhys',
    'Perc_OtherPhysMultHosp': 'Perc_MultHospOtherPhys',
    'IP_Perc_Duplicates': 'IP_Perc_Dup',
    'OP_Perc_Duplicates': 'OP_Perc_Dup',
}
providers = providers.rename(columns=final_name_fixes)

# Fail loudly on any mismatch rather than exporting mislabelled features.
# NOTE(review): this assumes the chronic-condition columns in `claims` are
# named like 'Alzheimers_Chronic', so that 'IP_Perc_<column>' already equals
# the final name -- confirm against the preprocessing output.
missing_cols = [name for name in final_cols if name not in providers.columns]
unexpected_cols = [name for name in providers.columns
                   if name not in final_cols]
if missing_cols or unexpected_cols:
    raise KeyError(
        'providers columns do not match the expected final layout; '
        f'missing: {missing_cols}; unexpected: {unexpected_cols}')

# Name-based selection puts the columns into the final order and is safe to
# re-run.
providers = providers[final_cols]

## Export

In [26]:
# Write the provider table to disk. The row index is written as well (the
# to_csv default), so the file starts with an unnamed index column.
providers.to_csv('./data/Providers_Final.csv', index=True)