## Setup and Data Import

In [1]:
import sys
sys.path.insert(0, '..')

from joblib import load

import Functions as fxns
from Functions import np, pd

# Lift pandas' display truncation so frames are shown with every column and row.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

from datetime import timedelta

In [2]:
# !python ../Preprocessing.py

In [3]:
# Load the serialized claims object from the parent directory; the cells below
# use it as a pandas DataFrame (presumably written by ../Preprocessing.py,
# referenced in the commented-out cell above — TODO confirm).
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code — only load artifacts that come from a trusted source.
claims = load('../claims.pkl')

## Claims DF

### New Columns

In [4]:
# Age at claim start, truncated to an integer number of 365-day years.
# NOTE(review): a flat 365-day year ignores leap days, so ages can be
# overstated by one near a birthday — confirm this approximation is acceptable.
time_since_birth = claims['ClaimStartDt'] - claims['DOB']
claims['AgeAtService'] = (time_since_birth / timedelta(days=365)).astype(int)

# A beneficiary counts as deceased when a date of death is recorded; the
# boolean flag is then handed to the project helper for re-encoding.
claims['HasDied'] = claims['DOD'].notna()
fxns.re_encode_bool(claims, ['HasDied'])

# Row-wise total over every column whose name contains 'Chronic'
# (presumably 0/1 indicators, making this a condition count — TODO confirm).
is_chronic_col = claims.columns.str.contains('Chronic')
chronic_cols = claims.columns[is_chronic_col].to_list()
claims['NumCondsPerBene'] = claims[chronic_cols].sum(axis=1)

In [5]:
# Names of the derived physician-presence flags created in this cell.
phys_count_cols = ['HasAllPhysicians', 'HasAnyPhysicians', 'HasNoPhysicians']

# Physician columns are the columns whose name contains 'Physician', minus the
# derived flags. The flag names contain that substring too, so without the
# exclusion a re-run of this cell would feed the (never-missing) flags back
# into the all/any/none computation and corrupt it.
physician_cols = [
    name
    for name in claims.columns[claims.columns.str.contains('Physician')]
    if name not in phys_count_cols
]

# Per-claim presence of each physician column, reduced three ways.
physician_present = claims[physician_cols].notna()
claims['HasAllPhysicians'] = physician_present.all(axis=1)
claims['HasAnyPhysicians'] = physician_present.any(axis=1)
claims['HasNoPhysicians']  = ~physician_present.any(axis=1)

# Hand the boolean flags to the project helper for re-encoding.
fxns.re_encode_bool(claims, phys_count_cols)

In [6]:
# Inclusive day counts: the +1 makes an interval that starts and ends on the
# same date last one day.
claim_span = claims['ClaimEndDt'] - claims['ClaimStartDt']
admission_span = claims['DischargeDt'] - claims['AdmissionDt']

claims['ClaimDuration'] = claim_span.dt.days + 1
claims['IPDuration'] = admission_span.dt.days + 1

In [7]:
# Total claim cost = insurer reimbursement + deductible paid.
reimbursed = claims['InscClaimAmtReimbursed']
claims['ClaimCost'] = reimbursed + claims['DeductibleAmtPaid']

# Insurer share of the cost as a rounded percentage, and the rounded cost
# per day of claim duration.
claims['PercInsCovered'] = (reimbursed / claims['ClaimCost'] * 100).round()
claims['DailyClaimCost'] = (claims['ClaimCost'] / claims['ClaimDuration']).round()

### Variables

In [8]:
# lists:
# Call the dtype helper once and index its result, instead of recomputing the
# same grouping three times (assumes cols_by_dtype has no side effects —
# TODO confirm).
dtype_groups     = fxns.cols_by_dtype(claims)
numeric_cols     = dtype_groups[0]
categorical_cols = dtype_groups[1]
date_cols        = dtype_groups[2]

# By this point `claims` also holds the derived flags listed in
# `phys_count_cols` (HasAllPhysicians, ...), whose names contain 'Physician'.
# Exclude them so this list holds only the original physician columns.
physician_cols   = [
    name
    for name in claims.columns[claims.columns.str.contains('Physician')]
    if name not in phys_count_cols
]
chronic_cols     = \
    claims.columns[claims.columns.str.contains('Chronic')].to_list()
diagnosis_cols   = \
    claims.columns[claims.columns.str.contains('Diagnosis')].to_list()
procedure_cols   = \
    claims.columns[claims.columns.str.contains('Procedure')].to_list()

# dataframes:
# Split the claims on the IsOutpatient indicator (1 = outpatient, 0 = inpatient).
outpatient_claims = claims.loc[claims.IsOutpatient == 1]
inpatient_claims  = claims.loc[claims.IsOutpatient == 0]

## Providers DF

### Data Import

In [9]:
# One row per observed (PotentialFraud, Provider, IsOutpatient) combination.
# The group sizes are only a vehicle for enumerating the combinations, so the
# count column is discarded straight away.
provider_keys = ['PotentialFraud', 'Provider', 'IsOutpatient']
providers = (
    claims.groupby(provider_keys)
    .size()
    .reset_index(name='_n_claims')
    .drop(columns='_n_claims')
)

In [10]:
# Provider-level features from Lucas' CSV, without the no-physician percentage.
lucas = (
    pd.read_csv('./data/Lucas_Providers.csv')
    .drop(columns='percentage_noPhysician')
)
# Positional rename: relies on exactly three columns remaining, in this order.
lucas.columns = ['Provider', 'DualPatientType_Perc', 'HasTop5AdmtCode']

In [11]:
# Columns of Ryan's CSV that are not carried into the providers table.
ryan_dropped_cols = [
    'Unnamed: 0',
    'AllPhy_mean_IP', 'AllPhy_mean_OP',
    'NoPhy_mean_IP', 'NoPhy_mean_OP',
    'ClaimDuration_mean_IP', 'ClaimDuration_mean_OP',
    'InscClaimAmtReimbursed_mean_IP', 'InscClaimAmtReimbursed_mean_OP',
    'AdmisDuration_mean_IP', 'AdmisDuration_mean_OP',
    'AgeAtClm_mean_IP', 'AgeAtClm_mean_OP',
    'DeductibleAmtPaid_mean_IP', 'DeductibleAmtPaid_mean_OP',
    'InsCovRatio_mean_IP', 'InsCovRatio_mean_OP',
    'RevPerDay_mean_IP', 'RevPerDay_mean_OP',
    'Bene_Receive_Both_IO_Perc',
]

# Original column name -> name used in this notebook.
ryan_renamed_cols = {
    'Patient_Attphy_Ratio': 'PatientsPerAttPhys',
    'Patient_Operphy_Ratio': 'PatientsPerOperPhys',
    'Patient_Otherphy_Ratio': 'PatientsPerOthPhys',
    'BeneID_Nunique_IP': 'UniquePatients_IP_Count',
    'BeneID_Nunique_OP': 'UniquePatients_OP_Count',
    'State_Nunique_IP': 'UniqueSt_Count_IP',
    'State_Nunique_OP': 'UniqueSt_Count_OP',
    'Claim_Patient_Ratio': 'ClaimsPerPatient_Ratio',
    'Claim_AttPhy_Ratio': 'ClaimsPerAttPhys_Ratio',
    'IsOutpatient_Perc': 'Outpatient_Perc',
    'Chronic_Sum_mean_IP': 'SumChronicConds_IP_Mean',
    'Chronic_Sum_mean_OP': 'SumChronicConds_OP_Mean',
    'Att_Phy_Mult_Prec': 'AttPhysMultHosp_Perc',
    'Oper_Phy_Mult_Prec': 'OperPhysMultHosp_Perc',
    'Other_Phy_Mult_Prec': 'OtherPhysMultHosp_Perc',
    'Provider_Serve_BothIO': 'DualPatientProvider_Perc',
    'In_Top5_State_Perc': 'ClaimsPerTopFraudSt_Perc',
    'IP_Multiple_Hospital_Prec': 'MultHosp_IP_Perc',
    'OP_Multiple_Hospital_Prec': 'MultHosp_OP_Perc',
}

# Read, prune and rename in one chain, then show the resulting column set.
ryan = (
    pd.read_csv('./data/Ryan_Providers.csv')
    .drop(columns=ryan_dropped_cols)
    .rename(columns=ryan_renamed_cols)
)
print(ryan.columns)

Index(['Provider', 'PatientsPerAttPhys', 'PatientsPerOperPhys',
       'PatientsPerOthPhys', 'ClaimsPerPatient_Ratio',
       'ClaimsPerAttPhys_Ratio', 'Outpatient_Perc', 'UniquePatients_IP_Count',
       'UniqueSt_Count_IP', 'UniquePatients_OP_Count', 'UniqueSt_Count_OP',
       'SumChronicConds_IP_Mean', 'SumChronicConds_OP_Mean',
       'AttPhysMultHosp_Perc', 'OperPhysMultHosp_Perc',
       'OtherPhysMultHosp_Perc', 'MultHosp_IP_Perc', 'MultHosp_OP_Perc',
       'DualPatientProvider_Perc', 'IP_Dup_Perc', 'OP_Dup_Perc',
       'OP_No_Diag_Perc', 'IP_No_Proc_Perc', 'ClaimsPerTopFraudSt_Perc',
       'PotentialFraud'],
      dtype='object')


In [12]:
# Attach the externally prepared provider-level features (inner joins).
# The join keys are spelled out rather than inferred from whatever column
# names happen to overlap: `lucas` shares only 'Provider' with `providers`,
# while `ryan` also carries 'PotentialFraud'.
# validate='many_to_one' makes the merge raise if a feature table has more
# than one row per key, instead of silently multiplying provider rows.
providers = (
    providers
    .merge(lucas, on='Provider', validate='many_to_one')
    .merge(ryan, on=['Provider', 'PotentialFraud'], validate='many_to_one')
)

### Beneficiaries

In [13]:
# Per-(Provider, IsOutpatient) means of beneficiary attributes.
#
# The aggregates are matched to `providers` rows BY KEY rather than by
# position. `providers` was built from a groupby on
# (PotentialFraud, Provider, IsOutpatient) and is therefore ordered by
# PotentialFraud first, whereas a groupby on (Provider, IsOutpatient) is
# ordered by Provider — assigning its raw `.values` would attach each
# aggregate to the wrong provider row.
group_keys = ['Provider', 'IsOutpatient']

# New providers column -> claims column it is the group mean of.
# NOTE(review): 'GenderZero_Perc' is the mean of `Gender`; whether that equals
# the share of gender "zero" depends on how Gender is encoded — TODO confirm.
bene_feature_sources = {
    'GenderZero_Perc': 'Gender',
    'HasRenalDisease_Perc': 'RenalDisease',
    'AgeAtService_Mean': 'AgeAtService',
    'HasDied_Perc': 'HasDied',
    'NumCondsPerBene_Mean': 'NumCondsPerBene',
}

bene_means = claims.groupby(group_keys)[
    list(bene_feature_sources.values())].mean()
provider_row_keys = pd.MultiIndex.from_frame(providers[group_keys])

for feature_name, source_col in bene_feature_sources.items():
    # reindex looks every providers row up by its (Provider, IsOutpatient) key.
    providers[feature_name] = \
        bene_means[source_col].reindex(provider_row_keys).to_numpy()

In [14]:
# Group mean of every chronic-condition column, stored as '<col>_Perc'.
# The means are aligned to `providers` by (Provider, IsOutpatient) key, not by
# position: `providers` is ordered by PotentialFraud first, so the raw
# `.values` of a (Provider, IsOutpatient) groupby would not line up row for row.
group_keys = ['Provider', 'IsOutpatient']
chronic_means = claims.groupby(group_keys)[chronic_cols].mean()
provider_row_keys = pd.MultiIndex.from_frame(providers[group_keys])

for col in chronic_cols:
    colname = f'{col}_Perc'
    providers[colname] = \
        chronic_means[col].reindex(provider_row_keys).to_numpy()

### Doctors

In [15]:
# Group mean of every physician-presence flag, stored as '<col>_Perc'.
# The means are aligned to `providers` by (Provider, IsOutpatient) key, not by
# position: `providers` is ordered by PotentialFraud first, so the raw
# `.values` of a (Provider, IsOutpatient) groupby would not line up row for row.
group_keys = ['Provider', 'IsOutpatient']
phys_flag_means = claims.groupby(group_keys)[phys_count_cols].mean()
provider_row_keys = pd.MultiIndex.from_frame(providers[group_keys])

for col in phys_count_cols:
    colname = f'{col}_Perc'
    providers[colname] = \
        phys_flag_means[col].reindex(provider_row_keys).to_numpy()

### Money

In [16]:
# Claim-level columns that get averaged per provider in the next cell:
# amounts, durations, coverage months and annual totals.
money_cols = [
    'InscClaimAmtReimbursed',
    'DeductibleAmtPaid',
    'PercInsCovered',
    'ClaimCost',
    'DailyClaimCost',
    'ClaimDuration',
    'IPDuration',
    'NoOfMonths_PartACov',
    'NoOfMonths_PartBCov',
    'IPAnnualReimbursementAmt',
    'IPAnnualDeductibleAmt',
    'OPAnnualReimbursementAmt',
    'OPAnnualDeductibleAmt',
]

In [17]:
# Group mean of every column in `money_cols`, stored as '<col>_Perc'.
# NOTE(review): these are means, not percentages; the '_Perc' suffix is kept
# only so existing output column names do not change ('_Mean' would match the
# naming used for AgeAtService_Mean / NumCondsPerBene_Mean).
#
# The means are aligned to `providers` by (Provider, IsOutpatient) key, not by
# position: `providers` is ordered by PotentialFraud first, so the raw
# `.values` of a (Provider, IsOutpatient) groupby would not line up row for row.
group_keys = ['Provider', 'IsOutpatient']
money_means = claims.groupby(group_keys)[money_cols].mean()
provider_row_keys = pd.MultiIndex.from_frame(providers[group_keys])

for col in money_cols:
    colname = f'{col}_Perc'
    providers[colname] = \
        money_means[col].reindex(provider_row_keys).to_numpy()

## Pivot

In [18]:
# Reshape from one row per (provider, setting) to one row per provider:
# inpatient features get an 'IP_' prefix, outpatient features an 'OP_' prefix,
# and the two halves are outer-joined on the shared identifier columns.
# NOTE(review): the prefix is applied to every other column, including
# IsOutpatient and columns already tagged IP/OP (e.g. 'IP_Dup_Perc') —
# confirm the resulting names are intended.
is_inpatient_row = providers.IsOutpatient == 0
is_outpatient_row = providers.IsOutpatient == 1

ip = (
    providers[is_inpatient_row]
    .add_prefix('IP_')
    .rename(columns={'IP_Provider': 'Provider',
                     'IP_PotentialFraud': 'PotentialFraud'})
)
op = (
    providers[is_outpatient_row]
    .add_prefix('OP_')
    .rename(columns={'OP_Provider': 'Provider',
                     'OP_PotentialFraud': 'PotentialFraud'})
)
providers = ip.merge(op, on=['Provider', 'PotentialFraud'], how='outer')

## Export

In [19]:
# Replace every missing value with 0 (e.g. the IP_/OP_ features of providers
# that appear in only one half of the outer join).
providers = providers.fillna(0)

In [21]:
# providers.to_csv('./data/Providers_Final.csv')