## Setup and Data Import

In [1]:
import sys
sys.path.insert(0, '..')

from joblib import load

import Functions as fxns
from Functions import np, pd

# Disable pandas' display truncation so every column and every row is shown
# when a DataFrame is rendered. Be careful to display only small slices
# (.head(), .sample()) while these options are active.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from datetime import timedelta

from sklearn.impute import SimpleImputer

In [2]:
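# Optional: uncomment to run the preprocessing script from inside the notebook
# (presumably it regenerates ../claims.pkl -- confirm before relying on it).
# !python ../Preprocessing.py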

In [3]:
# Load the serialized claims table with joblib.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load a claims.pkl that was produced by a trusted source.
claims = load('../claims.pkl')

## Claims DF

### New Columns

In [4]:
# Age at the start of the claim, in whole years. A year is approximated as
# 365 days and the fraction is truncated by the int cast (the cast raises if
# either date is missing).
years_at_service = (claims['ClaimStartDt'] - claims['DOB']) / timedelta(days=365)
claims['AgeAtService'] = years_at_service.astype(int)

# Flag beneficiaries that have a date of death on record, then pass the
# boolean column through the project's re-encoding helper.
claims['HasDied'] = claims['DOD'].notna()
fxns.re_encode_bool(claims, ['HasDied'])

# Row-wise total over every column whose name contains 'Chronic'.
is_chronic_col = claims.columns.str.contains('Chronic')
chronic_cols = claims.columns[is_chronic_col].to_list()
claims['NumCondsPerBene'] = claims[chronic_cols].sum(axis=1)

In [5]:
# Names of the derived per-claim physician flags created in this cell.
phys_count_cols = \
    ['HasAllPhysicians', 'HasAnyPhysicians', 'HasNoPhysicians']

# Source physician columns only. The derived flag names also contain the
# substring 'Physician', so they are excluded explicitly; otherwise re-running
# this cell would feed the (never-null) flags back into the notna() checks
# below and corrupt the results.
physician_cols = [
    col for col in claims.columns[claims.columns.str.contains('Physician')]
    if col not in phys_count_cols
]

# Compute the presence mask once and derive all three flags from it.
physician_present = claims[physician_cols].notna()
claims['HasAllPhysicians'] = physician_present.all(axis=1)
claims['HasAnyPhysicians'] = physician_present.any(axis=1)
claims['HasNoPhysicians']  = ~physician_present.any(axis=1)

# Pass the boolean flags through the project's re-encoding helper.
fxns.re_encode_bool(claims, phys_count_cols)

In [6]:
# Inclusive day counts: a span that starts and ends on the same day counts
# as 1 day, hence the "+ 1".
duration_spans = {
    'ClaimDuration': ('ClaimStartDt', 'ClaimEndDt'),
    'IPDuration':    ('AdmissionDt', 'DischargeDt'),
}
for duration_col, (start_col, end_col) in duration_spans.items():
    claims[duration_col] = (claims[end_col] - claims[start_col]).dt.days + 1

In [7]:
# Total cost of a claim = insurer reimbursement + deductible paid.
# A missing value in either input propagates as NaN into all three columns.
reimbursed = claims['InscClaimAmtReimbursed']
claims['ClaimCost'] = reimbursed + claims['DeductibleAmtPaid']

# Share of the claim cost covered by insurance, as a rounded percentage.
claims['PercInsCovered'] = ((reimbursed / claims['ClaimCost']) * 100).round()

# Cost per day of the claim, rounded to a whole number.
claims['DailyClaimCost'] = (claims['ClaimCost'] / claims['ClaimDuration']).round()

In [8]:
# Every claims column except the provider identifier
# (list.remove raises ValueError if 'Provider' is absent).
claim_cols = list(claims.columns)
claim_cols.remove('Provider')

### Variables

In [9]:
# lists:
# Call the dtype helper once and reuse its result instead of recomputing it
# for every list. The positions used below (0, 1, 2) are taken to be the
# numeric, categorical and date groups respectively.
cols_grouped_by_dtype = fxns.cols_by_dtype(claims)
numeric_cols     = cols_grouped_by_dtype[0]
categorical_cols = cols_grouped_by_dtype[1]
date_cols        = cols_grouped_by_dtype[2]

# Source physician columns only: the derived Has*Physicians flags (listed in
# phys_count_cols) also contain 'Physician' in their names and already exist
# on `claims` at this point, so they must be filtered out explicitly.
physician_cols   = [
    col for col in claims.columns[claims.columns.str.contains('Physician')]
    if col not in phys_count_cols
]
chronic_cols     = \
    claims.columns[claims.columns.str.contains('Chronic')].to_list()
diagnosis_cols   = \
    claims.columns[claims.columns.str.contains('Diagnosis')].to_list()
procedure_cols   = \
    claims.columns[claims.columns.str.contains('Procedure')].to_list()

# dataframes:
# Split the claims by the IsOutpatient flag (1 = outpatient, 0 = inpatient).
outpatient_claims = claims.loc[claims.IsOutpatient == 1]
inpatient_claims  = claims.loc[claims.IsOutpatient == 0]

## Providers DF

### Data Import

In [12]:
# One row per (PotentialFraud, Provider, IsOutpatient) combination found in
# the claims. The group size is only needed to materialise the groups, so it
# is dropped again right away.
provider_key_cols = ['PotentialFraud', 'Provider', 'IsOutpatient']
providers = (
    claims.groupby(provider_key_cols)
          .size()
          .reset_index(name='_group_size')
          .drop(columns='_group_size')
)

In [13]:
# Provider-level features from a CSV; one column is discarded and the
# remaining three are renamed positionally.
lucas = (
    pd.read_csv('./data/providers.csv')
      .drop(columns='percentage_noPhysician')
)
lucas.columns = ['Provider', 'DualPatientType_Perc', 'HasTop5AdmtCode']

In [14]:
# Map the CSV's column names onto the naming scheme used in this notebook.
ryan_column_names = {
    'Patient_Attphy_Ratio':   'BenesPerAttPhys',
    'Patient_Operphy_Ratio':  'BenesPerOperPhys',
    'Patient_Otherphy_Ratio': 'BenesPerOthPhys',
    'BeneID_Nunique_IP':      'IP_UniqueBenes_Count',
    'BeneID_Nunique_OP':      'OP_UniqueBenes_Count',
    'State_Nunique_IP':       'IP_UniqueSt_Count',
    'State_Nunique_OP':       'OP_UniqueSt_Count',
}

# Second provider-level feature file; 'Unnamed: 0' is dropped before renaming.
ryan = (
    pd.read_csv('./data/Ryan_providers.csv')
      .drop(columns='Unnamed: 0')
      .rename(columns=ryan_column_names)
)

In [15]:
# Attach both external feature tables, one after the other. Each merge uses
# pandas' defaults: an inner join on the columns the two frames share.
for extra_features in (lucas, ryan):
    providers = providers.merge(extra_features)

### Beneficiaries

In [16]:
# Per-(Provider, IsOutpatient) means of beneficiary attributes.
#
# The aggregates are joined on their keys rather than assigned positionally:
# `providers` was built by grouping on PotentialFraud first, so its row order
# differs from a groupby on (Provider, IsOutpatient) alone, and a positional
# `.values` assignment would attach each mean to the wrong provider row.
group_keys = ['Provider', 'IsOutpatient']

# source column in `claims` -> aggregate column in `providers`
bene_mean_names = {
    'Gender':          'GenderZero_Perc',
    'RenalDisease':    'HasRenalDisease_Perc',
    'AgeAtService':    'AgeAtService_Mean',
    'HasDied':         'HasDied_Perc',
    'NumCondsPerBene': 'NumCondsPerBene_Mean',
}

bene_means = (
    claims.groupby(group_keys)[list(bene_mean_names)]
          .mean()
          .rename(columns=bene_mean_names)
          .reset_index()
)

# Drop any copies left by a previous run of this cell so that re-running it
# does not produce suffixed duplicate columns.
providers = (
    providers.drop(columns=list(bene_mean_names.values()), errors='ignore')
             .merge(bene_means, on=group_keys, how='left')
)

In [17]:
# Share of each provider group's claims that carry each chronic-condition
# flag (the mean of every column in chronic_cols), stored as '<col>_Perc'.
#
# Joined on the group keys instead of assigned positionally: `providers` is
# not ordered like a groupby on (Provider, IsOutpatient), so a `.values`
# assignment would put the numbers on the wrong rows.
group_keys = ['Provider', 'IsOutpatient']
chronic_perc_names = {col: f'{col}_Perc' for col in chronic_cols}

chronic_means = (
    claims.groupby(group_keys)[chronic_cols]
          .mean()
          .rename(columns=chronic_perc_names)
          .reset_index()
)

# Remove results of a previous run first so the cell can be re-executed.
providers = (
    providers.drop(columns=list(chronic_perc_names.values()), errors='ignore')
             .merge(chronic_means, on=group_keys, how='left')
)


### Doctors

In [18]:
# Share of each provider group's claims for which each physician flag in
# phys_count_cols is set (the mean of the flag), stored as '<col>_Perc'.
#
# Joined on the group keys instead of assigned positionally: `providers` is
# not ordered like a groupby on (Provider, IsOutpatient), so a `.values`
# assignment would put the numbers on the wrong rows.
group_keys = ['Provider', 'IsOutpatient']
phys_perc_names = {col: f'{col}_Perc' for col in phys_count_cols}

phys_means = (
    claims.groupby(group_keys)[phys_count_cols]
          .mean()
          .rename(columns=phys_perc_names)
          .reset_index()
)

# Remove results of a previous run first so the cell can be re-executed.
providers = (
    providers.drop(columns=list(phys_perc_names.values()), errors='ignore')
             .merge(phys_means, on=group_keys, how='left')
)

### Money

In [19]:
# Claim-level amounts and the cost metrics derived from them.
_claim_amount_cols = [
    'InscClaimAmtReimbursed',
    'DeductibleAmtPaid',
    'PercInsCovered',
    'ClaimCost',
    'DailyClaimCost',
]
# Claim and inpatient-stay lengths in days.
_duration_cols = ['ClaimDuration', 'IPDuration']
# Months of Part A / Part B coverage.
_coverage_cols = ['NoOfMonths_PartACov', 'NoOfMonths_PartBCov']
# Annual inpatient / outpatient reimbursement and deductible amounts.
_annual_amount_cols = [
    'IPAnnualReimbursementAmt',
    'IPAnnualDeductibleAmt',
    'OPAnnualReimbursementAmt',
    'OPAnnualDeductibleAmt',
]

# Columns that are averaged per provider group; the order is significant
# because it determines the order of the resulting provider columns.
money_cols = (
    _claim_amount_cols + _duration_cols + _coverage_cols + _annual_amount_cols
)

In [20]:
# Mean of every column in money_cols per provider group. The values are
# means, but the '<col>_Perc' names are kept because the rest of the notebook
# refers to the columns by these names.
#
# Joined on the group keys instead of assigned positionally: `providers` is
# not ordered like a groupby on (Provider, IsOutpatient), so a `.values`
# assignment would put the numbers on the wrong rows.
group_keys = ['Provider', 'IsOutpatient']
money_mean_names = {col: f'{col}_Perc' for col in money_cols}

money_means = (
    claims.groupby(group_keys)[money_cols]
          .mean()
          .rename(columns=money_mean_names)
          .reset_index()
)

# Remove results of a previous run first so the cell can be re-executed.
providers = (
    providers.drop(columns=list(money_mean_names.values()), errors='ignore')
             .merge(money_means, on=group_keys, how='left')
)

## MISC

In [21]:
# Spot-check a single randomly drawn provider row (unseeded, so the row
# differs between runs).
providers.sample(n=1)

Unnamed: 0,PotentialFraud,Provider,IsOutpatient,DualPatientType_Perc,HasTop5AdmtCode,BenesPerAttPhys,BenesPerOperPhys,BenesPerOthPhys,Att_Phy_Mult,Oper_Phy_Mult,Other_Phy_Mult,IP_Dup_Perc,OP_Dup_Perc,OP_No_Diag_Perc,IP_No_Proc_Perc,In_Top5_St_Perc,IP_UniqueBenes_Count,IP_UniqueSt_Count,OP_UniqueBenes_Count,OP_UniqueSt_Count,GenderZero_Perc,HasRenalDisease_Perc,AgeAtService_Mean,HasDied_Perc,NumCondsPerBene_Mean,Alzheimers_Chronic_Perc,HeartFailure_Chronic_Perc,KidneyDisease_Chronic_Perc,Cancer_Chronic_Perc,ObstrPulmonary_Chronic_Perc,Depression_Chronic_Perc,Diabetes_Chronic_Perc,IschemicHeart_Chronic_Perc,Osteoporosis_Chronic_Perc,RheumatoidArthritis_Chronic_Perc,Stroke_Chronic_Perc,HasAllPhysicians_Perc,HasAnyPhysicians_Perc,HasNoPhysicians_Perc,InscClaimAmtReimbursed_Perc,DeductibleAmtPaid_Perc,PercInsCovered_Perc,ClaimCost_Perc,DailyClaimCost_Perc,ClaimDuration_Perc,IPDuration_Perc,NoOfMonths_PartACov_Perc,NoOfMonths_PartBCov_Perc,IPAnnualReimbursementAmt_Perc,IPAnnualDeductibleAmt_Perc,OPAnnualReimbursementAmt_Perc,OPAnnualDeductibleAmt_Perc
5179,0,PRV56640,1,0.0,0.0,9,19,11,0.26087,0.043478,0.043478,,0.5,0.043478,,0.0,,,23.0,1.0,0.608696,0.434783,74.0,0.0,6.086957,0.652174,0.782609,0.695652,0.391304,0.521739,0.391304,0.73913,0.869565,0.347826,0.391304,0.304348,0.130435,1.0,0.0,8130.434783,1068.0,84.047619,9258.47619,2029.952381,5.956522,5.956522,10.695652,11.565217,17234.347826,2170.608696,1860.434783,537.826087


In [22]:
# Display the full list of column names currently on the providers table.
providers.columns

Index(['PotentialFraud', 'Provider', 'IsOutpatient', 'DualPatientType_Perc',
       'HasTop5AdmtCode', 'BenesPerAttPhys', 'BenesPerOperPhys',
       'BenesPerOthPhys', 'Att_Phy_Mult', 'Oper_Phy_Mult', 'Other_Phy_Mult',
       'IP_Dup_Perc', 'OP_Dup_Perc', 'OP_No_Diag_Perc', 'IP_No_Proc_Perc',
       'In_Top5_St_Perc', 'IP_UniqueBenes_Count', 'IP_UniqueSt_Count',
       'OP_UniqueBenes_Count', 'OP_UniqueSt_Count', 'GenderZero_Perc',
       'HasRenalDisease_Perc', 'AgeAtService_Mean', 'HasDied_Perc',
       'NumCondsPerBene_Mean', 'Alzheimers_Chronic_Perc',
       'HeartFailure_Chronic_Perc', 'KidneyDisease_Chronic_Perc',
       'Cancer_Chronic_Perc', 'ObstrPulmonary_Chronic_Perc',
       'Depression_Chronic_Perc', 'Diabetes_Chronic_Perc',
       'IschemicHeart_Chronic_Perc', 'Osteoporosis_Chronic_Perc',
       'RheumatoidArthritis_Chronic_Perc', 'Stroke_Chronic_Perc',
       'HasAllPhysicians_Perc', 'HasAnyPhysicians_Perc',
       'HasNoPhysicians_Perc', 'InscClaimAmtReimbursed_Perc',
 

In [23]:
# Reference list of column names to check the providers table against.
# The 'MISSING<n>' entries are kept verbatim from the original list.
desc = [
    'PotentialFraud', 'Provider', 'IsOutpatient', 'DeductibleAmtPaid_Perc',
    'MISSING3', 'Alzheimers_Chronic_Perc', 'HeartFailure_Chronic_Perc',
    'KidneyDisease_Chronic_Perc', 'Cancer_Chronic_Perc',
    'ObstrPulmonary_Chronic_Perc', 'Depression_Chronic_Perc',
    'Diabetes_Chronic_Perc', 'IschemicHeart_Chronic_Perc',
    'Osteoporosis_Chronic_Perc', 'RheumatoidArthritis_Chronic_Perc',
    'Stroke_Chronic_Perc', 'MISSING4', 'MISSING5', 'Att_Phy_Mult',
    'Oper_Phy_Mult', 'Other_Phy_Mult', 'MISSING6', 'MISSING7',
    'DualPatientType_Perc', 'HasTop5AdmtCode', 'IP_UniqueBenes_Count',
    'OP_UniqueBenes_Count', 'IP_UniqueSt_Count', 'OP_UniqueSt_Count',
    'BenesPerAttPhys', 'BenesPerOperPhys', 'BenesPerOthPhys',
    'HasAllPhysicians_Perc', 'HasAnyPhysicians_Perc',
    'HasNoPhysicians_Perc', 'HasRenalDisease_Perc', 'HasDied_Perc',
    'ClaimDuration_Perc', 'IPDuration_Perc', 'InscClaimAmtReimbursed_Perc',
    'AgeAtService_Mean', 'NumCondsPerBene_Mean', 'PercInsCovered_Perc',
    'ClaimCost_Perc', 'DailyClaimCost_Perc', 'IP_Dup_Perc', 'OP_Dup_Perc',
    'MISSING20', 'MISSING21', 'OP_No_Diag_Perc', 'IP_No_Proc_Perc',
    'In_Top5_St_Perc', 'GenderZero_Perc', 'NoOfMonths_PartACov_Perc',
    'NoOfMonths_PartBCov_Perc', 'IPAnnualReimbursementAmt_Perc',
    'IPAnnualDeductibleAmt_Perc', 'OPAnnualReimbursementAmt_Perc',
    'OPAnnualDeductibleAmt_Perc',
]

colst = providers.columns.to_list()

# Provider columns that do not appear in `desc`; an empty list means every
# column of the providers table is covered by the reference list.
[col for col in colst if col not in desc]

[]