## Setup and Data Import

In [1]:
import sys
sys.path.insert(0, '..')

from joblib import load

import Functions as fxns
from Functions import np, pd

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from datetime import timedelta

from sklearn.impute import SimpleImputer

In [2]:
# !python ../Preprocessing.py

In [3]:
claims = load('../claims.pkl')

## Claims DF

### New Columns

In [4]:
claims['AgeAtService'] = ((claims.ClaimStartDt - claims.DOB)
                              / timedelta(days=365)).astype(int)

claims['HasDied'] = claims.DOD.notna()
fxns.re_encode_bool(claims, ['HasDied'])

In [5]:
physician_cols = \
    claims.columns[claims.columns.str.contains('Physician')].to_list()
phys_count_cols  = ['HasAllPhysicians', 'HasAnyPhysicians', 'HasNoPhysicians']

claims['HasAllPhysicians'] = claims[physician_cols].notna().all(axis=1)
claims['HasAnyPhysicians'] = claims[physician_cols].notna().any(axis=1)
claims['HasNoPhysicians']  = claims[physician_cols].isna().all(axis=1)

fxns.re_encode_bool(claims, phys_count_cols)

In [6]:
claims['ClaimDuration'] = \
    (claims.ClaimEndDt  - claims.ClaimStartDt).dt.days + 1
claims['IPDuration']    = \
    (claims.DischargeDt - claims.AdmissionDt).dt.days + 1

In [7]:
claims['ClaimCost'] = \
    claims.InscClaimAmtReimbursed + claims.DeductibleAmtPaid
claims['PercInsCovered'] = \
    round((claims.InscClaimAmtReimbursed / claims.ClaimCost) * 100)
claims['DailyClaimCost'] = \
    round(claims.ClaimCost / claims.ClaimDuration)

In [8]:
claim_cols = claims.columns.to_list()
claim_cols.remove('Provider')

### Variables

In [9]:
# lists:
numeric_cols     = fxns.cols_by_dtype(claims)[0]
categorical_cols = fxns.cols_by_dtype(claims)[1]
date_cols        = fxns.cols_by_dtype(claims)[2]
physician_cols   = \
    claims.columns[claims.columns.str.contains('Physician')].to_list()
chronic_cols     = \
    claims.columns[claims.columns.str.contains('Chronic')].to_list()
diagnosis_cols   = \
    claims.columns[claims.columns.str.contains('Diagnosis')].to_list()
procedure_cols   = \
    claims.columns[claims.columns.str.contains('Procedure')].to_list()

# dataframes:
outpatient_claims = claims.loc[claims.IsOutpatient == 1]
inpatient_claims  = claims.loc[claims.IsOutpatient == 0]

## Providers DF

### Data Import

In [10]:
providers = claims.groupby(['PotentialFraud', 'Provider', 'IsOutpatient'
                           ]).size().reset_index().drop(0, axis=1)

In [11]:
lucas = pd.read_csv('./data/providers.csv')
lucas.columns = ['Provider', 'DualPatientType_Perc', 'HasNoPhys_Perc', 'HasTop5AdmtCode']

In [12]:
ryan = pd.read_csv('./data/Ryan_providers.csv')
ryan.drop('Unnamed: 0', axis=1, inplace=True)
ryan.rename(columns = {'Patient_Attphy_Ratio': 'BenesPerAttPhys',
                'Patient_Operphy_Ratio': 'BenesPerOperPhys',
                'Patient_Otherphy_Ratio': 'BenesPerOthPhys',
                'BeneID_Nunique_IP': 'IP_UniqueBenes_Count',
                'BeneID_Nunique_OP': 'OP_UniqueBenes_Count',
                'State_Nunique_IP': 'IP_UniqueSt_Count',
                'State_Nunique_OP': 'OP_UniqueSt_Count'}, inplace=True)

In [13]:
providers = providers.merge(lucas).merge(ryan)

### Beneficiaries

In [14]:
providers['GenderZero_Perc'] = claims.groupby(['Provider', 'IsOutpatient']).Gender.mean().values
providers['HasRenalDisease_Perc'] = claims.groupby(['Provider', 'IsOutpatient']).RenalDisease.mean().values
providers['HasDied_Perc'] = claims.groupby(['Provider', 'IsOutpatient']).HasDied.mean().values

In [15]:
for col in chronic_cols:
    colname = f'{col}_Perc'
    providers[colname] = claims.groupby(['Provider', 'IsOutpatient'])[col].mean().values

### Doctors

In [16]:
for col in phys_count_cols:
    colname = f'{col}_Perc'
    providers[colname] = claims.groupby(['Provider', 'IsOutpatient'])[col].mean().values

### Money

In [19]:
money_cols = \
    ['InscClaimAmtReimbursed', 'DeductibleAmtPaid',
     'NoOfMonths_PartACov', 'NoOfMonths_PartBCov', 'AgeAtService',
     'HasDied', 'PercInsCovered',
     'HasAllPhysicians', 'HasAnyPhysicians', 'HasNoPhysicians',
     'ClaimDuration', 'IPDuration', 'ClaimCost', 'DailyClaimCost',
     'IPAnnualReimbursementAmt', 'IPAnnualDeductibleAmt',
     'OPAnnualReimbursementAmt', 'OPAnnualDeductibleAmt']

In [20]:
for col in money_cols:
    colname = f'{col}_Perc'
    providers[colname] = claims.groupby(['Provider', 'IsOutpatient'])[col].mean().values

## MISC

In [21]:
providers.head(12)

Unnamed: 0,PotentialFraud,Provider,IsOutpatient,DualPatientType_Perc,HasNoPhys_Perc,HasTop5AdmtCode,BenesPerAttPhys,BenesPerOperPhys,BenesPerOthPhys,Att_Phy_Mult,Oper_Phy_Mult,Other_Phy_Mult,IP_Dup_Perc,OP_Dup_Perc,OP_No_Diag_Perc,IP_No_Proc_Perc,In_Top5_St_Perc,IP_UniqueBenes_Count,IP_UniqueSt_Count,OP_UniqueBenes_Count,OP_UniqueSt_Count,GenderZero_Perc,HasRenalDisease_Perc,HasDied_Perc,Alzheimers_Chronic_Perc,HeartFailure_Chronic_Perc,KidneyDisease_Chronic_Perc,Cancer_Chronic_Perc,ObstrPulmonary_Chronic_Perc,Depression_Chronic_Perc,Diabetes_Chronic_Perc,IschemicHeart_Chronic_Perc,Osteoporosis_Chronic_Perc,RheumatoidArthritis_Chronic_Perc,Stroke_Chronic_Perc,HasAllPhysicians_Perc,HasAnyPhysicians_Perc,HasNoPhysicians_Perc,InscClaimAmtReimbursed_Perc,DeductibleAmtPaid_Perc,NoOfMonths_PartACov_Perc,NoOfMonths_PartBCov_Perc,AgeAtService_Perc,PercInsCovered_Perc,ClaimDuration_Perc,ClaimCost_Perc,DailyClaimCost_Perc,IPAnnualReimbursementAmt_Perc,IPAnnualDeductibleAmt_Perc,OPAnnualReimbursementAmt_Perc,OPAnnualDeductibleAmt_Perc,IPDuration_Perc
0,0,PRV51001,0,0.04,,0.04,10,19,16,0.24,0.04,0.0,0.0,0.55,0.0,0.6,0.0,5.0,1.0,19.0,1.0,0.4,0.4,0.0,0.4,0.8,0.8,0.2,0.4,0.8,0.8,0.8,0.0,0.6,0.4,0.2,1.0,0.0,19400.0,1068.0,12.0,12.0,77.6,88.0,6.0,20468.0,4077.4,77902.0,2563.2,1350.0,236.0,6.0
1,0,PRV51001,1,0.04,,0.04,10,19,16,0.24,0.04,0.0,0.0,0.55,0.0,0.6,0.0,5.0,1.0,19.0,1.0,0.35,0.3,0.0,0.65,0.75,0.65,0.2,0.4,0.25,0.85,0.95,0.3,0.25,0.2,0.05,1.0,0.0,382.0,0.0,12.0,12.0,77.95,100.0,1.55,382.0,307.0,2532.0,480.6,2931.5,520.9,
2,0,PRV51004,1,0.013423,,0.013423,100,119,112,0.167785,0.013423,0.0,,0.461538,0.040268,,0.0,,,138.0,9.0,0.33871,0.274194,0.016129,0.516129,0.580645,0.629032,0.112903,0.370968,0.403226,0.790323,0.887097,0.209677,0.306452,0.112903,0.0,1.0,0.0,9241.935484,1068.0,11.806452,11.806452,69.935484,82.048387,6.16129,10309.935484,2385.064516,12696.612903,1604.064516,2217.741935,724.83871,6.16129
3,0,PRV51007,0,0.027778,,0.027778,48,53,51,0.597222,0.083333,0.027778,0.0,0.42029,0.0,0.666667,0.0,3.0,1.0,56.0,2.0,0.471429,0.171429,0.0,0.342857,0.628571,0.357143,0.042857,0.257143,0.414286,0.728571,0.814286,0.285714,0.271429,0.071429,0.057143,1.0,0.0,466.714286,1.0,11.828571,11.928571,68.371429,99.405797,3.357143,467.714286,336.428571,3025.857143,335.657143,3086.0,748.0,
4,0,PRV51007,1,0.027778,,0.027778,48,53,51,0.597222,0.083333,0.027778,0.0,0.42029,0.0,0.666667,0.0,3.0,1.0,56.0,2.0,0.308725,0.154362,0.006711,0.42953,0.590604,0.33557,0.107383,0.275168,0.422819,0.704698,0.724832,0.328859,0.308725,0.114094,0.080537,1.0,0.0,350.134228,2.080537,11.865772,11.959732,71.302013,97.847222,2.42953,352.214765,250.362416,4351.879195,434.95302,2194.899329,622.751678,
5,0,PRV51008,0,0.0,,0.0,26,31,31,0.255814,0.046512,0.093023,0.0,0.35,0.02439,0.0,0.0,2.0,1.0,34.0,2.0,0.438627,0.222318,0.003433,0.365665,0.583691,0.435193,0.141631,0.253219,0.416309,0.685837,0.76824,0.295279,0.28412,0.106438,0.08412,0.998283,0.001717,241.124464,3.175966,11.907296,11.939914,69.567382,98.073386,2.088412,244.300429,196.533906,3623.991416,379.162232,2109.733906,636.328755,
6,0,PRV51008,1,0.0,,0.0,26,31,31,0.255814,0.046512,0.093023,0.0,0.35,0.02439,0.0,0.0,2.0,1.0,34.0,2.0,0.333333,0.333333,0.0,0.666667,1.0,0.333333,0.0,0.0,0.666667,1.0,1.0,0.0,0.333333,0.666667,0.0,1.0,0.0,6333.333333,1068.0,12.0,12.0,78.0,83.0,6.333333,7401.333333,1255.666667,11710.0,2136.0,2413.333333,470.0,6.333333
7,0,PRV51011,0,0.017241,,0.017241,25,43,38,0.586207,0.068966,0.068966,0.0,0.423077,0.087719,1.0,0.0,1.0,1.0,52.0,1.0,0.478261,0.144928,0.014493,0.347826,0.536232,0.304348,0.173913,0.231884,0.391304,0.666667,0.695652,0.304348,0.304348,0.144928,0.115942,1.0,0.0,213.188406,0.869565,11.826087,11.826087,67.956522,99.220588,1.768116,214.057971,199.695652,2673.478261,371.478261,1700.0,469.710145,
8,0,PRV51011,1,0.017241,,0.017241,25,43,38,0.586207,0.068966,0.068966,0.0,0.423077,0.087719,1.0,0.0,1.0,1.0,52.0,1.0,0.5,0.0,0.0,0.5,0.0,0.5,0.5,0.5,0.0,0.5,1.0,0.0,0.0,0.0,0.0,1.0,0.0,12500.0,1068.0,12.0,12.0,50.5,87.0,5.0,13568.0,3181.0,18750.0,1602.0,320.0,165.0,5.0
9,0,PRV51012,1,0.041667,0.020833,0.041667,17,28,18,0.5625,0.0,0.125,,0.416667,0.0,,0.0,,,31.0,3.0,0.439024,0.243902,0.0,0.390244,0.609756,0.317073,0.195122,0.195122,0.292683,0.707317,0.756098,0.268293,0.146341,0.04878,0.04878,1.0,0.0,259.268293,4.390244,12.0,12.0,75.829268,99.05,2.414634,263.658537,174.341463,5999.02439,573.073171,2680.243902,638.04878,
