## Setup and Data Import

In [15]:
import sys
sys.path.insert(0, '..')

from joblib import load

import Sita_Functions as fxns
from Sita_Functions import np, pd

pd.set_option('display.max_columns', None)

import plotly.express as px

import statsmodels.api as sm

In [2]:
# !python ../Sita_Preprocessing.py

In [3]:
# Load the claims DataFrame from a joblib pickle, resolved relative to the
# current working directory. Presumably the file is written by
# ../Sita_Preprocessing.py (see the disabled command in the previous cell)
# -- TODO confirm.
# NOTE: joblib.load unpickles arbitrary objects; only load trusted files.
claims = load('claims.pkl')

In [4]:
# claims.sample(10)

In [5]:
# fxns.explore_df(claims)

In [6]:
# Group the columns of `claims` by dtype. The helper is called once and its
# result reused, rather than re-scanning the whole frame for each of the
# three lists.
# NOTE(review): assumes fxns.cols_by_dtype is deterministic and returns an
# indexable result ordered as (numeric, categorical, date) -- the ordering is
# inferred from the original indexing; confirm against Sita_Functions.
cols_grouped_by_dtype = fxns.cols_by_dtype(claims)
numeric_cols = cols_grouped_by_dtype[0]
categorical_cols = cols_grouped_by_dtype[1]
date_cols = cols_grouped_by_dtype[2]

## Diagnosis and Procedure Codes

### Specific Code to PotentialFraud

In [7]:
# Collect the column names of the diagnosis-code and procedure-code fields by
# substring match on the column label.
diagnosis_cols = [col for col in claims.columns if 'ClmDiagnosis' in col]
procedure_cols = [col for col in claims.columns if 'Procedure' in col]

In [8]:
# px.scatter(claims, 'ClmDiagnosisCode_3', 'PotentialFraud')

# Finding: the diagnosis codes with the least missingness (ClmDiagnosisCode_1,
# _2, and _3) don't seem to have a relationship with PotentialFraud.

In [9]:
# px.scatter(claims, 'ClmProcedureCode_3', 'PotentialFraud')

# Finding: the procedure codes (ClmProcedureCode_*) all have missingness above
# 95% and don't seem to have a relationship with PotentialFraud.

### Code Quantity by Provider to PotentialFraud

In [10]:
# For each provider, count the non-null values in every column of `claims`
# (groupby count() skips missing entries). Each cell is therefore the number
# of that provider's rows in which the field is populated.
counts_by_provider = claims.groupby('Provider').count()

In [11]:
# px.scatter(counts_by_provider[diagnosis_cols].sum(axis=1))

In [12]:
# px.scatter(counts_by_provider[procedure_cols].sum(axis=1))

### Consolidating Diagnosis and Procedure Codes

In [13]:
# fxns.dummify(claims)
# fxns.consolidate(claims)

## Feature Engineering

In [14]:
# Display the per-provider non-null counts computed earlier with
# claims.groupby('Provider').count().
counts_by_provider

Unnamed: 0_level_0,BeneID,ClaimID,ClaimStartDt,ClaimEndDt,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,AdmissionDt,ClmAdmitDiagnosisCode,DeductibleAmtPaid,DischargeDt,DiagnosisGroupCode,ClmDiagnosisCode_1,ClmDiagnosisCode_2,ClmDiagnosisCode_3,ClmDiagnosisCode_4,ClmDiagnosisCode_5,ClmDiagnosisCode_6,ClmDiagnosisCode_7,ClmDiagnosisCode_8,ClmDiagnosisCode_9,ClmDiagnosisCode_10,ClmProcedureCode_1,ClmProcedureCode_2,ClmProcedureCode_3,ClmProcedureCode_4,ClmProcedureCode_5,ClmProcedureCode_6,IsOutpatient,DOB,DOD,Gender,Race,RenalDiseaseIndicator,State,County,NoOfMonths_PartACov,NoOfMonths_PartBCov,ChronicCond_Alzheimer,ChronicCond_Heartfailure,ChronicCond_KidneyDisease,ChronicCond_Cancer,ChronicCond_ObstrPulmonary,ChronicCond_Depression,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,PotentialFraud
Provider,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1
PRV51001,25,25,25,25,25,25,5,10,5,11,25,5,5,24,16,10,8,8,8,3,2,1,0,2,1,0,0,0,0,25,25,0,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25
PRV51003,132,132,132,132,132,132,45,25,62,78,132,62,62,132,112,92,78,66,58,55,50,45,5,39,9,0,0,0,0,132,132,1,132,132,132,132,132,132,132,132,132,132,132,132,132,132,132,132,132,132,132,132,132,132,132
PRV51004,149,149,149,149,149,149,27,63,0,28,149,0,0,143,91,56,34,24,16,9,8,4,0,0,0,0,0,0,0,149,149,1,149,149,149,149,149,149,149,149,149,149,149,149,149,149,149,149,149,149,149,149,149,149,149
PRV51005,1165,1165,1165,1165,1165,1163,222,478,0,252,1165,0,0,1149,723,455,287,164,100,68,43,26,1,0,0,0,0,0,0,1165,1165,4,1165,1165,1165,1165,1165,1165,1165,1165,1165,1165,1165,1165,1165,1165,1165,1165,1165,1165,1165,1165,1165,1165,1165
PRV51007,72,72,72,72,72,72,12,26,3,17,72,3,3,72,46,29,21,14,12,9,6,5,1,1,0,0,0,0,0,72,72,1,72,72,72,72,72,72,72,72,72,72,72,72,72,72,72,72,72,72,72,72,72,72,72
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PRV57759,28,28,28,28,28,28,1,12,0,3,28,0,0,27,16,8,5,2,0,0,0,0,0,0,0,0,0,0,0,28,28,0,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28
PRV57760,22,22,22,22,22,22,6,9,0,7,22,0,0,22,11,8,5,3,2,1,0,0,0,0,0,0,0,0,0,22,22,0,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22
PRV57761,82,82,82,82,82,82,14,36,0,18,82,0,0,82,55,32,23,11,8,4,2,2,0,0,0,0,0,0,0,82,82,1,82,82,82,82,82,82,82,82,82,82,82,82,82,82,82,82,82,82,82,82,82,82,82
PRV57762,1,1,1,1,1,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
