In [11]:
import numpy as np
import pandas as pd 
import matplotlib as plt 
%matplotlib inline 

In [12]:
## Load the four raw tables (beneficiaries, inpatient claims, outpatient claims, providers)
benef, inpatient, outpatient, provider = map(
    pd.read_csv,
    [
        './Datathon2020data/beneficiary.csv',
        './Datathon2020data/inpatients.csv',
        './Datathon2020data/outpatients.csv',
        './Datathon2020data/providers.csv',
    ],
)

In [108]:
## Tag every claim with its source table (1 = inpatient, 0 = outpatient) and stack both tables
inpatient['Is_inpatient'], outpatient['Is_inpatient'] = 1, 0

# Attach the provider columns to each claim (default inner join on PID)
claims = pd.concat([inpatient, outpatient], axis=0).merge(provider, on='PID')

In [109]:
# Enrich each claim with its beneficiary's columns; the left join keeps every
# claim row even when no beneficiary record matches its BID
all_info = claims.merge(benef, how='left', on='BID')

In [110]:
# Parse the date columns into pandas datetime values
date_cols = ['StartDt', 'EndDt', 'DOB', 'DOD', 'AdmissionDt']
all_info[date_cols] = all_info[date_cols].apply(pd.to_datetime)

In [111]:
# New variables derived from the date columns.
# `.dt.days` is the vectorised equivalent of `.apply(lambda x: x.days)`;
# a missing date on either side yields NaN.

# Length of the claim in whole days
all_info['NumOfClaimDays'] = (all_info['EndDt'] - all_info['StartDt']).dt.days

# Age at claim start, in years. The raw day count is divided by 365 so the
# unit matches Death_age below (previously this column held days).
all_info['Age'] = (all_info['StartDt'] - all_info['DOB']).dt.days / 365

# Age at death, in years
all_info['Death_age'] = (all_info['DOD'] - all_info['DOB']).dt.days / 365

In [112]:
# output full df
#all_info.to_csv("./Datathon2020data/full_data.csv", index=False) 

In [113]:
## TODO: drop columns that are not needed - CID, StartDt, EndDt (this cell does not drop them yet)

In [114]:
# Number of non-null diagnosis / procedure codes recorded on each claim.
# Each label slice picks every column positioned from the first label through the last.
all_info['DiagnosisCode_Num'] = (
    all_info.loc[:, 'DiagnosisCode_1':'DiagnosisCode_10'].notna().sum(axis=1)
)
all_info['ProcedureCode_Num'] = (
    all_info.loc[:, 'ProcedureCode_1':'ProcedureCode_6'].notna().sum(axis=1)
)

In [115]:
## Shift the chronic-condition flags down by one so they become binary variables

# Chronic-disease indicator columns
diseases = [
    'Chronic_Alzheimer',
    'Chronic_Heartfailure',
    'Chronic_KidneyDisease',
    'Chronic_Cancer',
    'Chronic_ObstrPulmonary',
    'Chronic_Depression',
    'Chronic_Diabetes',
    'Chronic_IschemicHeart',
    'Chronic_Osteoporasis',
    'Chronic_rheumatoidarthritis',
    'Chronic_stroke',
]

# NOTE(review): presumably the raw flags are coded 1/2, giving 0/1 after the shift --
# confirm which raw value means "has the condition" before interpreting the result.
# This cell is not idempotent: running it again subtracts one a second time.
all_info[diseases] = all_info[diseases] - 1

In [116]:
# Row-wise total of the shifted chronic-condition flags, taken over every column
# positioned from Chronic_Alzheimer through Chronic_stroke
all_info['ChronicDisease_Num'] = (
    all_info.loc[:, 'Chronic_Alzheimer':'Chronic_stroke'].sum(axis='columns')
)

In [117]:
# Recode the 'Y' marker as 1, then coerce the column to a numeric dtype
# (presumably the remaining values are already numeric or numeric strings -- verify)
renal_recoded = all_info['RenalDisease'].replace({'Y': 1})
all_info['RenalDisease'] = pd.to_numeric(renal_recoded)

In [118]:
# Encode the provider fraud label as a number: 'Yes' -> 1, 'No' -> 0
all_info['Fraud'] = pd.to_numeric(
    all_info['Fraud'].replace(to_replace=['Yes', 'No'], value=[1, 0])
)

In [123]:
# True when the admit diagnosis code equals the diagnosis group code or any of
# the ten recorded diagnosis codes on the same claim (missing values never match)
all_info['AdmitDiagnosInDiagnos'] = (
    all_info[[
        'DiagnosisGroupCode',
        'DiagnosisCode_1',
        'DiagnosisCode_2',
        'DiagnosisCode_3',
        'DiagnosisCode_4',
        'DiagnosisCode_5',
        'DiagnosisCode_6',
        'DiagnosisCode_7',
        'DiagnosisCode_8',
        'DiagnosisCode_9',
        'DiagnosisCode_10',
    ]]
    .eq(all_info['AdmitDiagnosisCode'], axis=0)  # compare each column row-by-row
    .any(axis=1)
)

In [124]:
# Print the column names, non-null counts and dtypes of the merged frame
all_info.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 558211 entries, 0 to 558210
Data columns (total 64 columns):
 #   Column                            Non-Null Count   Dtype         
---  ------                            --------------   -----         
 0   BID                               558211 non-null  object        
 1   CID                               558211 non-null  object        
 2   StartDt                           558211 non-null  datetime64[ns]
 3   EndDt                             558211 non-null  datetime64[ns]
 4   PID                               558211 non-null  object        
 5   AmtReimbursed                     558211 non-null  int64         
 6   AttendingPhysician                556703 non-null  object        
 7   OperatingPhysician                114447 non-null  object        
 8   OtherPhysician                    199736 non-null  object        
 9   AdmissionDt                       40474 non-null   datetime64[ns]
 10  AdmitDiagnosisCode              

### Physician Checks 

In [137]:
# Per attending physician:
#   check1 - number of claims with a non-null Fraud label
#   check2 - sum of the 0/1 Fraud labels, i.e. the number of fraud-labelled claims
# The aggregation is named by string because passing the builtin `sum` to
# `agg` is deprecated in recent pandas releases.
check = all_info.groupby(['AttendingPhysician']).agg(check1=('Fraud', 'count'),
                                                     check2=('Fraud', 'sum'))

In [141]:
# True for physicians whose claim count exceeds their fraud-labelled claim count
check['check3'] = check['check1'].gt(check['check2'])

In [142]:
# Number of physicians for whom check3 is True
check.loc[:, 'check3'].sum()

62969

In [None]:
### Physicians work for a provider; when that provider gets flagged, the physicians move to a new PID

### Random Checks

In [121]:
# Sum of the Fraud labels over claims whose admit diagnosis code matches one of
# the recorded diagnosis codes. Reuses the AdmitDiagnosInDiagnos mask column
# instead of repeating its eleven comparisons.
all_info.loc[all_info['AdmitDiagnosInDiagnos'], 'Fraud'].sum()

8274

In [120]:
# Number of inpatient claims among those whose admit diagnosis code matches one
# of the recorded diagnosis codes. Reuses the AdmitDiagnosInDiagnos mask column
# instead of repeating its eleven comparisons.
all_info.loc[all_info['AdmitDiagnosInDiagnos'], 'Is_inpatient'].sum()

9835

In [122]:
# Number of claims whose admit diagnosis code matches one of the recorded
# diagnosis codes. Counting the True values of the AdmitDiagnosInDiagnos mask
# gives the same number as filtering the frame and taking its length.
int(all_info['AdmitDiagnosInDiagnos'].sum())

17082

In [130]:
# Sum of the Fraud labels over the claims attended by physician PHY330576
all_info.loc[all_info['AttendingPhysician'].eq('PHY330576'), 'Fraud'].sum()

2534