In [11]:
import numpy as np
import pandas as pd 
import matplotlib as plt 
%matplotlib inline 

In [12]:
## Load the four raw CSV tables, in order: beneficiaries, inpatient, outpatient, providers
benef, inpatient, outpatient, provider = (
    pd.read_csv(csv_path)
    for csv_path in (
        './Datathon2020data/beneficiary.csv',
        './Datathon2020data/inpatients.csv',
        './Datathon2020data/outpatients.csv',
        './Datathon2020data/providers.csv',
    )
)

In [13]:
## TODO: check StartDt and EndDt consistency

## Stack the inpatient and outpatient tables into one frame, tagging each row with its source.
## Note: the tag column is written onto the two raw frames in place.
inpatient['patient_type'] = 'inpatient'
outpatient['patient_type'] = 'outpatient'

# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it the
# row labels of both inputs are carried over unchanged and can repeat across them.
all_patients = pd.concat([inpatient, outpatient], ignore_index=True)

In [14]:
# Attach the provider-level Fraud label to every row via PID.
# validate='many_to_one' makes pandas raise MergeError if a PID occurs more than once
# in `provider`, instead of silently duplicating rows of `all_patients`.
# The default inner join is kept: rows whose PID is absent from `provider` are dropped.
all_patients_labeled = pd.merge(all_patients, provider, on='PID', validate='many_to_one')

In [15]:
# Attach the beneficiary attributes to every row via BID.
# validate='many_to_one' makes pandas raise MergeError if a BID occurs more than once
# in `benef`, instead of silently duplicating rows of `all_patients_labeled`.
# The default inner join is kept: rows whose BID is absent from `benef` are dropped.
all_info = pd.merge(all_patients_labeled, benef, on='BID', validate='many_to_one')

In [20]:
# Parse the date-like columns into datetime values (column by column, in place on all_info)
date_cols = ['StartDt', 'EndDt', 'DOB', 'DOD', 'AdmissionDt']
all_info[date_cols] = all_info[date_cols].apply(pd.to_datetime)

In [26]:
# Fixed reference date (not the wall clock), so values derived from it are reproducible
today = pd.Timestamp('2020-10-15')

In [27]:
# Echo the reference date to confirm how it was parsed
today

Timestamp('2020-10-15 00:00:00')

In [41]:
# Date-derived features.
# The vectorised .dt.days accessor replaces a per-row Python lambda; missing dates (NaT)
# still come out as NaN. "Years" are approximated as days / 365 (leap days are ignored).
all_info['NumOfClaimDays'] = (all_info['EndDt'] - all_info['StartDt']).dt.days
# NOTE(review): Age is measured at the fixed `today` date for every row, including
# beneficiaries that have a DOD — confirm this is intended rather than age at StartDt/DOD.
all_info['Age'] = (today - all_info['DOB']).dt.days / 365
all_info['Death_age'] = (all_info['DOD'] - all_info['DOB']).dt.days / 365

In [42]:
# Inspect column dtypes and non-null counts of the merged frame
all_info.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 558211 entries, 0 to 558210
Data columns (total 59 columns):
 #   Column                            Non-Null Count   Dtype         
---  ------                            --------------   -----         
 0   BID                               558211 non-null  object        
 1   CID                               558211 non-null  object        
 2   StartDt                           558211 non-null  datetime64[ns]
 3   EndDt                             558211 non-null  datetime64[ns]
 4   PID                               558211 non-null  object        
 5   AmtReimbursed                     558211 non-null  int64         
 6   AttendingPhysician                556703 non-null  object        
 7   OperatingPhysician                114447 non-null  object        
 8   OtherPhysician                    199736 non-null  object        
 9   AdmissionDt                       40474 non-null   datetime64[ns]
 10  AdmitDiagnosisCode              

In [43]:
# Sanity-check the two derived age columns; Death_age is only populated where DOD is present
all_info[['Age', 'Death_age']].describe()

Unnamed: 0,Age,Death_age
count,558211.0,4131.0
mean,84.663017,74.732221
std,13.024335,11.100341
min,36.89863,36.106849
25%,78.926027,68.380822
50%,85.517808,75.219178
75%,93.356164,82.515068
max,111.863014,100.320548


In [49]:
# Fraud label balance counted per row of the merged frame (not per provider)
all_info['Fraud'].value_counts()

No     345415
Yes    212796
Name: Fraud, dtype: int64

In [50]:
# Persist the fully merged frame; index=False keeps the row index out of the CSV
all_info.to_csv(path_or_buf="./Datathon2020data/full_data.csv", index=False)

### Things to do:

#### cross link the files 
* merge inpatient, outpatient ----- **done**
* merge above with providers by PID -----  **done**
* merge benef with above using BID ----- **done**

#### converting vars: (maybe)
* benef:
    * renal disease into binary 
    * chronic_x into binary
 
* provider:
    * fraud into binary


#### Look into Fraud data - see if there are patterns

#### Clustering...


--

# random checks 

In [4]:
# Summarise the non-numeric beneficiary columns (count / unique / top / freq)
benef.describe(exclude=np.number)

Unnamed: 0,BID,DOB,DOD,RenalDisease
count,138556,138556,1421,138556
unique,138556,900,11,2
top,BENE57416,1939-10-01,2009-12-01,0
freq,1,540,182,118978


In [15]:
# List the beneficiary table's column names
benef.columns

Index(['BID', 'DOB', 'DOD', 'Gender', 'Race', 'RenalDisease', 'State',
       'County', 'NumOfMonths_PartACov', 'NumOfMonths_PartBCov',
       'Chronic_Alzheimer', 'Chronic_Heartfailure', 'Chronic_KidneyDisease',
       'Chronic_Cancer', 'Chronic_ObstrPulmonary', 'Chronic_Depression',
       'Chronic_Diabetes', 'Chronic_IschemicHeart', 'Chronic_Osteoporasis',
       'Chronic_rheumatoidarthritis', 'Chronic_stroke',
       'InpatientAnnualReimbursementAmt', 'InpatientAnnualDeductibleAmt',
       'OutpatientAnnualReimbursementAmt', 'OutpatientAnnualDeductibleAmt'],
      dtype='object')

In [10]:
# Summarise the non-numeric inpatient columns (count / unique / top / freq)
inpatient.describe(exclude=np.number)

Unnamed: 0,BID,CID,StartDt,EndDt,PID,AttendingPhysician,OperatingPhysician,OtherPhysician,AdmissionDt,AdmitDiagnosisCode,...,DiagnosisCode_1,DiagnosisCode_2,DiagnosisCode_3,DiagnosisCode_4,DiagnosisCode_5,DiagnosisCode_6,DiagnosisCode_7,DiagnosisCode_8,DiagnosisCode_9,DiagnosisCode_10
count,40474,40474,40474,40474,40474,40362,23830,4690,40474,40474,...,40474,40248,39798,38940,37580,35636,33216,30532,26977,3927
unique,31289,40474,398,365,2092,11604,8287,2877,398,1928,...,2254,2439,2427,2441,2374,2358,2310,2243,2094,952
top,BENE134170,CLM42571,2009-02-10,2009-02-11,PRV52019,PHY422134,PHY429430,PHY416093,2009-02-10,78650,...,486,4019,4019,4019,4019,4019,4019,4019,4019,4019
freq,8,1,145,153,516,386,225,81,144,1731,...,1499,2484,2282,2100,1889,1620,1402,1200,965,128


In [9]:
# Summarise the non-numeric outpatient columns (count / unique / top / freq)
outpatient.describe(exclude=np.number)

Unnamed: 0,BID,CID,StartDt,EndDt,PID,AttendingPhysician,OperatingPhysician,OtherPhysician,DiagnosisCode_1,DiagnosisCode_2,DiagnosisCode_3,DiagnosisCode_4,DiagnosisCode_5,DiagnosisCode_6,DiagnosisCode_7,DiagnosisCode_8,DiagnosisCode_9,DiagnosisCode_10,AdmitDiagnosisCode
count,517737,517737,517737,517737,517737,516341,90617,195046,507284,322357,203257,125596,74344,48756,32961,22912,14838,1083,105425
unique,133980,517737,385,366,5012,74109,28532,44388,10354,5056,4448,3925,3412,2968,2635,2260,1894,495,3715
top,BENE118316,CLM393639,2009-03-03,2009-03-03,PRV51459,PHY330576,PHY330576,PHY412132,4019,4019,4019,4019,4019,4019,4019,4019,4019,4019,V7612
freq,29,1,1574,1563,8240,2534,424,1247,13803,19894,12126,7088,4116,2550,1612,1057,616,41,4074


In [11]:
# Summarise the provider table; its columns (PID, Fraud) are reported as count / unique / top / freq
provider.describe()

Unnamed: 0,PID,Fraud
count,5410,5410
unique,5410,2
top,PRV56933,No
freq,1,4904


In [13]:
# Look up the Fraud label of PRV51459, the most frequent PID in the outpatient table
provider[provider['PID']=='PRV51459']

Unnamed: 0,PID,Fraud
363,PRV51459,Yes


In [14]:
# Show the outpatient rows whose PID is PRV51459 (pandas truncates the display of long frames)
outpatient[outpatient['PID']=='PRV51459']

Unnamed: 0,BID,CID,StartDt,EndDt,PID,AmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,DiagnosisCode_1,...,DiagnosisCode_9,DiagnosisCode_10,ProcedureCode_1,ProcedureCode_2,ProcedureCode_3,ProcedureCode_4,ProcedureCode_5,ProcedureCode_6,DeductibleAmt,AdmitDiagnosisCode
96,BENE11023,CLM516020,2009-08-09,2009-08-09,PRV51459,60,PHY357120,,,4414,...,,,,,,,,,0,4414
97,BENE11023,CLM591356,2009-09-21,2009-09-21,PRV51459,100,PHY338032,,PHY338032,2720,...,,,,,,,,,0,
98,BENE11023,CLM613474,2009-10-04,2009-10-04,PRV51459,10,PHY327046,,PHY341578,4011,...,,,,,,,,,0,
99,BENE11023,CLM740365,2009-12-22,2009-12-22,PRV51459,40,PHY327046,,PHY341578,2722,...,,,,,,,,,0,
100,BENE11023,CLM744555,2009-12-25,2009-12-25,PRV51459,70,PHY314027,,PHY337425,4019,...,,,,,,,,,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517265,BENE159078,CLM714286,2009-12-04,2009-12-07,PRV51459,30,PHY341578,,,5600,...,,,,,,,,,0,
517460,BENE159125,CLM697465,2009-11-24,2009-11-24,PRV51459,50,PHY341578,PHY341578,PHY341578,2940,...,,,,,,,,,0,3310
517598,BENE159166,CLM345904,2009-05-07,2009-05-07,PRV51459,60,PHY327046,,PHY341578,4660,...,,,,,,,,,100,
517610,BENE159169,CLM201892,2009-02-18,2009-02-18,PRV51459,50,PHY423534,,,3320,...,,,,,,,,,0,
