## Setup and Data Import

In [1]:
from datetime import datetime

import numpy as np

import pandas as pd
# Lift pandas' display limits: with both options set to None, displayed
# frames and series are rendered in full instead of being truncated.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [2]:
# Read the four training CSVs in order; the names on the left line up
# positionally with the paths listed below.
beneficiary, inpatient, outpatient, target = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/Train_Beneficiarydata-1542865627584.csv',
        '../data/Train_Inpatientdata-1542865627584.csv',
        '../data/Train_Outpatientdata-1542865627584.csv',
        '../data/Train-1542865627584.csv',
    )
)

In [3]:
def explore_df(df):
    """Print a quick structural summary of a DataFrame.

    Three sections are written to stdout: the frame's shape, the dtype of
    every column, and the columns that contain at least one null value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It is only read, never modified.

    Returns
    -------
    None
    """
    # Scan the frame for nulls once and reuse the result as both the values
    # and the boolean mask below.
    has_null = df.isnull().any()

    print('Shape:', df.shape, '\n')
    print('Columns/Dtypes:\n', df.dtypes, '\n')
    # Indexing the boolean Series by itself keeps only the True entries,
    # i.e. the columns that have missing values.
    print('Missingness:\n', has_null[has_null], '\n')

In [4]:
# Print shape, per-column dtypes and null-containing columns for beneficiary.
explore_df(beneficiary)

Shape: (138556, 25) 

Columns/Dtypes:
 BeneID                             object
DOB                                object
DOD                                object
Gender                              int64
Race                                int64
RenalDiseaseIndicator              object
State                               int64
County                              int64
NoOfMonths_PartACov                 int64
NoOfMonths_PartBCov                 int64
ChronicCond_Alzheimer               int64
ChronicCond_Heartfailure            int64
ChronicCond_KidneyDisease           int64
ChronicCond_Cancer                  int64
ChronicCond_ObstrPulmonary          int64
ChronicCond_Depression              int64
ChronicCond_Diabetes                int64
ChronicCond_IschemicHeart           int64
ChronicCond_Osteoporasis            int64
ChronicCond_rheumatoidarthritis     int64
ChronicCond_stroke                  int64
IPAnnualReimbursementAmt            int64
IPAnnualDeductibleAmt               i

In [5]:
# Print shape, per-column dtypes and null-containing columns for inpatient.
explore_df(inpatient)

Shape: (40474, 30) 

Columns/Dtypes:
 BeneID                     object
ClaimID                    object
ClaimStartDt               object
ClaimEndDt                 object
Provider                   object
InscClaimAmtReimbursed      int64
AttendingPhysician         object
OperatingPhysician         object
OtherPhysician             object
AdmissionDt                object
ClmAdmitDiagnosisCode      object
DeductibleAmtPaid         float64
DischargeDt                object
DiagnosisGroupCode         object
ClmDiagnosisCode_1         object
ClmDiagnosisCode_2         object
ClmDiagnosisCode_3         object
ClmDiagnosisCode_4         object
ClmDiagnosisCode_5         object
ClmDiagnosisCode_6         object
ClmDiagnosisCode_7         object
ClmDiagnosisCode_8         object
ClmDiagnosisCode_9         object
ClmDiagnosisCode_10        object
ClmProcedureCode_1        float64
ClmProcedureCode_2        float64
ClmProcedureCode_3        float64
ClmProcedureCode_4        float64
ClmProcedu

In [6]:
# Print shape, per-column dtypes and null-containing columns for outpatient.
explore_df(outpatient)

Shape: (517737, 27) 

Columns/Dtypes:
 BeneID                     object
ClaimID                    object
ClaimStartDt               object
ClaimEndDt                 object
Provider                   object
InscClaimAmtReimbursed      int64
AttendingPhysician         object
OperatingPhysician         object
OtherPhysician             object
ClmDiagnosisCode_1         object
ClmDiagnosisCode_2         object
ClmDiagnosisCode_3         object
ClmDiagnosisCode_4         object
ClmDiagnosisCode_5         object
ClmDiagnosisCode_6         object
ClmDiagnosisCode_7         object
ClmDiagnosisCode_8         object
ClmDiagnosisCode_9         object
ClmDiagnosisCode_10        object
ClmProcedureCode_1        float64
ClmProcedureCode_2        float64
ClmProcedureCode_3        float64
ClmProcedureCode_4        float64
ClmProcedureCode_5        float64
ClmProcedureCode_6        float64
DeductibleAmtPaid           int64
ClmAdmitDiagnosisCode      object
dtype: object 

Missingness:
 AttendingPhys

In [7]:
# Print shape, per-column dtypes and null-containing columns for target.
explore_df(target)

Shape: (5410, 2) 

Columns/Dtypes:
 Provider          object
PotentialFraud    object
dtype: object 

Missingness:
 Series([], dtype: bool) 



## Pre-Processing

### Column Types

In [8]:
def date_parser(df, cols):
    """Convert the named columns of ``df`` to datetimes, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are overwritten. Nothing is returned.
    cols : iterable of str
        Labels of the columns to pass through ``pd.to_datetime``.
    """
    for date_col in cols:
        parsed = pd.to_datetime(df[date_col])
        df[date_col] = parsed

# Parse the date columns of each table in place; every entry pairs a frame
# with the labels of its date columns.
for frame, date_cols in (
    (beneficiary, ['DOB', 'DOD']),
    (inpatient, ['ClaimStartDt', 'ClaimEndDt', 'AdmissionDt', 'DischargeDt']),
    (outpatient, ['ClaimStartDt', 'ClaimEndDt']),
):
    date_parser(frame, date_cols)

In [9]:
def check_obj_dtypes(*dfs):
    """Print, per object-dtype column, how many values of each Python type it holds.

    For every frame passed in, each column selected by
    ``select_dtypes('object')`` is reduced to a count of the Python types of
    its values. A column holding more than one type is printed between two
    40-dash rules so it stands out; any other column is printed plainly.

    Parameters
    ----------
    *dfs : pandas.DataFrame
        Frames to inspect. They are only read.
    """
    rule = '-' * 40

    for frame in dfs:
        for name in frame.select_dtypes('object').columns.tolist():
            type_counts = frame[name].apply(type).value_counts()

            if len(type_counts) <= 1:
                print(type_counts, '\n')
                continue

            # Mixed-type column: frame it with dashed rules.
            print(f'{rule}\n', type_counts, f'\n{rule}\n')

# Columns that check_obj_dtypes frames with dashed rules hold two Python types because they contain NaNs

In [10]:
def convert_cat_dtype(*dfs, patterns=(
        'Procedure', 'Chronic', 'Gender', 'Race',
        'RenalDiseaseIndicator', 'State', 'County')):
    """Cast name-matched columns of each frame to ``category``, in place.

    Parameters
    ----------
    *dfs : pandas.DataFrame
        Frames to update. Matching columns are overwritten with their
        categorical version; all other columns are left as they are.
    patterns : iterable of str, keyword-only
        Regular-expression fragments. A column is converted when its name
        contains a match for any of them. An empty iterable converts
        nothing.

    Returns
    -------
    None
    """
    patterns = tuple(patterns)
    if not patterns:
        # An empty alternation would match every column name.
        return

    # A single alternation regex scans the column index once per frame.
    name_regex = '|'.join(patterns)

    for df in dfs:
        cols = df.columns[df.columns.str.contains(name_regex)].to_list()
        if not cols:
            # No column of this frame matches; leave it untouched.
            continue

        # astype converts each selected column separately, so every column
        # gets its own set of categories.
        df[cols] = df[cols].astype('category')

# Convert the name-matched columns of all three tables to category, in place.
convert_cat_dtype(beneficiary, inpatient, outpatient)

In [11]:
# Show the beneficiary column dtypes after the date and category conversions.
beneficiary.dtypes

BeneID                                     object
DOB                                datetime64[ns]
DOD                                datetime64[ns]
Gender                                   category
Race                                     category
RenalDiseaseIndicator                    category
State                                    category
County                                   category
NoOfMonths_PartACov                         int64
NoOfMonths_PartBCov                         int64
ChronicCond_Alzheimer                    category
ChronicCond_Heartfailure                 category
ChronicCond_KidneyDisease                category
ChronicCond_Cancer                       category
ChronicCond_ObstrPulmonary               category
ChronicCond_Depression                   category
ChronicCond_Diabetes                     category
ChronicCond_IschemicHeart                category
ChronicCond_Osteoporasis                 category
ChronicCond_rheumatoidarthritis          category


In [12]:
def col_types(df):
    """Split a frame's column labels into numeric and categorical groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is only read.

    Returns
    -------
    tuple of (list, list)
        ``(numeric_cols, categorical_cols)``: labels of the columns selected
        by ``np.number`` and by ``['object', 'category']`` respectively, in
        frame order. Columns matching neither selector (for instance
        datetime columns) appear in neither list.
    """
    selectors = (np.number, ['object', 'category'])
    numeric_cols, categorical_cols = (
        df.select_dtypes(selector).columns.to_list()
        for selector in selectors
    )
    return numeric_cols, categorical_cols

In [13]:
# One col_types call per table; unpack its (numeric, categorical) label lists.
beneficiary_num_cols, beneficiary_cat_cols = col_types(beneficiary)

inpatient_num_cols, inpatient_cat_cols = col_types(inpatient)

outpatient_num_cols, outpatient_cat_cols = col_types(outpatient)

In [14]:
# Labels of every beneficiary column whose name contains 'Chronic'.
chronic_conds = beneficiary.columns[
    beneficiary.columns.str.contains('Chronic')
].to_list()

# Print the value distribution of each of those columns in turn.
for col in chronic_conds:
    print(beneficiary[col].value_counts())

2    92530
1    46026
Name: ChronicCond_Alzheimer, dtype: int64
2    70154
1    68402
Name: ChronicCond_Heartfailure, dtype: int64
2    95277
1    43279
Name: ChronicCond_KidneyDisease, dtype: int64
2    121935
1     16621
Name: ChronicCond_Cancer, dtype: int64
2    105697
1     32859
Name: ChronicCond_ObstrPulmonary, dtype: int64
2    89296
1    49260
Name: ChronicCond_Depression, dtype: int64
1    83391
2    55165
Name: ChronicCond_Diabetes, dtype: int64
1    93644
2    44912
Name: ChronicCond_IschemicHeart, dtype: int64
2    100497
1     38059
Name: ChronicCond_Osteoporasis, dtype: int64
2    102972
1     35584
Name: ChronicCond_rheumatoidarthritis, dtype: int64
2    127602
1     10954
Name: ChronicCond_stroke, dtype: int64


### Feature Engineering

In [15]:
# Tag every claim row with its source table: '0' for inpatient claims,
# '1' for outpatient claims.
# NOTE(review): the flag is stored as a string, not an int or bool; confirm
# that downstream code expects string values.
inpatient['IsOutpatient'] = '0'
outpatient['IsOutpatient'] = '1'