## Setup and Data Import

In [1]:
from datetime import datetime

import numpy as np

import pandas as pd
pd.set_option('display.max_columns', None)

from joblib import dump, load

In [2]:
# Read the four raw training CSVs into DataFrames: beneficiary records,
# inpatient claims, outpatient claims, and the per-provider target labels.
beneficiary = pd.read_csv('../data/Train_Beneficiarydata-1542865627584.csv')
inpatient = pd.read_csv('../data/Train_Inpatientdata-1542865627584.csv')
outpatient = pd.read_csv('../data/Train_Outpatientdata-1542865627584.csv')
target = pd.read_csv('../data/Train-1542865627584.csv')

In [3]:
def explore_df(df):
    """Print a quick profile of ``df``.

    Three sections are written to stdout: the frame's shape, the dtype of
    every column, and the percentage of missing values for each column that
    has any, sorted from most to least missing.

    Args:
        df: DataFrame to summarise.

    Returns:
        None; the summary is only printed.
    """
    print('Shape:', df.shape, '\n')
    print('Columns and dtypes:\n', df.dtypes, '\n')

    # Fraction of NaNs per column, rounded to 4 decimals, then scaled to percent.
    missing_pct = df.isna().mean().round(4) * 100
    has_missing = missing_pct > 0.00
    print('Columns with Missingness:\n',
          missing_pct[has_missing].sort_values(ascending=False))

In [4]:
# Profile the beneficiary table: shape, dtypes, and per-column missingness.
explore_df(beneficiary)

Shape: (138556, 25) 

Columns and dtypes:
 BeneID                             object
DOB                                object
DOD                                object
Gender                              int64
Race                                int64
RenalDiseaseIndicator              object
State                               int64
County                              int64
NoOfMonths_PartACov                 int64
NoOfMonths_PartBCov                 int64
ChronicCond_Alzheimer               int64
ChronicCond_Heartfailure            int64
ChronicCond_KidneyDisease           int64
ChronicCond_Cancer                  int64
ChronicCond_ObstrPulmonary          int64
ChronicCond_Depression              int64
ChronicCond_Diabetes                int64
ChronicCond_IschemicHeart           int64
ChronicCond_Osteoporasis            int64
ChronicCond_rheumatoidarthritis     int64
ChronicCond_stroke                  int64
IPAnnualReimbursementAmt            int64
IPAnnualDeductibleAmt            

In [5]:
# Profile the inpatient claims table: shape, dtypes, and per-column missingness.
explore_df(inpatient)

Shape: (40474, 30) 

Columns and dtypes:
 BeneID                     object
ClaimID                    object
ClaimStartDt               object
ClaimEndDt                 object
Provider                   object
InscClaimAmtReimbursed      int64
AttendingPhysician         object
OperatingPhysician         object
OtherPhysician             object
AdmissionDt                object
ClmAdmitDiagnosisCode      object
DeductibleAmtPaid         float64
DischargeDt                object
DiagnosisGroupCode         object
ClmDiagnosisCode_1         object
ClmDiagnosisCode_2         object
ClmDiagnosisCode_3         object
ClmDiagnosisCode_4         object
ClmDiagnosisCode_5         object
ClmDiagnosisCode_6         object
ClmDiagnosisCode_7         object
ClmDiagnosisCode_8         object
ClmDiagnosisCode_9         object
ClmDiagnosisCode_10        object
ClmProcedureCode_1        float64
ClmProcedureCode_2        float64
ClmProcedureCode_3        float64
ClmProcedureCode_4        float64
ClmPro

In [6]:
# Profile the outpatient claims table: shape, dtypes, and per-column missingness.
explore_df(outpatient)

Shape: (517737, 27) 

Columns and dtypes:
 BeneID                     object
ClaimID                    object
ClaimStartDt               object
ClaimEndDt                 object
Provider                   object
InscClaimAmtReimbursed      int64
AttendingPhysician         object
OperatingPhysician         object
OtherPhysician             object
ClmDiagnosisCode_1         object
ClmDiagnosisCode_2         object
ClmDiagnosisCode_3         object
ClmDiagnosisCode_4         object
ClmDiagnosisCode_5         object
ClmDiagnosisCode_6         object
ClmDiagnosisCode_7         object
ClmDiagnosisCode_8         object
ClmDiagnosisCode_9         object
ClmDiagnosisCode_10        object
ClmProcedureCode_1        float64
ClmProcedureCode_2        float64
ClmProcedureCode_3        float64
ClmProcedureCode_4        float64
ClmProcedureCode_5        float64
ClmProcedureCode_6        float64
DeductibleAmtPaid           int64
ClmAdmitDiagnosisCode      object
dtype: object 

Columns with Missingnes

In [7]:
# Profile the target table: shape, dtypes, and per-column missingness.
explore_df(target)

Shape: (5410, 2) 

Columns and dtypes:
 Provider          object
PotentialFraud    object
dtype: object 

Columns with Missingness:
 Series([], dtype: float64)


## Pre-Processing

In [8]:
def date_parser(df, cols):
    """Convert the named columns of ``df`` to datetimes, in place.

    Args:
        df: DataFrame whose columns are overwritten.
        cols: Iterable of column names to pass through ``pd.to_datetime``.

    Returns:
        None; ``df`` is modified directly.
    """
    for name in cols:
        parsed = pd.to_datetime(df[name])
        df[name] = parsed

# Parse the date columns of each table in place.
date_parser(beneficiary, ['DOB', 'DOD'])
date_parser(
    inpatient,
    ['ClaimStartDt', 'ClaimEndDt', 'AdmissionDt', 'DischargeDt'],
)
date_parser(outpatient, ['ClaimStartDt', 'ClaimEndDt'])

In [9]:
def check_obj_dtypes(*dfs):
    """Print the Python types found in every object-dtype column.

    For each object column of each frame, the count of values per Python type
    is printed. Columns holding more than one type are framed by rules of 40
    dashes so they stand out; single-type columns are printed plainly.

    Args:
        *dfs: DataFrames to inspect.

    Returns:
        None; results are only printed.
    """
    for frame in dfs:
        for name in frame.select_dtypes('object').columns.tolist():
            type_counts = frame[name].apply(type).value_counts()

            # Single-type column: plain output, nothing to flag.
            if len(type_counts) <= 1:
                print(type_counts, '\n')
                continue

            print(f'{"-" * 40}\n', type_counts, f'\n{"-" * 40}\n')

# Columns that check_obj_dtypes frames with dashed rules contain two Python
# types because they hold NaNs alongside their regular values

In [10]:
def dummify(*dfs):
    """Replace claim procedure/diagnosis codes with 0/1 presence flags, in place.

    In every frame passed in, columns whose name contains ``'Procedure'`` or
    ``'ClmDiagnosis'`` are overwritten with integer indicators:

    * a procedure column becomes 1 where its value, with missing values
      treated as 0 and truncated to an integer, is greater than 0, else 0;
    * a diagnosis column becomes 1 where a value is present and is not
      equal to 0, else 0.

    Every rewritten column ends up with an integer dtype, so the flags can be
    summed directly afterwards.

    Args:
        *dfs: DataFrames to modify in place.

    Returns:
        None; the frames are modified directly.
    """
    for df in dfs:
        procedure_cols = df.columns[df.columns.str.contains('Procedure')].to_list()
        diagnosis_cols = df.columns[df.columns.str.contains('ClmDiagnosis')].to_list()

        for col in procedure_cols:
            # Comparing first and casting last guarantees a strict 0/1 result.
            df[col] = df[col].fillna(0).astype(int).gt(0).astype(int)

        for col in diagnosis_cols:
            # Build the flag from a boolean mask instead of filling the code
            # column with 0: that avoids mixing ints into a column of strings
            # and replaces the column with a true integer column.
            present = df[col].notna() & ~df[col].isin([0])
            df[col] = present.astype(int)

# Turn the procedure/diagnosis code columns of both claim tables into 0/1 flags (in place).
dummify(inpatient, outpatient)

In [11]:
def consolidate(*dfs):
    """Collapse per-slot code columns into two row-wise count columns, in place.

    For each frame, columns whose name contains ``'Procedure'`` are summed
    row-wise into ``NumProcedureCodes`` and columns whose name contains
    ``'ClmDiagnosis'`` into ``NumDiagnosisCodes``; the summed source columns
    are then dropped.

    The function is safe to re-run: a count column is never treated as one of
    its own sources, and when a frame already has the count column and no
    source columns remain, that count is left untouched.

    Args:
        *dfs: DataFrames to modify in place.

    Returns:
        None; the frames are modified directly.
    """
    targets = (
        ('Procedure', 'NumProcedureCodes'),
        ('ClmDiagnosis', 'NumDiagnosisCodes'),
    )

    for df in dfs:
        for pattern, count_col in targets:
            matched = df.columns[df.columns.str.contains(pattern)]
            # 'Procedure' also matches 'NumProcedureCodes'; exclude the output
            # column so a re-run cannot sum it into itself and then drop it.
            source_cols = [col for col in matched if col != count_col]

            # Already consolidated: recomputing from zero source columns would
            # overwrite the existing counts with 0.
            if not source_cols and count_col in df.columns:
                continue

            df[count_col] = df[source_cols].sum(axis=1)
            # In-place drop is required: callers rely on their frame changing.
            df.drop(columns=source_cols, inplace=True)
        
# Replace the per-slot code columns of both claim tables with NumProcedureCodes / NumDiagnosisCodes (in place).
consolidate(inpatient, outpatient)

In [12]:
# Pick the beneficiary columns whose names contain any of these fragments and
# recast them as string-valued categoricals.
cols = [
    name for name in beneficiary.columns
    if any(fragment in name for fragment in (
        'Gender', 'Race', 'RenalDiseaseIndicator',
        'State', 'County', 'Chronic'))
]

beneficiary[cols] = beneficiary[cols].apply(
    lambda series: series.astype('str').astype('category'))

In [13]:
def col_types(df):
    """Split the column names of ``df`` by dtype family.

    Args:
        df: DataFrame to inspect.

    Returns:
        A ``(numeric_cols, categorical_cols)`` tuple of lists of column names:
        the first holds columns with a numeric dtype, the second holds columns
        with ``object`` or ``category`` dtype. Columns of any other dtype
        appear in neither list.
    """
    numeric_cols = df.select_dtypes(np.number).columns.to_list()
    categorical_cols = df.select_dtypes(['object', 'category']).columns.to_list()
    return numeric_cols, categorical_cols

# Each variable below is a list of column names. col_types returns the
# (numeric, categorical) pair, so unpack one call per frame instead of
# scanning every frame twice.
beneficiary_num_cols, beneficiary_cat_cols = col_types(beneficiary)
inpatient_num_cols, inpatient_cat_cols = col_types(inpatient)
outpatient_num_cols, outpatient_cat_cols = col_types(outpatient)

In [14]:
# Show the numeric and categorical column names found in the beneficiary table.
print(beneficiary_num_cols, '\n')
print(beneficiary_cat_cols)

['NoOfMonths_PartACov', 'NoOfMonths_PartBCov', 'IPAnnualReimbursementAmt', 'IPAnnualDeductibleAmt', 'OPAnnualReimbursementAmt', 'OPAnnualDeductibleAmt'] 

['BeneID', 'Gender', 'Race', 'RenalDiseaseIndicator', 'State', 'County', 'ChronicCond_Alzheimer', 'ChronicCond_Heartfailure', 'ChronicCond_KidneyDisease', 'ChronicCond_Cancer', 'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression', 'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart', 'ChronicCond_Osteoporasis', 'ChronicCond_rheumatoidarthritis', 'ChronicCond_stroke']


In [15]:
# Show the numeric and categorical column names found in the inpatient table.
print(inpatient_num_cols, '\n')
print(inpatient_cat_cols)

['InscClaimAmtReimbursed', 'DeductibleAmtPaid', 'NumProcedureCodes', 'NumDiagnosisCodes'] 

['BeneID', 'ClaimID', 'Provider', 'AttendingPhysician', 'OperatingPhysician', 'OtherPhysician', 'ClmAdmitDiagnosisCode', 'DiagnosisGroupCode']


In [16]:
# Show the numeric and categorical column names found in the outpatient table.
print(outpatient_num_cols, '\n')
print(outpatient_cat_cols)

['InscClaimAmtReimbursed', 'DeductibleAmtPaid', 'NumProcedureCodes', 'NumDiagnosisCodes'] 

['BeneID', 'ClaimID', 'Provider', 'AttendingPhysician', 'OperatingPhysician', 'OtherPhysician', 'ClmAdmitDiagnosisCode']


## Merging

In [17]:
# Tag every claim row with the table it came from. The flag is stored as the
# strings '0' / '1', not as integers.
# NOTE(review): presumably this keeps the claim type identifiable after the two
# tables are combined — the merge itself is not visible here; confirm.
inpatient['IsOutpatient'] = '0'
outpatient['IsOutpatient'] = '1'

## Pickling