## Setup and Data Import

In [1]:
import numpy as np

import pandas as pd
pd.set_option('display.max_columns', None)

from joblib import dump, load

In [2]:
# Read the four training CSV extracts in a single pass; the order of the paths
# matches the order of the names on the left-hand side.
beneficiary, inpatient, outpatient, target = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/Train_Beneficiarydata-1542865627584.csv',
        '../data/Train_Inpatientdata-1542865627584.csv',
        '../data/Train_Outpatientdata-1542865627584.csv',
        '../data/Train-1542865627584.csv',
    )
)

In [3]:
def explore_df(df):
    """Print a quick structural summary of ``df``.

    Three sections are written to stdout: the frame's shape, the dtype of
    every column, and the percentage of missing values for each column that
    has at least one missing entry (largest percentage first).

    Args:
        df: The pandas DataFrame to summarise.

    Returns:
        None. The summary is printed, not returned.
    """
    print('Shape:', df.shape, '\n')
    print('Columns and dtypes:\n', df.dtypes, '\n')

    # Filter on the unrounded fraction: rounding first would turn a column
    # with only a handful of missing rows into 0 and silently hide it. Such a
    # column is still listed, although it may display as 0.0 after rounding.
    missing_fraction = df.isna().mean()
    percent_missing = (missing_fraction[missing_fraction > 0] * 100).round(2)
    print('Columns with Missingness:\n',
          percent_missing.sort_values(ascending=False))

## Pre-Processing

In [4]:
def date_parser(df, cols):
    """Convert the named columns of ``df`` to pandas datetimes, in place.

    Args:
        df: DataFrame whose columns are overwritten.
        cols: Iterable of column names to run through ``pd.to_datetime``.

    Returns:
        None. ``df`` is modified directly.
    """
    for name in cols:
        parsed = pd.to_datetime(df[name])
        df[name] = parsed

# Convert the date columns of each raw table to datetimes; the table/column
# pairing lives in one place so the three calls stay in step.
for _frame, _date_cols in (
    (beneficiary, ['DOB', 'DOD']),
    (inpatient, ['ClaimStartDt', 'ClaimEndDt', 'AdmissionDt', 'DischargeDt']),
    (outpatient, ['ClaimStartDt', 'ClaimEndDt']),
):
    date_parser(_frame, _date_cols)

In [5]:
def check_obj_dtypes(*dfs):
    """Report the Python types stored in every object-dtype column.

    For each frame passed in, each object column's values are mapped to their
    Python type and the type counts are printed. A column holding more than
    one type is printed between two 40-dash rules so it stands out.

    Args:
        *dfs: Any number of pandas DataFrames to inspect.

    Returns:
        None. Results are printed to stdout.
    """
    for frame in dfs:
        for name in frame.select_dtypes('object').columns.tolist():
            type_counts = frame[name].apply(type).value_counts()

            # Single-type column: plain printout, nothing to flag.
            if len(type_counts) <= 1:
                print(type_counts, '\n')
                continue

            print(f'{"-" * 40}\n', type_counts, f'\n{"-" * 40}\n')

# Columns highlighted between dashed rules contain two Python types because they hold NaNs (floats) alongside their regular values

In [6]:
# Only dummify and consolidate if the individual code columns show no
# relationship with the target variable.

def dummify(*dfs):
    """Collapse procedure and diagnosis code columns to 0/1 presence flags.

    Every column whose name contains ``'Procedure'`` or ``'ClmDiagnosis'`` is
    overwritten in place with an integer flag:

    * procedure columns: missing values become 0, values are cast to ``int``,
      and the flag is 1 when the result is greater than 0, otherwise 0;
    * diagnosis columns: the flag is 1 when the value is neither missing nor
      equal to 0, otherwise 0.

    Both groups end up with an integer dtype, so downstream sums and counts
    behave the same for procedure and diagnosis flags.

    Args:
        *dfs: Any number of pandas DataFrames to modify in place.

    Returns:
        None.
    """
    for df in dfs:
        procedure_cols = df.columns[df.columns.str.contains('Procedure')].to_list()
        diagnosis_cols = df.columns[df.columns.str.contains('ClmDiagnosis')].to_list()

        for col in procedure_cols:
            codes = df[col].fillna(0).astype(int)
            df[col] = codes.gt(0).astype(int)

        for col in diagnosis_cols:
            # Build the flag from boolean masks rather than assigning 1 into
            # the existing column, which would leave it as object dtype.
            present = df[col].notna() & df[col].ne(0)
            df[col] = present.astype(int)

# dummify(inpatient, outpatient)

In [7]:
# Only dummify and consolidate if the individual code columns show no
# relationship with the target variable.

def _count_present(df, cols):
    """Return, per row, how many of ``cols`` hold a non-null, non-zero value."""
    subset = df[cols]
    return (subset.notna() & subset.ne(0)).sum(axis=1)


def consolidate(*dfs):
    """Replace per-code and per-physician columns with per-row counts.

    For each frame, three groups of columns are located by substring match on
    the column name (``'Procedure'``, ``'ClmDiagnosis'``, and
    ``'OperatingPhysician'`` / ``'OtherPhysician'``). Each group is reduced to
    one count column and the source columns are then dropped, all in place:

    * ``NumProcedureCodes``: populated procedure columns per row
    * ``NumDiagnosisCodes``: populated diagnosis columns per row
    * ``NumDoctors``: populated operating/other physician columns, plus 1

    A cell counts as populated when it is neither missing nor equal to 0. For
    columns already reduced to 0/1 flags this equals their sum; for raw
    values (e.g. physician ID strings) it counts the entries instead of
    trying to add them together.

    Call once per frame: the ``'Procedure'`` substring match would also pick
    up the ``NumProcedureCodes`` column created here.

    Args:
        *dfs: Any number of pandas DataFrames to modify in place.

    Returns:
        None.
    """
    for df in dfs:
        procedure_cols = df.columns[df.columns.str.contains('Procedure')].to_list()
        diagnosis_cols = df.columns[df.columns.str.contains('ClmDiagnosis')].to_list()
        physician_cols = df.columns[df.columns.str.contains('OperatingPhysician')
                                   | df.columns.str.contains('OtherPhysician')].to_list()

        df['NumProcedureCodes'] = _count_present(df, procedure_cols)
        df['NumDiagnosisCodes'] = _count_present(df, diagnosis_cols)
        # +1 includes Attending (assumes every claim has one -- TODO confirm).
        df['NumDoctors'] = _count_present(df, physician_cols) + 1

        df.drop(procedure_cols, axis=1, inplace=True)
        df.drop(diagnosis_cols, axis=1, inplace=True)
        df.drop(physician_cols, axis=1, inplace=True)
        
# consolidate(inpatient, outpatient)

In [8]:
# Recode RenalDiseaseIndicator from its raw string values to integers:
# '0' becomes 0 and 'Y' becomes 1. Any other value is left untouched.
for _raw_flag, _encoded in (('0', 0), ('Y', 1)):
    _matches = beneficiary['RenalDiseaseIndicator'] == _raw_flag
    beneficiary.loc[_matches, 'RenalDiseaseIndicator'] = _encoded

In [9]:
# Pick every beneficiary column whose name mentions one of these attributes
# and store it as a pandas categorical.
cols = [
    name
    for name in beneficiary.columns
    if any(
        token in name
        for token in ('Gender', 'Race', 'RenalDiseaseIndicator',
                      'State', 'County', 'Chronic')
    )
]

beneficiary[cols] = beneficiary[cols].astype('category')

In [10]:
def cols_by_dtype(df):
    """Split the column names of ``df`` into numeric and categorical lists.

    Args:
        df: The pandas DataFrame to inspect.

    Returns:
        A ``(numeric_cols, categorical_cols)`` tuple of lists of column
        names. Numeric means any ``np.number`` dtype; categorical means
        ``object`` or ``category`` dtype. Columns of any other dtype
        (datetimes, for example) appear in neither list.
    """
    def _names_of(dtype_spec):
        return df.select_dtypes(dtype_spec).columns.to_list()

    return _names_of(np.number), _names_of(['object', 'category'])

## Merging

In [11]:
# Tag every claim with its source table before the two are stacked:
# '0' for inpatient rows, '1' for outpatient rows (stored as strings).
for _claims_frame, _flag in ((inpatient, '0'), (outpatient, '1')):
    _claims_frame['IsOutpatient'] = _flag

In [12]:
# Stack inpatient and outpatient claims, then attach beneficiary attributes
# (joined on BeneID) and the target table (joined on Provider). Both joins use
# the default merge type.
claims = (
    pd.concat([inpatient, outpatient])
    .merge(beneficiary, on='BeneID')
    .merge(target, on='Provider')
)

In [13]:
# cols_by_dtype already returns the (numeric, categorical) pair, so unpack a
# single call instead of scanning the merged frame once per element.
numeric_cols, categorical_cols = cols_by_dtype(claims)

## Pickling

In [15]:
# Serialise the merged claims frame with joblib to claims.pkl in the current
# working directory.
dump(value=claims, filename='claims.pkl')

['claims.pkl']