## Setup and Data Import

In [1]:
import sys
sys.path.insert(0, '..')

from joblib import load

import Functions as fxns
from Functions import np, pd

# Lift pandas' display limits: a value of None means "no truncation", so
# every column and every row is shown when a frame is rendered.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

from datetime import timedelta

In [2]:
# !python ../Preprocessing.py

In [3]:
# Load the claims object from a joblib pickle located one directory up.
# NOTE(review): joblib.load unpickles arbitrary Python objects, so only load
# files produced by a trusted step (presumably ../Preprocessing.py - confirm).
claims = load('../claims.pkl')

## Claims DF

### New Columns

In [4]:
# Age in completed years on the claim start date.
# Dividing the day difference by a fixed 365-day year ignores leap days and
# overstates the age by one in the weeks before a birthday for older
# beneficiaries, so the age is computed from calendar fields instead:
# year difference, minus one if this year's birthday has not happened yet.
service_date, birth_date = claims.ClaimStartDt, claims.DOB
birthday_pending = (
    (service_date.dt.month < birth_date.dt.month)
    | ((service_date.dt.month == birth_date.dt.month)
       & (service_date.dt.day < birth_date.dt.day)))
# The final astype(int) still raises if either date is missing, as before.
claims['AgeAtService'] = (
    service_date.dt.year
    - birth_date.dt.year
    - birthday_pending.astype(int)).astype(int)

# True where a date of death is recorded for the claim's row.
claims['HasDied'] = claims.DOD.notna()

# Columns whose name mentions a physician. The two flags created below also
# contain 'Physician' in their names, so they are excluded explicitly;
# otherwise re-running this cell would feed the flags back into themselves
# (their notna() is always True) and corrupt both results.
derived_physician_flags = ('HasAnyPhysician', 'HasAllPhysicians')
physician_cols = [
    col for col in claims.columns
    if 'Physician' in str(col) and col not in derived_physician_flags]

# Row-wise presence flags: at least one / every physician field is non-null.
physician_recorded = claims[physician_cols].notna()
claims['HasAnyPhysician'] = physician_recorded.any(axis=1)
claims['HasAllPhysicians'] = physician_recorded.all(axis=1)

# Durations are inclusive day counts: the "+ 1" makes a span that starts and
# ends on the same date count as one day.
claim_span = claims.ClaimEndDt - claims.ClaimStartDt
claims['ClaimDuration'] = claim_span.dt.days + 1

stay_span = claims.DischargeDt - claims.AdmissionDt
claims['IPDuration'] = stay_span.dt.days + 1

# Subsets taken at this point carry only the columns defined so far.
is_outpatient = claims.IsOutpatient == 1
is_inpatient = claims.IsOutpatient == 0
outpatient_claims = claims.loc[is_outpatient]
inpatient_claims = claims.loc[is_inpatient]

# Each Series below is indexed by its subset only; assigning it to `claims`
# aligns on the index, so rows outside the subset receive NaN.
ip_span = inpatient_claims.ClaimEndDt - inpatient_claims.ClaimStartDt
claims['IPClaimDuration'] = ip_span.dt.days + 1

op_span = outpatient_claims.ClaimEndDt - outpatient_claims.ClaimStartDt
claims['OPClaimDuration'] = op_span.dt.days + 1

# ClaimCost is the sum of the reimbursed amount and the deductible paid.
reimbursed = claims.InscClaimAmtReimbursed
claims['ClaimCost'] = reimbursed + claims.DeductibleAmtPaid

# Reimbursed share of ClaimCost as a percentage, rounded to 0 decimals.
claims['PercInsCovered'] = reimbursed.div(claims.ClaimCost).mul(100).round()

# ClaimCost spread over the inclusive claim duration, rounded to 0 decimals.
claims['DailyClaimCost'] = claims.ClaimCost.div(claims.ClaimDuration).round()

In [5]:
# Names of all claim columns except the 'Provider' key (first occurrence is
# removed; a ValueError is raised if 'Provider' is not a column).
claim_cols = list(claims.columns)
del claim_cols[claim_cols.index('Provider')]

### Variables

In [6]:
# lists:
# The helper is called once and its result indexed, instead of recomputing
# the same grouping three times on the same frame.
dtype_groups     = fxns.cols_by_dtype(claims)
numeric_cols     = dtype_groups[0]
categorical_cols = dtype_groups[1]
date_cols        = dtype_groups[2]

# 'HasAnyPhysician' and 'HasAllPhysicians' are derived flags whose names also
# contain 'Physician'; a plain substring match would mix them in with the
# physician columns they were computed from, so they are excluded here.
derived_physician_flags = ('HasAnyPhysician', 'HasAllPhysicians')
physician_cols   = [
    col for col in claims.columns
    if 'Physician' in str(col) and col not in derived_physician_flags]
chronic_cols     = \
    claims.columns[claims.columns.str.contains('Chronic')].to_list()
diagnosis_cols   = \
    claims.columns[claims.columns.str.contains('Diagnosis')].to_list()
procedure_cols   = \
    claims.columns[claims.columns.str.contains('Procedure')].to_list()

# dataframes:
# Re-taken here so the subsets include every column present on `claims` now.
outpatient_claims = claims.loc[claims.IsOutpatient == 1]
inpatient_claims  = claims.loc[claims.IsOutpatient == 0]

## Providers DF

### Data Import

In [7]:
# First provider-level feature file. Whatever header names the file carries
# are replaced wholesale, so it must contain exactly these four columns in
# this order (pandas raises on a length mismatch).
lucas = pd.read_csv('./data/providers.csv')
lucas.columns = [
    'Provider',
    'DualPatients_Perc',
    'HasNoPhys_Perc',
    'HasTop5AdmtCode',
]

In [8]:
# Second provider-level feature file: drop the 'Unnamed: 0' column
# (presumably an index written out with the CSV - confirm) and rename the
# ratio / unique-count columns to the notebook's naming scheme.
ryan_renames = {
    'Patient_Attphy_Ratio': 'BenesPerAttPhys',
    'Patient_Operphy_Ratio': 'BenesPerOperPhys',
    'Patient_Otherphy_Ratio': 'BenesPerOthPhys',
    'BeneID_Nunique_IP': 'IP_UniqueBenes_Count',
    'BeneID_Nunique_OP': 'OP_UniqueBenes_Count',
    'State_Nunique_IP': 'IP_UniqueSt_Count',
    'State_Nunique_OP': 'OP_UniqueSt_Count',
}
ryan = (
    pd.read_csv('./data/Ryan_providers.csv')
    .drop(columns='Unnamed: 0')
    .rename(columns=ryan_renames)
)

In [9]:
# Inner join of the two provider feature tables. No `on=` is given, so
# pandas joins on every column name the two frames have in common.
# NOTE(review): presumably 'Provider' is the only shared column - confirm.
providers = lucas.merge(ryan)

### Beneficiaries

In [10]:
# Per-provider means over all claims. Each feature is attached by matching on
# the Provider key (Series.map) instead of pasting `.values` in row order:
# positional assignment is only correct when `providers` is sorted exactly
# like the groupby output and holds exactly the same providers, and it
# silently misaligns otherwise. Providers absent from `claims` get NaN.
claims_by_provider = claims.groupby('Provider')
providers['GenderZero_Perc'] = \
    providers['Provider'].map(claims_by_provider.Gender.mean())
providers['HasRenalDisease_Perc'] = \
    providers['Provider'].map(claims_by_provider.RenalDisease.mean())
providers['HasDied_Perc'] = \
    providers['Provider'].map(claims_by_provider.HasDied.mean())
# Despite its name, this column holds the per-provider mean of the
# IsOutpatient indicator, not a 0/1 flag.
providers['IsOutpatient'] = \
    providers['Provider'].map(claims_by_provider.IsOutpatient.mean())

In [11]:
# Per-provider mean of every chronic-condition column, computed separately
# over inpatient and outpatient claims. A mean (not a sum) is used so the
# value is a rate that is comparable between providers of different size.
# NOTE(review): this is a fraction only if the chronic columns are coded
# 0/1 - confirm against the preprocessing step.
cond_by_ip = \
    inpatient_claims.groupby('Provider')[chronic_cols].mean().reset_index()
cond_by_op = \
    outpatient_claims.groupby('Provider')[chronic_cols].mean().reset_index()

def conditions_by_perc(df, cols, pat_type):
    """Attach per-provider condition rates to the module-level ``providers``.

    The previous implementation stored ``df[col].mean()`` - one scalar taken
    over all providers - so every provider received the same constant. The
    per-provider rows of ``df`` are now joined on the Provider key instead.

    Parameters
    ----------
    df : DataFrame
        One row per provider: a 'Provider' column plus one column per entry
        of ``cols`` holding that provider's value.
    cols : list of str
        Columns of ``df`` to attach.
    pat_type : str
        Tag inserted into the new column names, e.g. 'IP' or 'OP'.

    Returns
    -------
    DataFrame
        ``providers`` with one ``{col}_{pat_type}_Perc`` column per entry of
        ``cols``. All rows of ``providers`` are kept; a provider with no row
        in ``df`` gets NaN in the new columns.
    """
    new_names = {col: f'{col}_{pat_type}_Perc' for col in cols}
    percs = df[['Provider', *cols]].rename(columns=new_names)
    # Discard columns left by an earlier run of this cell so that re-running
    # it does not create `_x` / `_y` duplicates.
    base = providers.drop(columns=list(new_names.values()), errors='ignore')
    return base.merge(percs, on='Provider', how='left')

providers = conditions_by_perc(cond_by_ip, chronic_cols, 'IP')
providers = conditions_by_perc(cond_by_op, chronic_cols, 'OP')

### Doctors

In [12]:
# Per-provider mean of the two physician-presence flags, attached by
# matching on the Provider key rather than by row position (`.values`), which
# would silently misalign if `providers` is not in the groupby's sorted
# Provider order. Providers absent from `claims` get NaN.
physician_flags_by_provider = claims.groupby('Provider')
providers['HasAnyPhysician'] = providers['Provider'].map(
    physician_flags_by_provider.HasAnyPhysician.mean())
providers['HasAllPhysicians'] = providers['Provider'].map(
    physician_flags_by_provider.HasAllPhysicians.mean())

### Money

## MISC

In [31]:
# def add_mean_summary_col(groupby_col, mean_col, newcol1, newcol2):
#     outpatient_claims = claims.loc[claims.IsOutpatient == 1]
#     inpatient_claims  = claims.loc[claims.IsOutpatient == 0]
    
#     ip = fxns.add_mean_per_col(
#         inpatient_claims, groupby_col, mean_col, newcol1)
#     op = fxns.add_mean_per_col(
#         outpatient_claims, groupby_col, mean_col, newcol2)
    
#     expanded_claims = pd.concat([ip, op])
#     return expanded_claims

In [32]:
# # Average chronic conditions per inpatients
# for col in chronic_cols:
#     sum_stats = \
#         add_mean_summary_col('BeneID', col, f'{col}_perIP', f'{col}_perOP')

In [33]:
# sum_stats = fxns.add_count_per_col(claims, 'BeneID', 'ClaimID', 'ClaimsPerBene')

# sum_stats = fxns.add_count_per_col(
#     claims, 'AttendingPhysician', 'ClaimID', 'ClaimsPerAttPhy')
# sum_stats = fxns.add_count_per_col(
#     claims, 'OperatingPhysician', 'ClaimID', 'ClaimsPerOperPhys')
# sum_stats = fxns.add_count_per_col(
#     claims, 'OtherPhysician', 'ClaimID', 'ClaimsPerOtherPhys')

In [34]:
# sum_stats = add_mean_summary_col('BeneID', 'InscClaimAmtReimbursed',
#                          'Reimb_perIP', 'Reimb_perOP')
# sum_stats = add_mean_summary_col('BeneID', 'DeductibleAmtPaid',
#                          'Deductible_perIP', 'Deductible_perOP')
# sum_stats = add_mean_summary_col('BeneID', 'PercInsCovered',
#                          'PercInsCovered_perIP', 'PercInsCovered_perOP')
# sum_stats = add_mean_summary_col('BeneID', 'DailyClaimCost',
#                          'DailyClaimCost_perIP', 'DailyClaimCost_perOP')

In [39]:
# temp_cols = claims.groupby('Provider').mean().columns.to_list()
# temp_cols
# temp = sum_stats.groupby('Provider').mean()
# temp.drop(temp_cols, axis=1, inplace=True)
# print(len(temp))
# temp.sample(10)