## Setup and Data Import

In [1]:
import sys
sys.path.insert(0, '..')

from joblib import load

import Functions as fxns
from Functions import np, pd

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from datetime import timedelta

from sklearn.impute import SimpleImputer

In [2]:
# !python ../Preprocessing.py

In [3]:
claims = load('../claims.pkl')

## Claims DF

### New Columns

In [4]:
claims['AgeAtService'] = ((claims.ClaimStartDt - claims.DOB)
                              / timedelta(days=365)).astype(int)

claims['HasDied'] = claims.DOD.notna()
# claims.loc[claims.HasDied == True, 'HasDied'] = 1
# claims.loc[claims.HasDied == False, 'HasDied'] = 0
# claims.HasDied = claims.HasDied.astype(str).astype(int)
fxns.re_encode_bool(claims, ['HasDied'])

In [5]:
physician_cols = \
    claims.columns[claims.columns.str.contains('Physician')].to_list()

claims['HasAllPhysicians'] = claims[physician_cols].notna().all(axis=1)
# claims.loc[claims.HasAllPhysicians == True, 'HasAllPhysicians'] = 1
# claims.loc[claims.HasAllPhysicians == False, 'HasAllPhysicians'] = 0
# claims.HasAllPhysicians = claims.HasAllPhysicians.astype(str).astype(int)

claims['HasAllPhysicians'] = claims[physician_cols].notna().any(axis=1)
# claims.loc[claims.HasAnyPhysicians == True, 'HasAnyPhysicians'] = 1
# claims.loc[claims.HasAnyPhysicians == False, 'HasAnyPhysicians'] = 0
# claims.HasAnyPhysicians = claims.HasAnyPhysicians.astype(str).astype(int)

claims['HasNoPhysicians'] = claims[physician_cols].isna().all(axis=1)
# claims.loc[claims.HasNoPhysicians == True, 'HasNoPhysicians'] = 1
# claims.loc[claims.HasNoPhysicians == False, 'HasNoPhysicians'] = 0
# claims.HasNoPhysicians = claims.HasNoPhysicians.astype(str).astype(int)

fxns.re_encode_bool(claims, ['HasAllPhysicians', 'HasAllPhysicians', 'HasNoPhysicians'])

In [6]:
claims['ClaimDuration'] = \
    (claims.ClaimEndDt  - claims.ClaimStartDt).dt.days + 1
claims['IPDuration'] = \
    (claims.DischargeDt - claims.AdmissionDt).dt.days + 1

In [7]:
outpatient_claims = claims.loc[claims.IsOutpatient == 1]
inpatient_claims  = claims.loc[claims.IsOutpatient == 0]
claims['IPClaimDuration'] = \
    (inpatient_claims.ClaimEndDt
     - inpatient_claims.ClaimStartDt).dt.days + 1
claims['OPClaimDuration'] = \
    (outpatient_claims.ClaimEndDt
     - outpatient_claims.ClaimStartDt).dt.days + 1

In [8]:
claims['ClaimCost'] = \
    claims.InscClaimAmtReimbursed + claims.DeductibleAmtPaid
claims['PercInsCovered'] = \
    round((claims.InscClaimAmtReimbursed / claims.ClaimCost) * 100)
claims['DailyClaimCost'] = \
    round(claims.ClaimCost / claims.ClaimDuration)

In [9]:
claim_cols = claims.columns.to_list()
claim_cols.remove('Provider')

### Variables

In [10]:
# lists:
numeric_cols     = fxns.cols_by_dtype(claims)[0]
categorical_cols = fxns.cols_by_dtype(claims)[1]
date_cols        = fxns.cols_by_dtype(claims)[2]
physician_cols   = \
    claims.columns[claims.columns.str.contains('Physician')].to_list()
chronic_cols     = \
    claims.columns[claims.columns.str.contains('Chronic')].to_list()
diagnosis_cols   = \
    claims.columns[claims.columns.str.contains('Diagnosis')].to_list()
procedure_cols   = \
    claims.columns[claims.columns.str.contains('Procedure')].to_list()

# dataframes:
outpatient_claims = claims.loc[claims.IsOutpatient == 1]
inpatient_claims  = claims.loc[claims.IsOutpatient == 0]

## Providers DF

### Data Import

In [11]:
providers = claims.groupby(['PotentialFraud', 'Provider']).size().reset_index().drop(0, axis=1)

In [29]:
lucas = pd.read_csv('./data/providers.csv')
lucas.columns = ['Provider', 'DualPatientType_Perc', 'HasNoPhys_Perc', 'HasTop5AdmtCode']

Index(['Provider', 'percentage_InOutpatients', 'percentage_noPhysician',
       'isTop5admtcode'],
      dtype='object')


In [13]:
ryan = pd.read_csv('./data/Ryan_providers.csv')
ryan.drop('Unnamed: 0', axis=1, inplace=True)
ryan.rename(columns = {'Patient_Attphy_Ratio': 'BenesPerAttPhys',
                'Patient_Operphy_Ratio': 'BenesPerOperPhys',
                'Patient_Otherphy_Ratio': 'BenesPerOthPhys',
                'BeneID_Nunique_IP': 'IP_UniqueBenes_Count',
                'BeneID_Nunique_OP': 'OP_UniqueBenes_Count',
                'State_Nunique_IP': 'IP_UniqueSt_Count',
                'State_Nunique_OP': 'OP_UniqueSt_Count'}, inplace=True)

In [14]:
providers = providers.merge(lucas).merge(ryan)

### Beneficiaries

In [15]:
providers['GenderZero_Perc'] = claims.groupby('Provider').Gender.mean().values
providers['HasRenalDisease_Perc'] = claims.groupby('Provider').RenalDisease.mean().values
providers['HasDied_Perc'] = claims.groupby('Provider').HasDied.mean().values
providers['Outpatient_Perc'] = claims.groupby('Provider').IsOutpatient.mean().values

In [16]:
# THIS IS DOING WEIRD THINGS

# cond_by_ip = \
#     inpatient_claims.groupby('Provider')[chronic_cols].sum().reset_index()
# cond_by_op = \
#     outpatient_claims.groupby('Provider')[chronic_cols].sum().reset_index()

# def mult_cols_mean_by_perc(df, cols, pat_type):
#     percs = pd.DataFrame(providers.Provider)
#     for col in cols:
#         percs[f'{col}_{pat_type}_Perc'] = df[col].mean()
#     return pd.merge(providers, percs)

# providers = mult_cols_mean_by_perc(cond_by_ip, chronic_cols, 'IP')
# providers = mult_cols_mean_by_perc(cond_by_op, chronic_cols, 'OP')

### Doctors

In [17]:
# THIS IS DOING WEIRD THINGS

# new_phys_cols = ['HasAllPhysicians', 'HasAnyPhysicians', 'HasNoPhysicians']

# phys_counts_by_ip = \
#     inpatient_claims.groupby('Provider')[new_phys_cols].mean().reset_index()
# phys_counts_by_op = \
#     outpatient_claims.groupby('Provider')[new_phys_cols].mean().reset_index()

# def single_cols_mean_by_perc(df, cols, pat_type):
#     percs = pd.DataFrame(providers.Provider)
#     for col in cols:
#         percs[f'{col}_{pat_type}_Perc'] = df[col]
#     return pd.merge(providers, percs)


# providers = single_cols_mean_by_perc(phys_counts_by_ip, new_phys_cols, 'IP')
# providers = single_cols_mean_by_perc(phys_counts_by_ip, new_phys_cols, 'OP')

### Money

## MISC

In [18]:
# means = claims.groupby('BeneID').mean()
# means.columns


# ['IPAnnualReimbursementAmt', 'IPAnnualDeductibleAmt',
#  'OPAnnualReimbursementAmt', 'OPAnnualDeductibleAmt',
#  'IPDuration', 'IPClaimDuration', 'OPClaimDuration']

In [19]:
# outpatient_claims = claims.loc[claims.IsOutpatient == 1]
# inpatient_claims  = claims.loc[claims.IsOutpatient == 0]

# avg_ip_cols = inpatient_claims.groupby('BeneID')[[
#     'InscClaimAmtReimbursed', 'DeductibleAmtPaid', 'NoOfMonths_PartACov',
#     'NoOfMonths_PartBCov', 'AgeAtService', 'HasDied', 'PercInsCovered',
#     'HasAllPhysicians', 'HasAnyPhysicians', 'HasNoPhysicians',
#     'ClaimDuration', 'ClaimCost', 'DailyClaimCost'
#     ]].mean().add_suffix('_IP_Mean').reset_index()
# avg_op_cols = outpatient_claims.groupby('BeneID')[[
#     'InscClaimAmtReimbursed', 'DeductibleAmtPaid', 'NoOfMonths_PartACov',
#     'NoOfMonths_PartBCov', 'AgeAtService', 'HasDied', 'PercInsCovered',
#     'HasAllPhysicians', 'HasAnyPhysicians', 'HasNoPhysicians',
#     'ClaimDuration', 'ClaimCost', 'DailyClaimCost'
#     ]].mean().add_suffix('_OP_Mean').reset_index()
# avg_cols = avg_ip_cols.merge(avg_op_cols, on='BeneID')

In [20]:
providers.columns.sort_values()

Index(['Att_Phy_Mult', 'BenesPerAttPhys', 'BenesPerOperPhys',
       'BenesPerOthPhys', 'DualPatients_Perc', 'GenderZero_Perc',
       'HasDied_Perc', 'HasNoPhys_Perc', 'HasRenalDisease_Perc',
       'HasTop5AdmtCode', 'IP_Dup_Perc', 'IP_No_Proc_Perc',
       'IP_UniqueBenes_Count', 'IP_UniqueSt_Count', 'In_Top5_St_Perc',
       'OP_Dup_Perc', 'OP_No_Diag_Perc', 'OP_UniqueBenes_Count',
       'OP_UniqueSt_Count', 'Oper_Phy_Mult', 'Other_Phy_Mult',
       'Outpatient_Perc', 'PotentialFraud', 'Provider'],
      dtype='object')

In [21]:
providers.sample(10)

Unnamed: 0,PotentialFraud,Provider,DualPatients_Perc,HasNoPhys_Perc,HasTop5AdmtCode,BenesPerAttPhys,BenesPerOperPhys,BenesPerOthPhys,Att_Phy_Mult,Oper_Phy_Mult,Other_Phy_Mult,IP_Dup_Perc,OP_Dup_Perc,OP_No_Diag_Perc,IP_No_Proc_Perc,In_Top5_St_Perc,IP_UniqueBenes_Count,IP_UniqueSt_Count,OP_UniqueBenes_Count,OP_UniqueSt_Count,GenderZero_Perc,HasRenalDisease_Perc,HasDied_Perc,Outpatient_Perc
2495,0,PRV54422,0.0,,0.0,7,10,9,0.266667,0.0,0.0,,0.6,0.0,,0.0,,,12.0,2.0,0.429719,0.13253,0.004016,0.783133
1082,0,PRV52507,0.0,,0.0,9,20,14,0.172414,0.0,0.034483,,0.448276,0.0,,0.0,,,24.0,2.0,0.40625,0.125,0.03125,1.0
2308,0,PRV54167,0.0,,0.0,1,5,6,0.444444,0.0,0.0,,0.333333,0.0,,0.0,,,8.0,1.0,0.533333,0.0,0.0,1.0
1339,0,PRV52838,0.024096,,0.024096,53,54,54,0.240964,0.012048,0.048193,,0.2875,0.036145,,0.0,,,61.0,2.0,0.282051,0.128205,0.0,1.0
4615,0,PRV57390,0.0,,0.0,41,41,41,0.0,0.0,0.0,,0.407407,0.018182,,0.0,,,42.0,2.0,0.5,0.5,0.0,1.0
2354,0,PRV54224,0.0,,0.0,18,20,18,0.069767,0.0,0.069767,,0.380952,0.023256,,0.0,,,23.0,1.0,0.453608,0.185567,0.0,1.0
4829,0,PRV57676,0.0,,0.0,10,14,14,0.066667,0.0,0.0,,0.285714,0.066667,,1.0,,,15.0,1.0,1.0,1.0,0.0,1.0
3597,0,PRV55978,0.034913,0.007481,0.034913,176,263,253,0.004988,0.007481,0.0,0.086957,0.508251,0.019417,0.423913,0.147132,87.0,7.0,257.0,13.0,0.450639,0.207898,0.004646,1.0
1822,0,PRV53504,0.0,,0.0,0,3,2,0.5,0.0,0.25,,0.75,0.0,,0.0,,,4.0,1.0,0.333333,0.458333,0.0,0.833333
2503,0,PRV54431,0.0,,0.0,2,8,7,0.777778,0.0,0.0,,0.375,0.111111,,0.0,,,9.0,1.0,0.428571,0.071429,0.0,1.0


In [22]:
# NEEDS TO BE PER INPATIENT/OUTPATIENT:
# HasRenalDisease_Perc
# HasDied_Perc
# Has...Physicians (3 cols)

In [23]:
# def add_mean_summary_col(groupby_col, mean_col, newcol1, newcol2):
#     outpatient_claims = claims.loc[claims.IsOutpatient == 1]
#     inpatient_claims  = claims.loc[claims.IsOutpatient == 0]
    
#     ip = fxns.add_mean_per_col(
#         inpatient_claims, groupby_col, mean_col, newcol1)
#     op = fxns.add_mean_per_col(
#         outpatient_claims, groupby_col, mean_col, newcol2)
    
#     expanded_claims = pd.concat([ip, op])
#     return expanded_claims

In [24]:
# sum_stats = fxns.add_count_per_col(claims, 'BeneID', 'ClaimID', 'ClaimsPerBene')

# sum_stats = fxns.add_count_per_col(
#     claims, 'AttendingPhysician', 'ClaimID', 'ClaimsPerAttPhy')
# sum_stats = fxns.add_count_per_col(
#     claims, 'OperatingPhysician', 'ClaimID', 'ClaimsPerOperPhys')
# sum_stats = fxns.add_count_per_col(
#     claims, 'OtherPhysician', 'ClaimID', 'ClaimsPerOtherPhys')

In [25]:
# sum_stats = add_mean_summary_col('BeneID', 'InscClaimAmtReimbursed',
#                          'Reimb_perIP', 'Reimb_perOP')
# sum_stats = add_mean_summary_col('BeneID', 'DeductibleAmtPaid',
#                          'Deductible_perIP', 'Deductible_perOP')
# sum_stats = add_mean_summary_col('BeneID', 'PercInsCovered',
#                          'PercInsCovered_perIP', 'PercInsCovered_perOP')
# sum_stats = add_mean_summary_col('BeneID', 'DailyClaimCost',
#                          'DailyClaimCost_perIP', 'DailyClaimCost_perOP')

In [26]:
# temp_cols = claims.groupby('Provider').mean().columns.to_list()
# temp_cols
# temp = sum_stats.groupby('Provider').mean()
# temp.drop(temp_cols, axis=1, inplace=True)
# print(len(temp))
# temp.sample(10)