In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC  
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import roc_curve, auc , f1_score
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.combine import SMOTETomek
from imblearn.combine import SMOTEENN

#For Evaluating models.
import eli5
from eli5.sklearn import PermutationImportance



In [2]:
#import data
leie = pd.read_csv('~/Documents/GitHub/medicare-fraud-detection/data/raw/CMS/UPDATED.csv', low_memory=False)
partb = pd.read_csv('~/Documents/GitHub/medicare-fraud-detection/data/raw/CMS/formB_MUP_PHY_R21_P04_V10_D19_Prov_Svc.csv', encoding = "ISO-8859-1", low_memory=False)
partd = pd.read_csv('~/Documents/GitHub/medicare-fraud-detection/data/raw/CMS/formD_MUP_DPR_RY21_P04_V10_DY19_NPIBN_1.csv', encoding = "ISO-8859-1", low_memory=False)

In [3]:
partb.rename(columns = {'Rndrng_NPI':'npi'}, inplace = True)
partd.rename(columns = {'Prscrbr_NPI':'npi'}, inplace = True)

In [4]:
#list of the most relevant features of each table
partb_feats = ['npi', 'HCPCS_Cd', 'HCPCS_Desc', 'HCPCS_Drug_Ind', 'Rndrng_Prvdr_Type', 'Rndrng_Prvdr_Gndr', 'Tot_Srvcs', 'Tot_Benes', 'Tot_Bene_Day_Srvcs', 'Avg_Sbmtd_Chrg', 'Avg_Mdcr_Pymt_Amt']
partd_feats = ['npi', 'Prscrbr_Type', 'Tot_Benes', 'Tot_Clms', 'Tot_30day_Fills', 'Tot_Day_Suply', 'Tot_Drug_Cst']


In [5]:
#create new tables with relevant features
partb_features = partb[partb_feats]
partd_features = partd[partd_feats]


In [6]:
#preprocess data set B

partb_features = partb_features[partb_features.HCPCS_Drug_Ind.eq('N')]
partb_features.head(5).iloc[1]
partb_features = partb_features.groupby(['npi','Rndrng_Prvdr_Type', 'Rndrng_Prvdr_Gndr']).agg({'Tot_Srvcs':['sum', 'mean', 'median', np.std, 'min', 'max'],
                                                                                     'Tot_Benes':['sum', 'mean', 'median', np.std, 'min', 'max'],
                                                                                     'Tot_Bene_Day_Srvcs':['sum', 'mean', 'median', np.std, 'min', 'max'],
                                                                                     'Avg_Sbmtd_Chrg':['sum', 'mean', 'median', np.std, 'min', 'max'],
                                                                                     'Avg_Mdcr_Pymt_Amt': ['sum', 'mean', 'median', np.std, 'min', 'max']})
partb_features.columns = ['_'.join(col) for col in partb_features.columns.values]
partb_features.isna().sum()
partb_features.fillna(0, inplace=True)
partb_features = partb_features.reset_index(level=['npi', 'Rndrng_Prvdr_Type', 'Rndrng_Prvdr_Gndr'])


In [7]:
#preprocess data set D
partd_features.columns
partd_features.bene_count = partd_features.Tot_Benes.replace(0,5)
partd_features = partd_features.groupby(['npi','Prscrbr_Type']).agg({'Tot_Benes':['sum', 'mean', 'median', np.std, 'min', 'max'],
                                                     'Tot_Clms':['sum', 'mean', 'median', np.std, 'min', 'max'],
                                                     'Tot_30day_Fills':['sum', 'mean', 'median', np.std, 'min', 'max'],
                                                     'Tot_Day_Suply':['sum', 'mean', 'median', np.std, 'min', 'max'],
                                                     'Tot_Drug_Cst': ['sum', 'mean', 'median', np.std, 'min', 'max']})
partd_features.columns = ['_'.join(col) for col in partd_features.columns.values]
partd_features = partd_features.reset_index(level=['Prscrbr_Type', 'npi'])

In [8]:
leie.rename(columns = {'NPI':'npi'}, inplace = True)

In [9]:
#merge 3 data sets with LEIE on NPI
partb = pd.merge(leie, partb_features, on='npi', how='outer')
partd = pd.merge(leie, partd_features, on='npi', how='outer')

In [10]:
partb = partb[['npi', 'Rndrng_Prvdr_Type', 'Rndrng_Prvdr_Gndr', 'Tot_Srvcs_sum',
       'Tot_Srvcs_mean', 'Tot_Srvcs_median', 'Tot_Srvcs_std',
       'Tot_Srvcs_min', 'Tot_Srvcs_max', 'Tot_Benes_sum',
       'Tot_Benes_mean', 'Tot_Benes_median', 'Tot_Benes_std',
       'Tot_Benes_min', 'Tot_Benes_max', 'Tot_Bene_Day_Srvcs_sum',
       'Tot_Bene_Day_Srvcs_mean', 'Tot_Bene_Day_Srvcs_median',
       'Tot_Bene_Day_Srvcs_std', 'Tot_Bene_Day_Srvcs_min',
       'Tot_Bene_Day_Srvcs_max', 'Avg_Sbmtd_Chrg_sum',
       'Avg_Sbmtd_Chrg_mean', 'Avg_Sbmtd_Chrg_median',
       'Avg_Sbmtd_Chrg_std', 'Avg_Sbmtd_Chrg_min',
       'Avg_Sbmtd_Chrg_max', 'Avg_Mdcr_Pymt_Amt_sum',
       'Avg_Mdcr_Pymt_Amt_mean',
       'Avg_Mdcr_Pymt_Amt_median',
       'Avg_Mdcr_Pymt_Amt_std', 'Avg_Mdcr_Pymt_Amt_min',
       'Avg_Mdcr_Pymt_Amt_max', 'EXCLTYPE',
       'EXCLDATE', 'REINDATE', 'WAIVERDATE', 'WVRSTATE']]

In [11]:
partd = partd[['npi', 'Prscrbr_Type', 'Tot_Benes_sum', 'Tot_Benes_mean', 'Tot_Benes_median',
       'Tot_Benes_std', 'Tot_Benes_min', 'Tot_Benes_max',
       'Tot_Clms_sum', 'Tot_Clms_mean',
       'Tot_Clms_median', 'Tot_Clms_std',
       'Tot_Clms_min', 'Tot_Clms_max',
       'Tot_30day_Fills_sum', 'Tot_30day_Fills_mean',
       'Tot_30day_Fills_median', 'Tot_30day_Fills_std',
       'Tot_30day_Fills_min', 'Tot_30day_Fills_max',
       'Tot_Day_Suply_sum', 'Tot_Day_Suply_mean',
       'Tot_Day_Suply_median', 'Tot_Day_Suply_std',
       'Tot_Day_Suply_min', 'Tot_Day_Suply_max', 'Tot_Drug_Cst_sum',
       'Tot_Drug_Cst_mean', 'Tot_Drug_Cst_median', 'Tot_Drug_Cst_std',
       'Tot_Drug_Cst_min', 'Tot_Drug_Cst_max', 'EXCLTYPE',
       'EXCLDATE', 'REINDATE', 'WAIVERDATE', 'WVRSTATE']]

In [12]:
#remove all rows without npi or provider info
partb = partb[(partb.npi != 0) & (partb.Rndrng_Prvdr_Type)]
partd = partd[(partd.npi != 0) & (partd.Prscrbr_Type)]


In [13]:
#add the label column
partb['TARGET'] = '0'
partd['TARGET'] = '0'


In [14]:
partb['EXCLDATE'] = partb['EXCLDATE'].fillna(20500101).astype(int)
partb['START_EXCLDATE'] = pd.to_datetime(partb['EXCLDATE'], format= '%Y%m%d' )
partb.head()

Unnamed: 0,npi,Rndrng_Prvdr_Type,Rndrng_Prvdr_Gndr,Tot_Srvcs_sum,Tot_Srvcs_mean,Tot_Srvcs_median,Tot_Srvcs_std,Tot_Srvcs_min,Tot_Srvcs_max,Tot_Benes_sum,...,Avg_Mdcr_Pymt_Amt_std,Avg_Mdcr_Pymt_Amt_min,Avg_Mdcr_Pymt_Amt_max,EXCLTYPE,EXCLDATE,REINDATE,WAIVERDATE,WVRSTATE,TARGET,START_EXCLDATE
70159,1124292966,Rheumatology,M,680.0,113.333333,98.0,91.698782,16.0,231.0,281.0,...,39.129903,14.39,123.099078,1128a1,20200618,0.0,0.0,,0,2020-06-18
70160,1679754725,Nephrology,M,2502.0,192.461538,38.0,326.925735,11.0,1147.0,951.0,...,48.39965,22.933636,183.028182,1128b7,20200422,0.0,0.0,,0,2020-04-22
70163,1912929787,Podiatry,M,36.0,36.0,36.0,0.0,36.0,36.0,24.0,...,0.0,15.923611,15.923611,1128a4,20220720,0.0,0.0,,0,2022-07-20
70174,1891887048,Anesthesiology,M,11273.0,939.416667,130.0,1727.399804,21.0,5798.0,2868.0,...,39.821169,16.35,150.72402,1128a1,20220320,0.0,0.0,,0,2022-03-20
70202,1972548618,Nephrology,M,205.0,68.333333,89.0,45.6545,16.0,100.0,113.0,...,12.80694,24.698125,50.312,1128a4,20201020,0.0,0.0,,0,2020-10-20


In [15]:
partb['START_EXCLDATE'] = partb['START_EXCLDATE'].dt.year

In [16]:
partd['EXCLDATE'] = partd['EXCLDATE'].fillna(20500101).astype(int)
partd['START_EXCLDATE'] = pd.to_datetime(partd['EXCLDATE'], format= '%Y%m%d' )
partd['START_EXCLDATE'] = partd['START_EXCLDATE'].dt.year

In [17]:
#define the labeling function
def make_labels(data_yr):
    if data_yr < 2025:
        return 'FRAUD'
    else:
        return 'NOT_FRAUD'
        

In [18]:
#apply the labeling function to data sets
partb['TARGET'] = partb[["START_EXCLDATE"]].apply(lambda x: make_labels(*x), axis=1)
partd['TARGET'] = partd[["START_EXCLDATE"]].apply(lambda x: make_labels(*x), axis=1)


In [19]:
#create the combined data set
combined = pd.merge(partb,partd, left_on=['npi', 'Rndrng_Prvdr_Type'], right_on=['npi', 'Prscrbr_Type'])
#drop one gender column (which is duplicate)
combined.drop(columns=['EXCLTYPE_y',
 'EXCLDATE_y','REINDATE_y','WAIVERDATE_y','WVRSTATE_y', 'Prscrbr_Type','EXCLTYPE_x',
 'EXCLDATE_x', 'REINDATE_x', 'WAIVERDATE_x', 'WVRSTATE_x', 'TARGET_x', 'START_EXCLDATE_x', 'START_EXCLDATE_y'],
              inplace=True) 
combined.head()

Unnamed: 0,npi,Rndrng_Prvdr_Type,Rndrng_Prvdr_Gndr,Tot_Srvcs_sum,Tot_Srvcs_mean,Tot_Srvcs_median,Tot_Srvcs_std,Tot_Srvcs_min,Tot_Srvcs_max,Tot_Benes_sum_x,...,Tot_Day_Suply_std,Tot_Day_Suply_min,Tot_Day_Suply_max,Tot_Drug_Cst_sum,Tot_Drug_Cst_mean,Tot_Drug_Cst_median,Tot_Drug_Cst_std,Tot_Drug_Cst_min,Tot_Drug_Cst_max,TARGET_y
0,1124292966,Rheumatology,M,680.0,113.333333,98.0,91.698782,16.0,231.0,281.0,...,2954.380344,308.0,13818.0,1283381.85,32084.54625,3077.69,59804.546888,46.08,301366.05,FRAUD
1,1679754725,Nephrology,M,2502.0,192.461538,38.0,326.925735,11.0,1147.0,951.0,...,1758.639339,100.0,8418.0,78647.09,1787.433864,559.61,3976.609906,41.5,22078.12,FRAUD
2,1912929787,Podiatry,M,36.0,36.0,36.0,0.0,36.0,36.0,24.0,...,582.320072,480.0,2070.0,2439.93,406.655,276.775,337.958636,136.83,1024.57,FRAUD
3,1891887048,Anesthesiology,M,11273.0,939.416667,130.0,1727.399804,21.0,5798.0,2868.0,...,3272.909113,279.0,13553.0,177931.2,5233.270588,1193.19,9908.342463,57.38,51154.05,FRAUD
4,1972548618,Nephrology,M,205.0,68.333333,89.0,45.6545,16.0,100.0,113.0,...,898.461243,56.0,3740.0,125655.92,2026.708387,397.305,5508.886993,58.42,27273.66,FRAUD


In [20]:
combined.rename(columns = {'TARGET_y':'TARGET'}, inplace = True)

In [21]:
#one hot encoding of categorical variables
partb_category_columns = ['Rndrng_Prvdr_Type', 'Rndrng_Prvdr_Gndr']
partd_category_columns = ['Prscrbr_Type']
combined_category_columns = ['Rndrng_Prvdr_Type', 'Rndrng_Prvdr_Gndr']

In [22]:
partb = pd.get_dummies(partb, columns=partb_category_columns, drop_first=True)
partd = pd.get_dummies(partd, columns=partd_category_columns, drop_first=True)
combined = pd.get_dummies(combined, columns=combined_category_columns, drop_first=True)

In [23]:
#drop all unneccessary columns
columns_to_drop = [ 'npi', 'EXCLTYPE', 'WVRSTATE', 'EXCLDATE', 'START_EXCLDATE', 'REINDATE', 'WAIVERDATE' ]

partb.drop(columns_to_drop, axis=1, inplace=True)
partd.drop(columns_to_drop, axis=1, inplace=True)
combined.drop(['npi'], axis=1, inplace=True)

In [24]:
#fill NaN value with 0
partb.fillna(0, inplace = True)
partd.fillna(0, inplace=True)
combined.fillna(0, inplace=True)

In [33]:
from library.sb_utils import save_file
datapath = '~/Documents/GitHub/medicare-fraud-detection/data/processed'
save_file(cleaned_data, 'data_cleaned.csv', datapath)

ModuleNotFoundError: No module named 'library'

In [32]:
!pip install library



In [38]:
!pip uninstall Libraryc

Found existing installation: Library 0.0.0
Uninstalling Library-0.0.0:
  Would remove:
    /Users/toanngo/.conda/envs/quinn_env/lib/python3.9/site-packages/Library-0.0.0.dist-info/*
    /Users/toanngo/.conda/envs/quinn_env/lib/python3.9/site-packages/books/*
Proceed (Y/n)? ^C
[31mERROR: Operation cancelled by user[0m
