In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC  
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import roc_curve, auc , f1_score
from sklearn.metrics import confusion_matrix


In [2]:
df_label = pd.read_csv('~/Documents/GitHub/medicare-fraud-detection/data/raw/CMS/UPDATED.csv', low_memory=False)
print(df_label.shape)
print(df_label.head())

(76021, 18)
  LASTNAME FIRSTNAME MIDNAME                      BUSNAME         GENERAL  \
0      NaN       NaN            #1 MARKETING SERVICE, INC  OTHER BUSINESS   
1      NaN       NaN          101 FIRST CARE PHARMACY INC  OTHER BUSINESS   
2      NaN       NaN             14 LAWRENCE AVE PHARMACY        PHARMACY   
3      NaN       NaN             143 MEDICAL EQUIPMENT CO     DME COMPANY   
4      NaN       NaN           184TH STREET PHARMACY CORP  OTHER BUSINESS   

      SPECIALTY UPIN         NPI  DOB                        ADDRESS  \
0    SOBER HOME  NaN           0  NaN      239 BRIGHTON BEACH AVENUE   
1      PHARMACY  NaN  1972902351  NaN  C/O 609 W 191ST STREET, APT D   
2           NaN  NaN           0  NaN             14 LAWRENCE AVENUE   
3  DME - OXYGEN  NaN           0  NaN               701 NW 36 AVENUE   
4      PHARMACY  NaN  1922348218  NaN                  69 E 184TH ST   

        CITY STATE    ZIP EXCLTYPE  EXCLDATE  REINDATE  WAIVERDATE WVRSTATE  
0   BROOKLYN  

In [3]:
print(df_label.info())
print(df_label.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76021 entries, 0 to 76020
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   LASTNAME    72791 non-null  object 
 1   FIRSTNAME   72790 non-null  object 
 2   MIDNAME     76021 non-null  object 
 3   BUSNAME     3228 non-null   object 
 4   GENERAL     76021 non-null  object 
 5   SPECIALTY   71871 non-null  object 
 6   UPIN        6200 non-null   object 
 7   NPI         76021 non-null  int64  
 8   DOB         71930 non-null  float64
 9   ADDRESS     76012 non-null  object 
 10  CITY        76020 non-null  object 
 11  STATE       76016 non-null  object 
 12  ZIP         76021 non-null  int64  
 13  EXCLTYPE    76021 non-null  object 
 14  EXCLDATE    76021 non-null  int64  
 15  REINDATE    76021 non-null  int64  
 16  WAIVERDATE  76021 non-null  int64  
 17  WVRSTATE    11 non-null     object 
dtypes: float64(1), int64(5), object(12)
memory usage: 10.4+ MB
None
  

In [4]:
#get the list of all fraud NPI
df_label_fraud = df_label[df_label['NPI'] != 0]
label = df_label_fraud['NPI'].unique()
print(len(label))

6150


In [5]:
#check if NPI is unique
print(df_label_fraud['NPI'].value_counts())
df_label_fraud[df_label_fraud['NPI'] == 1235282088]

1225072028    3
1801839139    3
1235282088    2
1356339055    2
1801880638    2
             ..
1942389754    1
1770677072    1
1225190606    1
1942495080    1
1174561708    1
Name: NPI, Length: 6150, dtype: int64


Unnamed: 0,LASTNAME,FIRSTNAME,MIDNAME,BUSNAME,GENERAL,SPECIALTY,UPIN,NPI,DOB,ADDRESS,CITY,STATE,ZIP,EXCLTYPE,EXCLDATE,REINDATE,WAIVERDATE,WVRSTATE
61830,SCHWARTZ,IRVING,JACK,,"PHYSICIAN (MD, DO)",GENERAL PRACTICE,A28402,1235282088,19441117.0,"605 QUEENS AVE, APT 17",YUBA CITY,CA,95991,1128a1,20121018,0,0,
61831,SCHWARTZ,IRVING,JACK,,"PHYSICIAN (MD, DO)",GENERAL PRACTICE,A28402,1235282088,19441117.0,"3901 KLEIN BLVD, #68935-097",LOMPOC,CA,93436,1128a1,20140220,0,0,


In [6]:
#we check the stats of columns again, REINDATE and WAIVERDATE contain no valuable info
print(df_label_fraud.describe())
print(df_label_fraud['WAIVERDATE'].value_counts())

                NPI           DOB           ZIP      EXCLDATE  REINDATE  \
count  6.280000e+03  5.863000e+03   6280.000000  6.280000e+03    6280.0   
mean   1.498480e+09  1.960458e+07  49399.880096  2.015797e+07       0.0   
std    2.882032e+08  1.245219e+05  30372.177613  3.946844e+04       0.0   
min    1.003000e+09  1.921102e+07      0.000000  1.988070e+07       0.0   
25%    1.255305e+09  1.952016e+07  23452.750000  2.013082e+07       0.0   
50%    1.497939e+09  1.960042e+07  44327.000000  2.016062e+07       0.0   
75%    1.740606e+09  1.969122e+07  78212.500000  2.019042e+07       0.0   
max    1.993000e+09  1.994122e+07  99901.000000  2.022072e+07       0.0   

         WAIVERDATE  
count  6.280000e+03  
mean   1.922997e+04  
std    6.218851e+05  
min    0.000000e+00  
25%    0.000000e+00  
50%    0.000000e+00  
75%    0.000000e+00  
max    2.016022e+07  
0           6274
20090618       1
20160218       1
20140917       1
20150618       1
20110720       1
20111115       1
Name: W

In [7]:
#the excldate ranges from 1988 to 2022
print(df_label_fraud['EXCLDATE'].sort_values())
print(df_label_fraud['REINDATE'].value_counts())
#there is no info on REINDATE, so the provider who has been excluded did not get back 

9384     19880703
4847     19881006
7664     19930516
9418     19931022
6382     19931121
           ...   
54559    20220720
11622    20220720
74770    20220720
11239    20220720
63864    20220720
Name: EXCLDATE, Length: 6280, dtype: int64
0    6280
Name: REINDATE, dtype: int64


In [8]:
#We now check the waiverdate, assumption is the excluded provider with waiver can still work in health care programs
print(df_label_fraud[df_label_fraud['WAIVERDATE'] != 0])

         LASTNAME FIRSTNAME         MIDNAME                         BUSNAME  \
3035          NaN       NaN                  UNION COUNTY MEDICAL CENTER IN   
5957        ASWAD   MOHAMED          BASEL                              NaN   
40983       LENTZ     RANDY          SCOTT                              NaN   
47705     MIRANDA   EDUARDO          SIRIA                              NaN   
53405  PATWARDHAN     VINOD  CHANDRASHEKAR                              NaN   
54706     PICKENS      CORY            LEE                              NaN   

                    GENERAL          SPECIALTY    UPIN         NPI  \
3035                 CLINIC                NaN     NaN  1093935355   
5957     PHYSICIAN (MD, DO)           ONCOLOGY  H95172  1871571406   
40983  MEDICAL PRACTICE, MD    FAMILY PRACTICE  H46326  1205832227   
47705    PHYSICIAN (MD, DO)  INTERNAL MEDICINE     NaN  1285673012   
53405  MEDICAL PRACTICE, MD           ONCOLOGY  A87224  1114922606   
54706    PHYSICIAN (MD, DO

In [9]:
#Q: can the waiverdate is before the excldate? 

In [10]:
#import mup_dme, this dataset presents information on DMEPOS products and services provided to Medicare beneficiaries ordered by physicians and other healthcare professionals.  
#description for columns at https://data.cms.gov/resources/medicare-durable-medical-equipment-devices-supplies-by-referring-provider-and-service-data-dictionary
#rfrg = referring
df_mup_dme = pd.read_csv('~/Documents/GitHub/medicare-fraud-detection/data/raw/CMS/mup_dme_ry21_p05_v10_dy19_prvhpr_0.csv', low_memory=False)
print(df_mup_dme.shape)
print(df_mup_dme.columns)
print(df_mup_dme.head())

(1656449, 34)
Index(['Rfrg_NPI', 'Rfrg_Prvdr_Last_Name_Org', 'Rfrg_Prvdr_First_Name',
       'Rfrg_Prvdr_MI', 'Rfrg_Prvdr_Crdntls', 'Rfrg_Prvdr_Gndr',
       'Rfrg_Prvdr_Ent_Cd', 'Rfrg_Prvdr_St1', 'Rfrg_Prvdr_St2',
       'Rfrg_Prvdr_City', 'Rfrg_Prvdr_State_Abrvtn', 'Rfrg_Prvdr_State_FIPS',
       'Rfrg_Prvdr_Zip5', 'Rfrg_Prvdr_RUCA_CAT', 'Rfrg_Prvdr_RUCA',
       'Rfrg_Prvdr_RUCA_Desc', 'Rfrg_Prvdr_Cntry', 'Rfrg_Prvdr_Type_cd',
       'Rfrg_Prvdr_Type', 'Rfrg_Prvdr_Type_Flag', 'BETOS_Lvl', 'BETOS_Cd',
       'BETOS_Desc', 'HCPCS_CD', 'HCPCS_Desc', 'Suplr_Rentl_Ind', 'Tot_Suplrs',
       'Tot_Suplr_Benes', 'Tot_Suplr_Clms', 'Tot_Suplr_Srvcs',
       'Avg_Suplr_Sbmtd_Chrg', 'Avg_Suplr_Mdcr_Alowd_Amt',
       'Avg_Suplr_Mdcr_Pymt_Amt', 'Avg_Suplr_Mdcr_Stdzd_Amt'],
      dtype='object')
     Rfrg_NPI Rfrg_Prvdr_Last_Name_Org Rfrg_Prvdr_First_Name Rfrg_Prvdr_MI  \
0  1003000126                Enkeshafi               Ardalan           NaN   
1  1003000126                Enkeshafi          

In [22]:
df_mup_dme[df_mup_dme['Rfrg_Prvdr_St1'].str.upper() == 'LAWRENCE AVE']

Unnamed: 0,Rfrg_NPI,Rfrg_Prvdr_Last_Name_Org,Rfrg_Prvdr_First_Name,Rfrg_Prvdr_MI,Rfrg_Prvdr_Crdntls,Rfrg_Prvdr_Gndr,Rfrg_Prvdr_Ent_Cd,Rfrg_Prvdr_St1,Rfrg_Prvdr_St2,Rfrg_Prvdr_City,...,HCPCS_Desc,Suplr_Rentl_Ind,Tot_Suplrs,Tot_Suplr_Benes,Tot_Suplr_Clms,Tot_Suplr_Srvcs,Avg_Suplr_Sbmtd_Chrg,Avg_Suplr_Mdcr_Alowd_Amt,Avg_Suplr_Mdcr_Pymt_Amt,Avg_Suplr_Mdcr_Stdzd_Amt


In [11]:
#check the stats
print(df_mup_dme.info())
print(df_mup_dme.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1656449 entries, 0 to 1656448
Data columns (total 34 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   Rfrg_NPI                  1656449 non-null  int64  
 1   Rfrg_Prvdr_Last_Name_Org  1656449 non-null  object 
 2   Rfrg_Prvdr_First_Name     1656403 non-null  object 
 3   Rfrg_Prvdr_MI             1219478 non-null  object 
 4   Rfrg_Prvdr_Crdntls        1610499 non-null  object 
 5   Rfrg_Prvdr_Gndr           1656403 non-null  object 
 6   Rfrg_Prvdr_Ent_Cd         1656449 non-null  object 
 7   Rfrg_Prvdr_St1            1656449 non-null  object 
 8   Rfrg_Prvdr_St2            576658 non-null   object 
 9   Rfrg_Prvdr_City           1656449 non-null  object 
 10  Rfrg_Prvdr_State_Abrvtn   1656449 non-null  object 
 11  Rfrg_Prvdr_State_FIPS     1656449 non-null  object 
 12  Rfrg_Prvdr_Zip5           1656449 non-null  object 
 13  Rfrg_Prvdr_RUCA_CAT       1

In [12]:
#import data set B
#description for columns at https://data.cms.gov/resources/medicare-durable-medical-equipment-devices-supplies-by-referring-provider-and-service-data-dictionary
#rndrng = rendering

df_B = pd.read_csv('~/Documents/GitHub/medicare-fraud-detection/data/raw/CMS/formB_MUP_PHY_R21_P04_V10_D19_Prov_Svc.csv', encoding = "ISO-8859-1", low_memory=False)
print(df_B.shape)
print(df_B.columns)
print(df_B.head())


(10140228, 29)
Index(['Rndrng_NPI', 'Rndrng_Prvdr_Last_Org_Name', 'Rndrng_Prvdr_First_Name',
       'Rndrng_Prvdr_MI', 'Rndrng_Prvdr_Crdntls', 'Rndrng_Prvdr_Gndr',
       'Rndrng_Prvdr_Ent_Cd', 'Rndrng_Prvdr_St1', 'Rndrng_Prvdr_St2',
       'Rndrng_Prvdr_City', 'Rndrng_Prvdr_State_Abrvtn',
       'Rndrng_Prvdr_State_FIPS', 'Rndrng_Prvdr_Zip5', 'Rndrng_Prvdr_RUCA',
       'Rndrng_Prvdr_RUCA_Desc', 'Rndrng_Prvdr_Cntry', 'Rndrng_Prvdr_Type',
       'Rndrng_Prvdr_Mdcr_Prtcptg_Ind', 'HCPCS_Cd', 'HCPCS_Desc',
       'HCPCS_Drug_Ind', 'Place_Of_Srvc', 'Tot_Benes', 'Tot_Srvcs',
       'Tot_Bene_Day_Srvcs', 'Avg_Sbmtd_Chrg', 'Avg_Mdcr_Alowd_Amt',
       'Avg_Mdcr_Pymt_Amt', 'Avg_Mdcr_Stdzd_Amt'],
      dtype='object')
   Rndrng_NPI Rndrng_Prvdr_Last_Org_Name Rndrng_Prvdr_First_Name  \
0  1003000126                  Enkeshafi                 Ardalan   
1  1003000126                  Enkeshafi                 Ardalan   
2  1003000126                  Enkeshafi                 Ardalan   
3  100300

In [13]:
#check the stats
print(df_B.info())
print(df_B.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10140228 entries, 0 to 10140227
Data columns (total 29 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   Rndrng_NPI                     int64  
 1   Rndrng_Prvdr_Last_Org_Name     object 
 2   Rndrng_Prvdr_First_Name        object 
 3   Rndrng_Prvdr_MI                object 
 4   Rndrng_Prvdr_Crdntls           object 
 5   Rndrng_Prvdr_Gndr              object 
 6   Rndrng_Prvdr_Ent_Cd            object 
 7   Rndrng_Prvdr_St1               object 
 8   Rndrng_Prvdr_St2               object 
 9   Rndrng_Prvdr_City              object 
 10  Rndrng_Prvdr_State_Abrvtn      object 
 11  Rndrng_Prvdr_State_FIPS        float64
 12  Rndrng_Prvdr_Zip5              object 
 13  Rndrng_Prvdr_RUCA              float64
 14  Rndrng_Prvdr_RUCA_Desc         object 
 15  Rndrng_Prvdr_Cntry             object 
 16  Rndrng_Prvdr_Type              object 
 17  Rndrng_Prvdr_Mdcr_Prtcptg_Ind  object 
 18  

In [23]:
#import data set D
#description for D columns at https://data.cms.gov/resources/medicare-part-d-prescribers-by-provider-and-drug-data-dictionary
df_D = pd.read_csv('~/Documents/GitHub/medicare-fraud-detection/data/raw/CMS/formD_MUP_DPR_RY21_P04_V10_DY19_NPIBN_1.csv', encoding = "ISO-8859-1", low_memory=False)
print(df_D.shape)
print(df_D.columns)
print(df_D.head())

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
#check the stats
print(df_D.info())
print(df_D.describe())

In [24]:
#we create a dataframe with first column is NPI and second is PotentialFraud
df_Potential_Fraud = {"NPI":[],"Potential_Fraud":[]}
for i in label:
    df_Potential_Fraud['NPI'].append(i) 
    df_Potential_Fraud['Potential_Fraud'].append('Yes')
df_Potential_Fraud = pd.DataFrame(df_Potential_Fraud)

In [25]:
#join set B with df_Potential_Fraud
df_B = df_B.rename(columns={"Rndrng_NPI": "NPI"})
df_B_fraud = df_B.set_index('NPI').join(df_Potential_Fraud.set_index('NPI'), on = 'NPI', how = 'outer')

In [26]:
df_B_fraud

Unnamed: 0,NPI,Rndrng_Prvdr_Last_Org_Name,Rndrng_Prvdr_First_Name,Rndrng_Prvdr_MI,Rndrng_Prvdr_Crdntls,Rndrng_Prvdr_Gndr,Rndrng_Prvdr_Ent_Cd,Rndrng_Prvdr_St1,Rndrng_Prvdr_St2,Rndrng_Prvdr_City,...,HCPCS_Drug_Ind,Place_Of_Srvc,Tot_Benes,Tot_Srvcs,Tot_Bene_Day_Srvcs,Avg_Sbmtd_Chrg,Avg_Mdcr_Alowd_Amt,Avg_Mdcr_Pymt_Amt,Avg_Mdcr_Stdzd_Amt,Potential_Fraud
1.003000e+09,1003000126,Enkeshafi,Ardalan,,M.D.,M,I,900 Seton Dr,,Cumberland,...,N,F,40.0,40.0,40.0,232.275000,72.59000,57.870000,58.19175,
1.003000e+09,1003000126,Enkeshafi,Ardalan,,M.D.,M,I,900 Seton Dr,,Cumberland,...,N,F,25.0,25.0,25.0,712.800000,186.72520,148.851600,147.77000,
1.003000e+09,1003000126,Enkeshafi,Ardalan,,M.D.,M,I,900 Seton Dr,,Cumberland,...,N,F,24.0,24.0,24.0,320.166667,101.12125,80.615833,80.81000,
1.003000e+09,1003000126,Enkeshafi,Ardalan,,M.D.,M,I,900 Seton Dr,,Cumberland,...,N,F,157.0,158.0,158.0,651.417722,200.93000,160.190000,161.06000,
1.003000e+09,1003000126,Enkeshafi,Ardalan,,M.D.,M,I,900 Seton Dr,,Cumberland,...,N,F,117.0,205.0,205.0,245.614634,72.75639,58.005561,57.93000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,1053622035,,,,,,,,,,...,,,,,,,,,,Yes
,1124024435,,,,,,,,,,...,,,,,,,,,,Yes
,1194930149,,,,,,,,,,...,,,,,,,,,,Yes
,1578637385,,,,,,,,,,...,,,,,,,,,,Yes


In [35]:
sum(df_B_fraud['Potential_Fraud'] == 'Yes')

7534

In [None]:
print(df_B_fraud['Potential_Fraud'].value_counts())

In [None]:
#change Potential_Fraud to float
# df_B_fraud['Potential_Fraud'] = df_B_fraud['Potential_Fraud'].fillna(0)
# for i in df_B_fraud['Potential_Fraud']:
#     if i == 'Yes':
#         df_B_fraud['Potential_Fraud'] = 1
fraud_bool = df_B_fraud['Potential_Fraud'] == 'Yes'
df_B_fraud['Potential_Fraud_bool'] = fraud_bool
df_B_fraud

In [None]:
#create heatmap on data B
sns.heatmap(df_B_fraud.corr())
plt.show()

In [None]:
#Tot_Benes and Tot_Bene_Day_srvcs are highly correlated. So we drop. Similar for Avg_Mdcr_alowd, pymt and stdzd
to_drop_B = ['Tot_Bene_Day_Srvcs', 'Avg_Mdcr_Alowd_Amt', 'Avg_Mdcr_Pymt_Amt', 'Avg_Mdcr_Stdzd_Amt', 'Potential_Fraud']
df_B = df_B_fraud.drop(labels = to_drop_B, axis=1)

In [None]:
print(df_B.info())

In [None]:
#now we group by NPI

In [None]:
#join set D with df_Potential_Fraud
df_D = df_D.rename(columns={"Prscrbr_NPI": "NPI"})
df_D_fraud = df_D.set_index('NPI').join(df_Potential_Fraud.set_index('NPI'), on = 'NPI', how = 'outer')

In [None]:
fraud_bool_D = df_D_fraud['Potential_Fraud'] == 'Yes'
df_D_fraud['Potential_Fraud_bool'] = fraud_bool_D
df_D_fraud

In [None]:
sns.heatmap(df_D_fraud.corr())
plt.show()
#same situation with B

In [None]:
to_drop_D = ['Prscrbr_Last_Org_Name', 'Prscrbr_First_Name', 'Potential_Fraud', 'GE65_Tot_Clms', 'GE65_Tot_30day_Fills', 'GE65_Tot_Drug_Cst', 'GE65_Tot_Day_Suply', 'GE65_Tot_Benes']
df_D = df_D_fraud.drop(labels = to_drop_D, axis=1)

In [None]:
df_D.columns

In [None]:
#join set DME with df_Potential_Fraud
df_mup_dme = df_mup_dme.rename(columns={"Rfrg_NPI": "NPI"})
df_mup_dme_fraud = df_mup_dme.set_index('NPI').join(df_Potential_Fraud.set_index('NPI'), on = 'NPI', how = 'outer')
df_mup_dme_fraud.info()

In [None]:
df_mup_dme.info()

In [None]:
fraud_bool_DME = df_mup_dme_fraud['Potential_Fraud'] == 'Yes'
df_mup_dme_fraud['Potential_Fraud_bool'] = fraud_bool_DME

In [None]:
sns.heatmap(df_mup_dme_fraud.corr())
plt.show()

In [None]:
df_mup_dme_fraud.columns

In [None]:
#now we consider some of the most important features according to Brendan, including total_claim_count_max, total_30_day_fill_count_std, bene_unique_cnt_mean


In [None]:
#we work on total claim for set D first
#print(df_D['Tot_Clms'], df_D['GE65_Tot_Clms'])
#rls of totalclaims and total claims for over 65?
plt.plot(df_D['Tot_Clms'], df_D['GE65_Tot_Clms'])
plt.show()

In [None]:
dfD = df_D_fraud.groupby('NPI')['Tot_Clms'].sum().reset_index(name = 'Tot_clm')
dfD = dfD.set_index('NPI').join(df_Potential_Fraud.set_index('NPI'), on = 'NPI', how = 'outer')
fraud_bool_ = dfD['Potential_Fraud'] == 'Yes'
dfD['Potential_Fraud_bool'] = fraud_bool_
dfD

In [None]:
plt.scatter(dfD['Tot_clm'], dfD['Potential_Fraud_bool'])
plt.show()

In [None]:
dat_B_agg = df_B.groupby(['NPI','Rndrng_Prvdr_Ent_Cd', 'Rndrng_Prvdr_Cntry']).agg(
                                                   {'Tot_Srvcs':['sum', 'mean', 'median', np.std, 'min', 'max'],
                                                    'Tot_Benes':['sum', 'mean', 'median', np.std, 'min', 'max'],
                                                    'Tot_Bene_Day_Srvcs':['sum', 'mean', 'median', np.std, 'min', 'max'],
                                                    'Avg_Sbmtd_Chrg':['sum', 'mean', 'median', np.std, 'min', 'max'],
                                                    'Avg_Mdcr_Pymt_Amt': ['sum', 'mean', 'median', np.std, 'min', 'max']
                                                 })
dat_B_agg.head()

In [None]:
dat_D_agg = df_D.groupby(['NPI','Prscrbr_Type']).agg(
                                                  {'Tot_Benes':['sum', 'mean', 'median', np.std, 'min', 'max'],
                                                   'Tot_Clms':['sum', 'mean', 'median', np.std, 'min', 'max'],
                                                   'Tot_30day_Fills':['sum', 'mean', 'median', np.std, 'min', 'max'],
                                                   'Tot_Day_Suply':['sum', 'mean', 'median', np.std, 'min', 'max'],
                                                   'Tot_Drug_Cst': ['sum', 'mean', 'median', np.std, 'min', 'max']
                                                  })
dat_D_agg.head()

In [None]:
dat_dem_agg = df_mup_dme.groupby(['NPI',]).agg({'Tot_Suplrs':['sum', 'mean', 'median', np.std, 'min', 'max'],
                                                       'Tot_Suplr_Benes':['sum', 'mean', 'median', np.std, 'min', 'max'],
                                                       'Tot_Suplr_Clms':['sum', 'mean', 'median', np.std, 'min', 'max'],
                                                       'Tot_Suplr_Srvcs':['sum', 'mean', 'median', np.std, 'min', 'max'],
                                                       'Avg_Suplr_Mdcr_Pymt_Amt':['sum', 'mean', 'median', np.std, 'min', 'max'],
                                                       'Avg_Suplr_Mdcr_Stdzd_Amt': ['sum', 'mean', 'median', np.std, 'min', 'max']
                                                 })
                                                                                    
dat_dem_agg.head()

In [None]:
merged_d = dat_dem_agg.reset_index().merge(dat_D_agg, how='inner', left_on=['NPI'], right_on=['NPI'])
merged_d = merged_d.merge(dat_B_agg, how='inner', left_on='NPI', right_on='NPI')

In [None]:
df_label_fraud = df_label[df_label['NPI'] != 0]
df_label_fraud = df_label_fraud.set_index('NPI')
print(df_label_fraud.shape)
print(df_label.shape)

In [None]:
df_label_fraud = df_label_fraud[~df_label_fraud.index.duplicated(keep='first')].reset_index('NPI')
pd.Series(df_label_fraud.NPI).is_unique

In [None]:
print(df_label_fraud.shape)

In [None]:
sns.set(rc={'figure.figsize':(12,8)},style='white')

ax=sns.countplot(x='STATE',data=df_label_fraud
              ,order=df_label_fraud.STATE.value_counts().iloc[:10].index)

plt.title('Top-10 States invloved in Healthcare Fraud')
    
plt.show()


In [None]:

df_label_fraud['city'] = df_label_fraud['CITY'] + ', ' + df_label_fraud['STATE']
print(df_label_fraud.city.value_counts())
#sns.set(rc={'figure.figsize':(12,8)},style='white')
ax=sns.countplot(x='city',data=df_label_fraud
             ,order=df_label_fraud.city.value_counts().iloc[:10].index)

plt.title('Top-10 Cities invloved in Healthcare Fraud')
    
plt.show()

In [None]:
df_label_fraud[df_label_fraud['CITY'] == 'MIAMI']

In [None]:
sns.set(rc={'figure.figsize':(12,8)},style='white')

ax=sns.countplot(x='GENERAL',data=df_label_fraud
              ,order=df_label_fraud.GENERAL.value_counts().iloc[:10].index)

plt.title('Top-10 States invloved in Healthcare Fraud')
    
plt.show()

print(df_label_fraud.GENERAL.value_counts())

In [None]:
sns.set(rc={'figure.figsize':(12,8)},style='white')

ax=sns.countplot(x='SPECIALTY',data=df_label_fraud
              ,order=df_label_fraud.SPECIALTY.value_counts().iloc[:10].index)

plt.title('Top-10 States invloved in Healthcare Fraud')
    
plt.show()

print(df_label_fraud.SPECIALTY.value_counts())