In [2]:
import pandas as pd 
import numpy as np 
import plotly.io as pio
import plotly.express as px
pd.options.plotting.backend = "plotly"

In [5]:
# Load the main table analysed in this notebook.
df = pd.read_csv(filepath_or_buffer='~/Desktop/final.csv')

In [6]:
# Recode the numeric lof labels (0, -1, 1) as the strings '0', '-1', '1';
# later cells select rows and pivot columns by these string labels.
# The result is assigned back to the column on purpose: calling
# replace(..., inplace=True) on df['lof'] operates on a temporary Series,
# which newer pandas versions warn about and which leaves df unchanged
# when copy-on-write is enabled.
df['lof'] = df['lof'].replace({0: '0', -1: '-1', 1: '1'})

In [7]:
# Split df by lof label: '1' -> real, '0' -> potential, '-1' -> non.
real, potential, non = (
    df[df['lof'] == label] for label in ('1', '0', '-1')
)

In [8]:
# Split df on the Is_inpatient flag (1 -> inpatient, 0 -> outpatient).
# Both frames are snapshots of df as it is right now: columns added to df
# in later cells are not added to these subsets.
inpatient, outpatient = (
    df[df['Is_inpatient'] == flag] for flag in (1, 0)
)

In [63]:
# Print df's index range, column names, non-null counts and dtypes
# as a quick check of the loaded schema.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 558211 entries, 0 to 558210
Data columns (total 27 columns):
 #   Column                            Non-Null Count   Dtype 
---  ------                            --------------   ----- 
 0   Unnamed: 0                        558211 non-null  int64 
 1   AmtReimbursed                     558211 non-null  int64 
 2   DeductibleAmt                     558211 non-null  int64 
 3   Is_inpatient                      558211 non-null  int64 
 4   Gender                            558211 non-null  int64 
 5   Race                              558211 non-null  int64 
 6   RenalDisease                      558211 non-null  int64 
 7   State                             558211 non-null  int64 
 8   County                            558211 non-null  int64 
 9   InpatientAnnualReimbursementAmt   558211 non-null  int64 
 10  InpatientAnnualDeductibleAmt      558211 non-null  int64 
 11  OutpatientAnnualReimbursementAmt  558211 non-null  int64 
 12  Ou

## Analysis on Reimbursed Amount

This analysis is performed after the XGBoost model.

In [9]:
# One boolean mask per reimbursement bucket, lowest to highest:
# <=300, (300, 1000], (1000, 8000], (8000, 20000], >20000.
amount = [
    df['AmtReimbursed'].le(300),
    df['AmtReimbursed'].gt(300) & df['AmtReimbursed'].le(1000),
    df['AmtReimbursed'].gt(1000) & df['AmtReimbursed'].le(8000),
    df['AmtReimbursed'].gt(8000) & df['AmtReimbursed'].le(20000),
    df['AmtReimbursed'].gt(20000),
]

In [10]:
# Display labels for the reimbursement buckets, listed in the same order
# as the masks in `amount` (np.select pairs them up by position).
level = [
    'under 300',
    '300-1k',
    '1k-8k',
    '8k-20k',
    '20k+',
]

In [11]:
# Label every row with the bucket whose mask in `amount` matches it.
# np.select uses `default` for rows that match no mask; the implicit default
# is the integer 0, which recent NumPy releases refuse to combine with
# string choices (TypeError), so an explicit string default is supplied.
df['Reimbursed_Level'] = np.select(amount, level, default='unknown')

In [23]:
# Row counts per (Reimbursed_Level, lof), one column per lof label.
# Only AmtReimbursed is counted, since that is the only column the pivot used.
reim = (
    df.groupby(['Reimbursed_Level', 'lof'])['AmtReimbursed']
    .count()
    .unstack('lof')
    # Order the rows by bucket label, following `level`, instead of by
    # position: positional reordering silently mis-orders the rows (and
    # inserts an all-NaN row) whenever one of the buckets has no rows.
    .reindex(pd.Index(level, name='Reimbursed_Level'))
    .reset_index()
    .fillna(0)
)
# Share of each lof label within a bucket, in percent.
reim_total = reim['0'] + reim['1'] + reim['-1']
reim['Fraud'] = 100 * reim['1'] / reim_total
reim['Potential_Fraud'] = 100 * reim['0'] / reim_total
reim['non_Fraud'] = 100 * reim['-1'] / reim_total

In [87]:
# Bar chart of the three percentage columns of `reim`,
# one x position per Reimbursed_Level.
fig1 = px.bar(
    reim,
    x='Reimbursed_Level',
    y=['Fraud', 'Potential_Fraud', 'non_Fraud'],
    title="Overall Reimbursed Level",
    labels={"value": "percentage"},
    color_discrete_map={
        'non_Fraud': '#635EFA',
        'Potential_Fraud': '#FF8C00',
        'Fraud': '#EF553B',
    },
    width=600,
    height=500,
)
fig1.show()

In [25]:
# Row counts per (Reimbursed_Level, lof) for inpatient rows only.
# The rows are filtered from df here rather than taken from `inpatient`:
# that subset was created before `Reimbursed_Level` was added to df, so it
# is not guaranteed to carry the column.
in_reim = (
    df[df['Is_inpatient'] == 1]
    .groupby(['Reimbursed_Level', 'lof'])['AmtReimbursed']
    .count()
    .unstack('lof')
    # Order the rows by bucket label, following `level`, instead of by
    # position: positional reordering silently mis-orders the rows (and
    # inserts an all-NaN row) whenever one of the buckets has no rows.
    .reindex(pd.Index(level, name='Reimbursed_Level'))
    .reset_index()
    .fillna(0)
)
# Share of each lof label within a bucket, in percent.
in_reim_total = in_reim['0'] + in_reim['1'] + in_reim['-1']
in_reim['Fraud'] = 100 * in_reim['1'] / in_reim_total
in_reim['Potential_Fraud'] = 100 * in_reim['0'] / in_reim_total
in_reim['non_Fraud'] = 100 * in_reim['-1'] / in_reim_total

In [88]:
# Bar chart of the three percentage columns of `in_reim`,
# one x position per Reimbursed_Level.
fig2 = px.bar(
    in_reim,
    x='Reimbursed_Level',
    y=['Fraud', 'Potential_Fraud', 'non_Fraud'],
    title="Inpatient Reimbursed Level",
    labels={"value": "percentage"},
    color_discrete_map={
        'non_Fraud': '#635EFA',
        'Potential_Fraud': '#FF8C00',
        'Fraud': '#EF553B',
    },
    width=600,
    height=500,
)
fig2.show()

For inpatient claims, lower reimbursement levels show a higher probability of fraud.

In [47]:
# Row counts per (Reimbursed_Level, lof) for outpatient rows only.
# The rows are filtered from df here rather than taken from `outpatient`:
# that subset was created before `Reimbursed_Level` was added to df, so it
# is not guaranteed to carry the column.
out_reim = (
    df[df['Is_inpatient'] == 0]
    .groupby(['Reimbursed_Level', 'lof'])['AmtReimbursed']
    .count()
    .unstack('lof')
    # Order the rows by bucket label, following `level`, instead of by
    # position: positional reordering silently mis-orders the rows (and
    # inserts an all-NaN row) whenever one of the buckets has no rows.
    .reindex(pd.Index(level, name='Reimbursed_Level'))
    .reset_index()
    .fillna(0)
)
# Share of each lof label within a bucket, in percent.
out_reim_total = out_reim['0'] + out_reim['1'] + out_reim['-1']
out_reim['Fraud'] = 100 * out_reim['1'] / out_reim_total
out_reim['Potential_Fraud'] = 100 * out_reim['0'] / out_reim_total
out_reim['non_Fraud'] = 100 * out_reim['-1'] / out_reim_total

In [89]:
# Bar chart of the three percentage columns of `out_reim`,
# one x position per Reimbursed_Level.
fig3 = px.bar(
    out_reim,
    x='Reimbursed_Level',
    y=['Fraud', 'Potential_Fraud', 'non_Fraud'],
    title="Outpatient Reimbursed Level",
    labels={"value": "percentage"},
    color_discrete_map={
        'non_Fraud': '#635EFA',
        'Potential_Fraud': '#FF8C00',
        'Fraud': '#EF553B',
    },
    width=600,
    height=500,
)
fig3.show()

## Analysis on the number of physicians in charge of a claim

### No obvious pattern

In [29]:
# Row counts per (PhysiciansNum, lof), one column per lof label;
# combinations that never occur are filled with 0.
physNum = (
    df.groupby(['PhysiciansNum', 'lof'])['Is_inpatient']
    .count()
    .unstack('lof')
    .reset_index()
    .fillna(0)
)
# Share of each lof label within a PhysiciansNum group, in percent.
physNum_total = physNum['0'] + physNum['1'] + physNum['-1']
physNum['Fraud'] = 100 * physNum['1'] / physNum_total
physNum['Potential_Fraud'] = 100 * physNum['0'] / physNum_total
physNum['non_Fraud'] = 100 * physNum['-1'] / physNum_total

In [90]:
# Bar chart of the three percentage columns of `physNum`,
# one x position per PhysiciansNum value.
fig4 = px.bar(
    physNum,
    x='PhysiciansNum',
    y=['Fraud', 'Potential_Fraud', 'non_Fraud'],
    title="Overall Number of Physicians",
    labels={"value": "percentage"},
    color_discrete_map={
        'non_Fraud': '#635EFA',
        'Potential_Fraud': '#FF8C00',
        'Fraud': '#EF553B',
    },
    width=600,
    height=500,
)
# Treat the x values as discrete categories rather than a numeric axis.
fig4.update_xaxes(type='category')
fig4.show()

In [33]:
# Row counts per (PhysiciansNum, lof) within `inpatient`, one column per
# lof label; combinations that never occur are filled with 0.
in_physNum = (
    inpatient.groupby(['PhysiciansNum', 'lof'])['Is_inpatient']
    .count()
    .unstack('lof')
    .reset_index()
    .fillna(0)
)
# Share of each lof label within a PhysiciansNum group, in percent.
in_physNum_total = in_physNum['0'] + in_physNum['1'] + in_physNum['-1']
in_physNum['Fraud'] = 100 * in_physNum['1'] / in_physNum_total
in_physNum['Potential_Fraud'] = 100 * in_physNum['0'] / in_physNum_total
in_physNum['non_Fraud'] = 100 * in_physNum['-1'] / in_physNum_total

In [91]:
# Bar chart of the three percentage columns of `in_physNum`,
# one x position per PhysiciansNum value.
fig5 = px.bar(
    in_physNum,
    x='PhysiciansNum',
    y=['Fraud', 'Potential_Fraud', 'non_Fraud'],
    title="Inpatient Number of Physicians",
    labels={"value": "percentage"},
    color_discrete_map={
        'non_Fraud': '#635EFA',
        'Potential_Fraud': '#FF8C00',
        'Fraud': '#EF553B',
    },
    width=600,
    height=500,
)
# Treat the x values as discrete categories rather than a numeric axis.
fig5.update_xaxes(type='category')
fig5.show()

In [35]:
# Row counts per (PhysiciansNum, lof) within `outpatient`, one column per
# lof label; combinations that never occur are filled with 0.
out_physNum = (
    outpatient.groupby(['PhysiciansNum', 'lof'])['Is_inpatient']
    .count()
    .unstack('lof')
    .reset_index()
    .fillna(0)
)
# Share of each lof label within a PhysiciansNum group, in percent.
out_physNum_total = out_physNum['0'] + out_physNum['1'] + out_physNum['-1']
out_physNum['Fraud'] = 100 * out_physNum['1'] / out_physNum_total
out_physNum['Potential_Fraud'] = 100 * out_physNum['0'] / out_physNum_total
out_physNum['non_Fraud'] = 100 * out_physNum['-1'] / out_physNum_total

In [92]:
# Bar chart of the three percentage columns of `out_physNum`,
# one x position per PhysiciansNum value.
fig6 = px.bar(
    out_physNum,
    x='PhysiciansNum',
    y=['Fraud', 'Potential_Fraud', 'non_Fraud'],
    title="Outpatient Number of Physicians",
    labels={"value": "percentage"},
    color_discrete_map={
        'non_Fraud': '#635EFA',
        'Potential_Fraud': '#FF8C00',
        'Fraud': '#EF553B',
    },
    width=600,
    height=500,
)
# Treat the x values as discrete categories rather than a numeric axis.
fig6.update_xaxes(type='category')
fig6.show()

## Analysis on number of procedure codes a claim has

Weird outpatient plot

In [37]:
# Row counts per (ProcedureCode_Num, lof), one column per lof label;
# combinations that never occur are filled with 0.
pro = (
    df.groupby(['ProcedureCode_Num', 'lof'])['Is_inpatient']
    .count()
    .unstack('lof')
    .reset_index()
    .fillna(0)
)
# Share of each lof label within a ProcedureCode_Num group, in percent.
pro_total = pro['0'] + pro['1'] + pro['-1']
pro['Fraud'] = 100 * pro['1'] / pro_total
pro['Potential_Fraud'] = 100 * pro['0'] / pro_total
pro['non_Fraud'] = 100 * pro['-1'] / pro_total

In [93]:
# Bar chart of the three percentage columns of `pro`,
# one x position per ProcedureCode_Num value.
fig7 = px.bar(
    pro,
    x='ProcedureCode_Num',
    y=['Fraud', 'Potential_Fraud', 'non_Fraud'],
    title="Overall Number of ProcedureCode",
    labels={"value": "percentage"},
    color_discrete_map={
        'non_Fraud': '#635EFA',
        'Potential_Fraud': '#FF8C00',
        'Fraud': '#EF553B',
    },
    width=600,
    height=500,
)
fig7.show()

In [39]:
# Row counts per (ProcedureCode_Num, lof) within `inpatient`, one column per
# lof label; combinations that never occur are filled with 0.
in_pro = (
    inpatient.groupby(['ProcedureCode_Num', 'lof'])['Is_inpatient']
    .count()
    .unstack('lof')
    .reset_index()
    .fillna(0)
)
# Share of each lof label within a ProcedureCode_Num group, in percent.
in_pro_total = in_pro['0'] + in_pro['1'] + in_pro['-1']
in_pro['Fraud'] = 100 * in_pro['1'] / in_pro_total
in_pro['Potential_Fraud'] = 100 * in_pro['0'] / in_pro_total
in_pro['non_Fraud'] = 100 * in_pro['-1'] / in_pro_total

In [94]:
# Bar chart of the three percentage columns of `in_pro`,
# one x position per ProcedureCode_Num value.
fig8 = px.bar(
    in_pro,
    x='ProcedureCode_Num',
    y=['Fraud', 'Potential_Fraud', 'non_Fraud'],
    title="Inpatient Number of ProcedureCode",
    labels={"value": "percentage"},
    color_discrete_map={
        'non_Fraud': '#635EFA',
        'Potential_Fraud': '#FF8C00',
        'Fraud': '#EF553B',
    },
    width=600,
    height=500,
)
fig8.show()

In [41]:
# Row counts per (ProcedureCode_Num, lof) within `outpatient`, one column per
# lof label; combinations that never occur are filled with 0.
out_pro = (
    outpatient.groupby(['ProcedureCode_Num', 'lof'])['Is_inpatient']
    .count()
    .unstack('lof')
    .reset_index()
    .fillna(0)
)
# Share of each lof label within a ProcedureCode_Num group, in percent.
out_pro_total = out_pro['0'] + out_pro['1'] + out_pro['-1']
out_pro['Fraud'] = 100 * out_pro['1'] / out_pro_total
out_pro['Potential_Fraud'] = 100 * out_pro['0'] / out_pro_total
out_pro['non_Fraud'] = 100 * out_pro['-1'] / out_pro_total

In [95]:
# Bar chart of the three percentage columns of `out_pro`,
# one x position per ProcedureCode_Num value.
fig9 = px.bar(
    out_pro,
    x='ProcedureCode_Num',
    y=['Fraud', 'Potential_Fraud', 'non_Fraud'],
    title="Outpatient Number of ProcedureCode",
    labels={"value": "percentage"},
    color_discrete_map={
        'non_Fraud': '#635EFA',
        'Potential_Fraud': '#FF8C00',
        'Fraud': '#EF553B',
    },
    width=600,
    height=500,
)
fig9.show()

In [None]:
# Claims with more than 3 procedure codes are less likely to be fraud.

# OutpatientAnnualDeductibleAmt

In [64]:
# Row counts per (OutpatientAnnualDeductibleAmt, lof), one column per
# lof label; combinations that never occur are filled with 0.
annual_deduc = (
    df.groupby(['OutpatientAnnualDeductibleAmt', 'lof'])['Is_inpatient']
    .count()
    .unstack('lof')
    .reset_index()
    .fillna(0)
)
# Share of each lof label within a deductible-amount group, in percent.
annual_deduc_total = annual_deduc['0'] + annual_deduc['1'] + annual_deduc['-1']
annual_deduc['Fraud'] = 100 * annual_deduc['1'] / annual_deduc_total
annual_deduc['Potential_Fraud'] = 100 * annual_deduc['0'] / annual_deduc_total
annual_deduc['non_Fraud'] = 100 * annual_deduc['-1'] / annual_deduc_total

In [96]:
# Bar chart of the three percentage columns of `annual_deduc`,
# one x position per OutpatientAnnualDeductibleAmt value.
fig10 = px.bar(
    annual_deduc,
    x='OutpatientAnnualDeductibleAmt',
    y=['Fraud', 'Potential_Fraud', 'non_Fraud'],
    title="Overall OutpatientAnnualDeductibleAmt",
    labels={"value": "percentage"},
    color_discrete_map={
        'non_Fraud': '#635EFA',
        'Potential_Fraud': '#FF8C00',
        'Fraud': '#EF553B',
    },
    width=600,
    height=500,
)
# Treat the x values as discrete categories rather than a numeric axis.
fig10.update_xaxes(type='category')
fig10.show()

In [68]:
# Row counts per (OutpatientAnnualDeductibleAmt, lof) within `inpatient`,
# one column per lof label; combinations that never occur are filled with 0.
in_annual_deduc = (
    inpatient.groupby(['OutpatientAnnualDeductibleAmt', 'lof'])['Is_inpatient']
    .count()
    .unstack('lof')
    .reset_index()
    .fillna(0)
)
# Share of each lof label within a deductible-amount group, in percent.
in_annual_deduc_total = (
    in_annual_deduc['0'] + in_annual_deduc['1'] + in_annual_deduc['-1']
)
in_annual_deduc['Fraud'] = 100 * in_annual_deduc['1'] / in_annual_deduc_total
in_annual_deduc['Potential_Fraud'] = 100 * in_annual_deduc['0'] / in_annual_deduc_total
in_annual_deduc['non_Fraud'] = 100 * in_annual_deduc['-1'] / in_annual_deduc_total

In [97]:
# Bar chart of the three percentage columns of `in_annual_deduc`,
# one x position per OutpatientAnnualDeductibleAmt value.
fig11 = px.bar(
    in_annual_deduc,
    x='OutpatientAnnualDeductibleAmt',
    y=['Fraud', 'Potential_Fraud', 'non_Fraud'],
    title="Inpatient OutpatientAnnualDeductibleAmt",
    labels={"value": "percentage"},
    color_discrete_map={
        'non_Fraud': '#635EFA',
        'Potential_Fraud': '#FF8C00',
        'Fraud': '#EF553B',
    },
    width=600,
    height=500,
)
# Treat the x values as discrete categories rather than a numeric axis.
fig11.update_xaxes(type='category')
fig11.show()

In [70]:
# Row counts per (OutpatientAnnualDeductibleAmt, lof) within `outpatient`,
# one column per lof label; combinations that never occur are filled with 0.
out_annual_deduc = (
    outpatient.groupby(['OutpatientAnnualDeductibleAmt', 'lof'])['Is_inpatient']
    .count()
    .unstack('lof')
    .reset_index()
    .fillna(0)
)
# Share of each lof label within a deductible-amount group, in percent.
out_annual_deduc_total = (
    out_annual_deduc['0'] + out_annual_deduc['1'] + out_annual_deduc['-1']
)
out_annual_deduc['Fraud'] = 100 * out_annual_deduc['1'] / out_annual_deduc_total
out_annual_deduc['Potential_Fraud'] = 100 * out_annual_deduc['0'] / out_annual_deduc_total
out_annual_deduc['non_Fraud'] = 100 * out_annual_deduc['-1'] / out_annual_deduc_total

In [101]:
# Bar chart of the three percentage columns of `out_annual_deduc`,
# one x position per OutpatientAnnualDeductibleAmt value.
fig12 = px.bar(
    out_annual_deduc,
    x='OutpatientAnnualDeductibleAmt',
    y=['Fraud', 'Potential_Fraud', 'non_Fraud'],
    title="Outpatient OutpatientAnnualDeductibleAmt",
    labels={"value": "percentage"},
    color_discrete_map={
        'non_Fraud': '#635EFA',
        'Potential_Fraud': '#FF8C00',
        'Fraud': '#EF553B',
    },
    width=600,
    height=500,
)
# Treat the x values as discrete categories rather than a numeric axis.
fig12.update_xaxes(type='category')
fig12.show()

## Analysis on number of claims submitted by physician

In [60]:
# Load the pre-aggregated per-group summary (Group, claim_count, Fraud,
# non_Fraud, ...) and display it as the cell output.
full_claims = pd.read_csv(filepath_or_buffer='~/Desktop/full_claims.csv')
full_claims

Unnamed: 0,Group,AttendingPhysician,claim_count,fraud_phys,non_Fraud,Fraud
0,less than 20 claims,77452,210,18862,75.646852,24.353148
1,between 20 and 50,2769,1065,904,67.352835,32.647165
2,between 50 and 200,1630,16561,710,56.441718,43.558282
3,between 200 and 500,155,20769,73,52.903226,47.096774
4,more than 500 claims,57,34709,43,24.561404,75.438596


In [99]:
# Bar chart of the Fraud / non_Fraud percentages for each claim-count group.
# A title is set so the figure stands alone, and 'Fraud' uses the same red as
# in the other figures of this notebook (orange is used there for
# 'Potential_Fraud').
px.bar(
    full_claims,
    x="Group",
    y=["Fraud", "non_Fraud"],
    title="Claims Submitted per Physician",
    labels={"value": "percentage"},
    color_discrete_map={'non_Fraud': '#635EFA', 'Fraud': '#EF553B'},
    width=600,
    height=500,
)

In [75]:
# Load the per-state table; the next cell reads its '0', '1' and
# 'state_unique' columns.
state_phys = pd.read_csv(filepath_or_buffer='~/Desktop/state_phys.csv')

In [100]:
# Percentage split between the '1' and '0' columns for each row of
# state_phys; the shared denominator is computed once.
state_phys_total = state_phys['0'] + state_phys['1']
state_phys['Fraud'] = 100 * state_phys['1'] / state_phys_total
state_phys['non_Fraud'] = 100 * state_phys['0'] / state_phys_total
# Bar chart of both percentages per state_unique value. A title is set so the
# figure stands alone, and 'Fraud' uses the same red as in the other figures
# of this notebook (orange is used there for 'Potential_Fraud').
px.bar(
    state_phys,
    x='state_unique',
    y=['Fraud', 'non_Fraud'],
    title="Fraud vs. non-Fraud by State",
    labels={"value": "percentage"},
    color_discrete_map={'non_Fraud': '#635EFA', 'Fraud': '#EF553B'},
)