In [21]:
#importing libraries
import pandas as pd
import numpy as np
from scipy.stats import f_oneway
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score,f1_score

In [2]:
# reading the dataset
insurance_fraud_df=pd.read_csv('insurance_claims.csv')
insurance_fraud_df.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,...,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported
0,328,48,521585,17-10-2014,OH,250/500,1000,1406.91,0,466132,...,2,YES,71610,6510,13020,52080,Saab,92x,2004,Y
1,228,42,342868,27-06-2006,IN,250/500,2000,1197.22,5000000,468176,...,0,?,5070,780,780,3510,Mercedes,E400,2007,Y
2,134,29,687698,06-09-2000,OH,100/300,2000,1413.14,5000000,430632,...,3,NO,34650,7700,3850,23100,Dodge,RAM,2007,N
3,256,41,227811,25-05-1990,IL,250/500,2000,1415.74,6000000,608117,...,2,NO,63400,6340,6340,50720,Chevrolet,Tahoe,2014,Y
4,228,44,367455,06-06-2014,IL,500/1000,1000,1583.91,6000000,610706,...,1,NO,6500,1300,650,4550,Accura,RSX,2009,N


In [3]:
# checking whether dataset is imbalanced or not
insurance_fraud_df['fraud_reported'].value_counts()

N    753
Y    247
Name: fraud_reported, dtype: int64

In [4]:
#checking for missing values
insurance_fraud_df.isnull().sum()

months_as_customer             0
age                            0
policy_number                  0
policy_bind_date               0
policy_state                   0
policy_csl                     0
policy_deductable              0
policy_annual_premium          0
umbrella_limit                 0
insured_zip                    0
insured_sex                    0
insured_education_level        0
insured_occupation             0
insured_hobbies                0
insured_relationship           0
capital-gains                  0
capital-loss                   0
incident_date                  0
incident_type                  0
collision_type                 0
incident_severity              0
authorities_contacted          0
incident_state                 0
incident_city                  0
incident_location              0
incident_hour_of_the_day       0
number_of_vehicles_involved    0
property_damage                0
bodily_injuries                0
witnesses                      0
police_rep

In [5]:
#replacing special characters with NA values

insurance_fraud_df.replace(r'?',np.nan,inplace=True)
insurance_fraud_df.isnull().sum()

months_as_customer               0
age                              0
policy_number                    0
policy_bind_date                 0
policy_state                     0
policy_csl                       0
policy_deductable                0
policy_annual_premium            0
umbrella_limit                   0
insured_zip                      0
insured_sex                      0
insured_education_level          0
insured_occupation               0
insured_hobbies                  0
insured_relationship             0
capital-gains                    0
capital-loss                     0
incident_date                    0
incident_type                    0
collision_type                 178
incident_severity                0
authorities_contacted            0
incident_state                   0
incident_city                    0
incident_location                0
incident_hour_of_the_day         0
number_of_vehicles_involved      0
property_damage                360
bodily_injuries     

In [6]:
# finding missing values percentage of each column

na_df=pd.DataFrame({'col_name':insurance_fraud_df.columns,'na_count':insurance_fraud_df.isnull().sum(),
                    'na_percentage':insurance_fraud_df.isnull().sum()/insurance_fraud_df.shape[0]*100})

set(na_df['na_count'])
na_df=na_df.sort_values(by='na_percentage',ascending=False)
na_df.head()

Unnamed: 0,col_name,na_count,na_percentage
property_damage,property_damage,360,36.0
police_report_available,police_report_available,343,34.3
collision_type,collision_type,178,17.8
auto_model,auto_model,0,0.0
auto_make,auto_make,0,0.0


In [7]:
#list of unnecessary columns

cols_unique_same=[]
for col in insurance_fraud_df.columns:
    if insurance_fraud_df[col].nunique()==1 or insurance_fraud_df[col].nunique()==insurance_fraud_df.shape[0]:
        cols_unique_same.append(col)
        
cols_unique_same.append('policy_bind_date')
cols_unique_same.append('incident_date')
cols_unique_same

['policy_number', 'incident_location', 'policy_bind_date', 'incident_date']

In [8]:
# dropping unnecessary columns

insurance_fraud_df.drop(columns=cols_unique_same, inplace=True)


In [9]:
# assigning output  variable to an object and further dropping the output column

y=insurance_fraud_df['fraud_reported']

insurance_fraud_df.drop(columns='fraud_reported',inplace=True)



In [10]:
# mapping N to 0 and Y to 1 because in further steps while calculating R2 score Y and N are not accepted
y=y.map({'N':0 ,'Y':1})
set(y)


{0, 1}

In [11]:
#y
# insurance_fraud_df.dtypes

In [12]:
# separating categorical and continuous columns
cat_cols=[col for col in insurance_fraud_df.columns if insurance_fraud_df[col].dtype=='object']
num_cols=[col for col in insurance_fraud_df.columns if insurance_fraud_df[col].dtype=='int64' or insurance_fraud_df[col].dtype=='float64']


In [13]:
#anova

def anova(col):
    categories_list=list(y.value_counts().index)
    res=f_oneway(*(insurance_fraud_df[col][y==cat] for cat in categories_list))
    return res[1]

anova_p_value=list(anova(col) for col in num_cols)
anova_p_value_df=pd.DataFrame({'column_name':num_cols,'p_value':anova_p_value})
anova_p_value_df
list(anova_p_value_df[anova_p_value_df['p_value']<=0.05]['column_name'])

['total_claim_amount', 'injury_claim', 'property_claim', 'vehicle_claim']

In [14]:
# chi2

p_values=[]

for col in cat_cols:
    contingency_table=pd.crosstab(insurance_fraud_df[col],y)
    p_values.append(chi2_contingency(contingency_table)[1])

chi2_df=pd.DataFrame({'col_name':cat_cols,'p_value':p_values})
list(chi2_df[chi2_df['p_value']<=0.05]['col_name'])


['insured_hobbies',
 'incident_type',
 'incident_severity',
 'authorities_contacted',
 'incident_state']

In [15]:
# Train-test-split
X_train,X_test,y_train,y_test=train_test_split(insurance_fraud_df,y,test_size=0.2,random_state=45)

In [16]:
# Filling missing values

for col in num_cols:
    X_train[col]=X_train[col].fillna(X_train[col].mean())
    X_test[col]=X_test[col].fillna(X_test[col].mean())

for col in cat_cols:
    X_train[col]=X_train[col].fillna(X_train[col].mode()[0])
    X_test[col]=X_test[col].fillna(X_test[col].mode()[0])

In [17]:
# scaling continuous columns

std_sclr=StandardScaler()
for col in num_cols:
    X_train[col]= std_sclr.fit_transform(np.array(X_train[col]).reshape(-1,1))
    X_test[col]=std_sclr.transform(np.array(X_test[col]).reshape(-1,1))

In [18]:
# one hot encoding

one_train=pd.get_dummies(X_train[cat_cols])
one_test=pd.get_dummies(X_test[cat_cols])
one_train_final,one_test_final=one_train.align(one_test,join='inner',axis=1,fill_value=0)
X_train_final=pd.concat([X_train[num_cols],one_train_final],axis=1)
X_test_final=pd.concat([X_test[num_cols],one_test_final],axis=1)

In [24]:
#Logistic Regression

log_reg=LogisticRegression()
log_reg.fit(X_train_final,y_train)
pred=log_reg.predict(X_test_final)
pred
print('Train score:',log_reg.score(X_train_final,y_train))
print('Test score:',log_reg.score(X_test_final,y_test))
f1_score(y_test,pred)

Train score: 0.90375
Test score: 0.81


0.5681818181818182

In [52]:
# Decisin Tree Classifier

from sklearn.tree import DecisionTreeClassifier
dcsn_tree_clsfr=DecisionTreeClassifier()

dcsn_tree_clsfr.fit(X_train_final,y_train)

print('Train score:',dcsn_tree_clsfr.score(X_train_final,y_train))
print('Test score:',dcsn_tree_clsfr.score(X_test_final,y_test))

pred_dcn_tree=dcsn_tree_clsfr.predict(X_test_final)
f1_score(y_test,pred_dcn_tree)

Train score: 1.0
Test score: 0.77


0.54