In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier

import warnings
# Suppresses every warning category for the rest of the kernel session, so
# deprecation and convergence warnings raised by later cells are not shown.
warnings.filterwarnings("ignore")

In [2]:
# Read the insurance dataset from the working directory and preview the first rows.
df = pd.read_csv('insurance.csv')
df.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,insuranceclaim
0,19,0,27.9,0,1,3,16884.924,1
1,18,1,33.77,1,0,2,1725.5523,1
2,28,1,33.0,3,0,2,4449.462,0
3,33,1,22.705,0,0,1,21984.47061,0
4,32,1,28.88,0,0,1,3866.8552,1


In [3]:
# (rows, columns) of the frame as loaded, before any cleaning.
df.shape

(1338, 8)

In [4]:
# Count of missing values per column (`isna` is the same check as `isnull`).
df.isna().sum()

age               0
sex               0
bmi               0
children          0
smoker            0
region            0
charges           0
insuranceclaim    0
dtype: int64

In [5]:
# Storage dtype of each column.
df.dtypes

age                 int64
sex                 int64
bmi               float64
children            int64
smoker              int64
region              int64
charges           float64
insuranceclaim      int64
dtype: object

In [6]:
# Number of rows that exactly repeat an earlier row across all columns.
df.duplicated().sum()

1

In [7]:
# Keep the first occurrence of each fully duplicated row and discard the rest.
# The original index labels are retained (no reset), so the index may have gaps.
df = df.drop_duplicates(keep='first')

In [8]:
# Print a frequency table for each discrete column, with a blank line
# after every table so the blocks stay visually separated.
for col in ['sex', 'children', 'smoker', 'region', 'insuranceclaim']:
    print(col, df[col].value_counts(), sep='\n', end='\n\n')

sex
1    675
0    662
Name: sex, dtype: int64

children
0    573
1    324
2    240
3    157
4     25
5     18
Name: children, dtype: int64

smoker
0    1063
1     274
Name: smoker, dtype: int64

region
2    364
3    325
1    324
0    324
Name: region, dtype: int64

insuranceclaim
1    782
0    555
Name: insuranceclaim, dtype: int64



In [9]:
# Target is the claim flag; the predictors are every remaining column.
Y = df['insuranceclaim']
X = df.drop('insuranceclaim', axis=1)

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=1
)

# Show the shapes of the four pieces as a sanity check.
tuple(part.shape for part in (X_train, X_test, Y_train, Y_test))

((935, 7), (402, 7), (935,), (402,))

In [10]:
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
# NOTE: X_train and X_test are rebound to the scaled outputs here, so the
# unscaled splits are no longer available under these names afterwards.
ss = StandardScaler()
ss.fit(X_train)
X_train, X_test = ss.transform(X_train), ss.transform(X_test)

In [11]:
def create_model(model):
    """Fit ``model`` on the training split and print its test-set metrics.

    The estimator is fitted in place on the notebook-level ``X_train`` /
    ``Y_train`` and then scored against ``X_test`` / ``Y_test``; those four
    names must already exist when this function is called.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
        The estimator to train. It is mutated by the call to ``fit``.

    Returns
    -------
    None
        The confusion matrix and the classification report are printed,
        separated by one blank line.
    """
    model.fit(X_train, Y_train)
    predictions = model.predict(X_test)

    # Confusion matrix first, followed by a blank separator line.
    matrix = confusion_matrix(Y_test, predictions)
    print(matrix, end="\n\n")

    print(classification_report(Y_test, predictions))

In [12]:
# Logistic regression with default settings; create_model fits it and prints
# the confusion matrix and classification report for the test split.
lor = LogisticRegression()
create_model(lor)

[[147  25]
 [ 21 209]]

              precision    recall  f1-score   support

           0       0.88      0.85      0.86       172
           1       0.89      0.91      0.90       230

    accuracy                           0.89       402
   macro avg       0.88      0.88      0.88       402
weighted avg       0.89      0.89      0.89       402



In [13]:
# Decision tree with only the random seed fixed (every other hyperparameter is
# left at its default); create_model fits it and prints the test-set metrics.
dtc = DecisionTreeClassifier(random_state=1)
create_model(dtc)

[[168   4]
 [  9 221]]

              precision    recall  f1-score   support

           0       0.95      0.98      0.96       172
           1       0.98      0.96      0.97       230

    accuracy                           0.97       402
   macro avg       0.97      0.97      0.97       402
weighted avg       0.97      0.97      0.97       402



In [14]:
# Rank the input columns by the fitted decision tree's importance scores.
#
# The mapping is passed straight to the DataFrame constructor instead of being
# bound to a variable called `dict`: that name would shadow the builtin `dict`
# for the rest of the kernel session and make any later `dict(...)` call fail.
#
# NOTE: `feature_importances_` holds scikit-learn's impurity-based importances
# for the fitted tree. The 'IG' column label is kept unchanged so the table
# matches earlier outputs.
df_feature = (
    pd.DataFrame({'Columns': X.columns, 'IG': dtc.feature_importances_})
    .sort_values('IG', ascending=False, ignore_index=True)
)
df_feature

Unnamed: 0,Columns,IG
0,children,0.37755
1,bmi,0.330158
2,smoker,0.18086
3,age,0.082568
4,charges,0.028864
5,sex,0.0
6,region,0.0
