In [1]:
import pandas as pd
import numpy as np

In [3]:
raw_data = pd.read_csv('Insurance.csv') # reading a csv file
raw_data.head()

Unnamed: 0,id,perc_premium_paid_by_cash_credit,age_in_days,Income,Count_3-6_months_late,Count_6-12_months_late,Count_more_than_12_months_late,application_underwriting_score,no_of_premiums_paid,sourcing_channel,residence_area_type,target
0,110936,0.429,12058,355060,0.0,0.0,0.0,99.02,13,C,Urban,1
1,41492,0.01,21546,315150,0.0,0.0,0.0,99.89,21,A,Urban,1
2,31300,0.917,17531,84140,2.0,3.0,1.0,98.69,7,C,Rural,0
3,19415,0.049,15341,250510,0.0,0.0,0.0,99.57,9,A,Urban,1
4,99379,0.052,31400,198680,0.0,0.0,0.0,99.87,12,B,Urban,1


In [4]:
train_data = raw_data.copy(deep=True) 

# Droping id as it is not usefull in prediction
train_data.drop(['id'] , axis = 1 , inplace=True)
train_data.head()

Unnamed: 0,perc_premium_paid_by_cash_credit,age_in_days,Income,Count_3-6_months_late,Count_6-12_months_late,Count_more_than_12_months_late,application_underwriting_score,no_of_premiums_paid,sourcing_channel,residence_area_type,target
0,0.429,12058,355060,0.0,0.0,0.0,99.02,13,C,Urban,1
1,0.01,21546,315150,0.0,0.0,0.0,99.89,21,A,Urban,1
2,0.917,17531,84140,2.0,3.0,1.0,98.69,7,C,Rural,0
3,0.049,15341,250510,0.0,0.0,0.0,99.57,9,A,Urban,1
4,0.052,31400,198680,0.0,0.0,0.0,99.87,12,B,Urban,1


In [6]:
new_features = ['percent_paid' , 'days' , 'income' , '3-6_months' , '6-12_months' , 'more_than_12_months' , 
                'writing_score' , 'paid_premiums' , 'channel' , 'area' , 'target']

In [7]:
train_data.columns = new_features

In [8]:
from sklearn.impute import KNNImputer

# Creating a function to compute missing values
def compute_missing_values(dataset):
    numeric_data = dataset._get_numeric_data()
    # Chice number of neighbours as 2
    imputer = KNNImputer(n_neighbors=2)
    # Fit the model on our dataset
    imputer.fit(numeric_data)
    # Transform the dataset
    data_imputed = imputer.fit_transform(numeric_data)
    data_no = pd.DataFrame(data_imputed , columns = numeric_data.columns.values)
    return pd.concat([data_no, dataset.select_dtypes(include='object')], axis=1)

train_no_missing = compute_missing_values(train_data)
train_no_missing.head()

Unnamed: 0,percent_paid,days,income,3-6_months,6-12_months,more_than_12_months,writing_score,paid_premiums,target,channel,area
0,0.429,12058.0,355060.0,0.0,0.0,0.0,99.02,13.0,1.0,C,Urban
1,0.01,21546.0,315150.0,0.0,0.0,0.0,99.89,21.0,1.0,A,Urban
2,0.917,17531.0,84140.0,2.0,3.0,1.0,98.69,7.0,0.0,C,Rural
3,0.049,15341.0,250510.0,0.0,0.0,0.0,99.57,9.0,1.0,A,Urban
4,0.052,31400.0,198680.0,0.0,0.0,0.0,99.87,12.0,1.0,B,Urban


In [9]:
train_no_missing.isna().sum()

percent_paid           0
days                   0
income                 0
3-6_months             0
6-12_months            0
more_than_12_months    0
writing_score          0
paid_premiums          0
target                 0
channel                0
area                   0
dtype: int64

In [12]:
dataset = train_no_missing.copy()
dataset.loc[ dataset['income'] <= 23603.99, 'income'] = 0
dataset.loc[(dataset['income'] > 23603.99) & (dataset['income'] <= 109232.0), 'income'] = 1
dataset.loc[(dataset['income'] > 109232.0) & (dataset['income'] <= 194434.0), 'income'] = 2
dataset.loc[(dataset['income'] > 194434.0) & (dataset['income'] <= 279636.0), 'income'] = 3
dataset.loc[(dataset['income'] > 279636.0) & (dataset['income'] <= 364838.0), 'income'] = 4
dataset.loc[(dataset['income'] > 364838.0) & (dataset['income'] <= 450040.0), 'income'] = 5
dataset.loc[ dataset['income'] > 450040.0, 'income'] = 6

In [13]:
dataset['area'] = dataset['area'].map( {'Urban' : 1, 'Rural' : 0} )
dataset['channel'] = dataset['channel'].map( {'A' : 0, 'B' : 1, 'C' : 2, 'D' : 3, 'E' : 4} )


In [14]:
train_modified = dataset.copy()  

In [16]:
print(train_modified.area.unique())
print(train_modified.channel.unique())

[1 0]
[2 0 1 3 4]


In [17]:
X = train_modified.drop(['target'] , axis = 1)
y = train_modified['target']

In [20]:
X.set_index('percent_paid').to_csv('scale.csv')

In [21]:
# We will standardize the data using Minmaxscaler
from sklearn.preprocessing import MinMaxScaler

# Creating Object of Minmax scaler
scaler = MinMaxScaler()

scaler.fit(X)

MinMaxScaler()

In [22]:
X = scaler.fit_transform(X)

In [23]:
from sklearn.model_selection import train_test_split
# let us now split the dataset into train & test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=10)

# print the shape of 'x_train'
print("X_train ",X_train.shape)

# print the shape of 'x_test'
print("X_test ",X_test.shape)

# print the shape of 'y_train'
print("y_train ",y_train.shape)

# print the shape of 'y_test'
print("y_test ",y_test.shape)

X_train  (55897, 10)
X_test  (23956, 10)
y_train  (55897,)
y_test  (23956,)


In [24]:
# Build Model

# Bagging Classifier
from sklearn.ensemble import BaggingClassifier
from sklearn import tree

# build the model
meta_estimator = BaggingClassifier(tree.DecisionTreeClassifier(random_state=10))

# fit the model
meta_estimator.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(random_state=10))

In [25]:
y_pred = meta_estimator.predict(X_test)

In [26]:
from sklearn import metrics

# create a list of column names
cols = ['Model', 'AUC Score', 'Precision Score', 'Recall Score','Accuracy Score','f1-score']

# creating an empty dataframe of the colums
result_tabulation = pd.DataFrame(columns = cols)

# compiling the required information
Bagging_Meta_estimator = pd.Series({'Model': "Bagging Meta-estimator",
                     'AUC Score' : metrics.roc_auc_score(y_test, y_pred),
                 'Precision Score': metrics.precision_score(y_test, y_pred),
                 'Recall Score': metrics.recall_score(y_test, y_pred),
                 'Accuracy Score': metrics.accuracy_score(y_test, y_pred),
                  'f1-score':metrics.f1_score(y_test, y_pred)})



# appending our result table
result_tabulation = result_tabulation.append(Bagging_Meta_estimator , ignore_index = True)

# view the result table
result_tabulation

Unnamed: 0,Model,AUC Score,Precision Score,Recall Score,Accuracy Score,f1-score
0,Bagging Meta-estimator,0.590757,0.948397,0.978848,0.930247,0.963382


In [27]:
# Adaboost
from sklearn.ensemble import AdaBoostClassifier

# build the model
adaboost = AdaBoostClassifier(random_state=10)
# fit the model
adaboost.fit(X_train, y_train)

AdaBoostClassifier(random_state=10)

In [28]:
# predict the values
y_pred_adaboost  = adaboost.predict(X_test)

In [29]:
adaboost_metrics = pd.Series({'Model': "AdaBoost",
                     'AUC Score' : metrics.roc_auc_score(y_test, y_pred_adaboost),
                 'Precision Score': metrics.precision_score(y_test, y_pred_adaboost),
                 'Recall Score': metrics.recall_score(y_test, y_pred_adaboost),
                 'Accuracy Score': metrics.accuracy_score(y_test, y_pred_adaboost),
                  'f1-score':metrics.f1_score(y_test, y_pred_adaboost)})



# appending our result table
result_tabulation = result_tabulation.append(adaboost_metrics , ignore_index = True)

# view the result table
result_tabulation

Unnamed: 0,Model,AUC Score,Precision Score,Recall Score,Accuracy Score,f1-score
0,Bagging Meta-estimator,0.590757,0.948397,0.978848,0.930247,0.963382
1,AdaBoost,0.570103,0.945753,0.992207,0.939347,0.968423


In [30]:
# XGboost
#import xgboost classifier
from xgboost.sklearn import XGBClassifier

# build the model
xgbm = XGBClassifier(random_state=1,learning_rate=0.01)
# fit the model
xgbm.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [31]:
# predict the values
y_pred_xgbm  = xgbm.predict(X_test)


In [32]:
# compiling the required information
xgbm_metrices = pd.Series({'Model': "XGBM",
                     'AUC Score' : metrics.roc_auc_score(y_test, y_pred_xgbm),
                 'Precision Score': metrics.precision_score(y_test, y_pred_xgbm),
                 'Recall Score': metrics.recall_score(y_test, y_pred_xgbm),
                 'Accuracy Score': metrics.accuracy_score(y_test, y_pred_xgbm),

                  'f1-score':metrics.f1_score(y_test, y_pred_xgbm)})



# appending our result table
result_tabulation = result_tabulation.append(xgbm_metrices , ignore_index = True)

# view the result table
result_tabulation

Unnamed: 0,Model,AUC Score,Precision Score,Recall Score,Accuracy Score,f1-score
0,Bagging Meta-estimator,0.590757,0.948397,0.978848,0.930247,0.963382
1,AdaBoost,0.570103,0.945753,0.992207,0.939347,0.968423
2,XGBM,0.552262,0.943591,0.995191,0.939723,0.968704


In [33]:
# LOgistic Regression
from sklearn.linear_model import LogisticRegression

# Create logistic regression
logistic = LogisticRegression()

logistic.fit(X_train,y_train)


LogisticRegression()

In [34]:
# predict the values
y_pred = logistic.predict(X_test)

In [35]:
# compiling the required information
logisitc = pd.Series({'Model': "Logistic Regression",
                     'AUC Score' : metrics.roc_auc_score(y_test, y_pred),
                 'Precision Score': metrics.precision_score(y_test, y_pred),
                 'Recall Score': metrics.recall_score(y_test, y_pred),
                 'Accuracy Score': metrics.accuracy_score(y_test, y_pred),
                  'f1-score':metrics.f1_score(y_test, y_pred)})



# appending our result table
result_tabulation = result_tabulation.append(logisitc , ignore_index = True)

# view the result table
result_tabulation


Unnamed: 0,Model,AUC Score,Precision Score,Recall Score,Accuracy Score,f1-score
0,Bagging Meta-estimator,0.590757,0.948397,0.978848,0.930247,0.963382
1,AdaBoost,0.570103,0.945753,0.992207,0.939347,0.968423
2,XGBM,0.552262,0.943591,0.995191,0.939723,0.968704
3,Logistic Regression,0.546395,0.942892,0.99479,0.938638,0.968146


In [36]:
from sklearn.naive_bayes import GaussianNB

# build the model
GNB = GaussianNB()

# fit the model
GNB.fit(X_train, y_train)

GaussianNB()

In [37]:
# predict the values
y_pred_GNB  = GNB.predict(X_test)

In [38]:
# compiling the required information
GNB_metrices = pd.Series({'Model': "Naive Bayes",
                     'AUC Score' : metrics.roc_auc_score(y_test, y_pred_GNB),
                 'Precision Score': metrics.precision_score(y_test, y_pred_GNB),
                 'Recall Score': metrics.recall_score(y_test, y_pred_GNB),
                 'Accuracy Score': metrics.accuracy_score(y_test, y_pred_GNB),

                  'f1-score':metrics.f1_score(y_test, y_pred_GNB)})



# appending our result table
result_tabulation = result_tabulation.append(GNB_metrices , ignore_index = True)

# view the result table

result_tabulation


Unnamed: 0,Model,AUC Score,Precision Score,Recall Score,Accuracy Score,f1-score
0,Bagging Meta-estimator,0.590757,0.948397,0.978848,0.930247,0.963382
1,AdaBoost,0.570103,0.945753,0.992207,0.939347,0.968423
2,XGBM,0.552262,0.943591,0.995191,0.939723,0.968704
3,Logistic Regression,0.546395,0.942892,0.99479,0.938638,0.968146
4,Naive Bayes,0.683081,0.960647,0.946829,0.9138,0.953688


In [39]:
# KNN
from sklearn.neighbors import KNeighborsClassifier
#create new a knn model
knn = KNeighborsClassifier(n_neighbors=5)

#fit model to data
knn.fit(X_train, y_train)



KNeighborsClassifier()

In [40]:
# predict the values
y_pred_knn  = knn.predict(X_test)


In [41]:
# compiling the required information
knn_metrics = pd.Series({'Model': "KNN",
                     'AUC Score' : metrics.roc_auc_score(y_test, y_pred_knn),
                 'Precision Score': metrics.precision_score(y_test, y_pred_knn),
                 'Recall Score': metrics.recall_score(y_test, y_pred_knn),
                 'Accuracy Score': metrics.accuracy_score(y_test, y_pred_knn),
                  'f1-score':metrics.f1_score(y_test, y_pred_knn)})



# appending our result table
result_tabulation = result_tabulation.append(knn_metrics , ignore_index = True)

# view the result table
result_tabulation

Unnamed: 0,Model,AUC Score,Precision Score,Recall Score,Accuracy Score,f1-score
0,Bagging Meta-estimator,0.590757,0.948397,0.978848,0.930247,0.963382
1,AdaBoost,0.570103,0.945753,0.992207,0.939347,0.968423
2,XGBM,0.552262,0.943591,0.995191,0.939723,0.968704
3,Logistic Regression,0.546395,0.942892,0.99479,0.938638,0.968146
4,Naive Bayes,0.683081,0.960647,0.946829,0.9138,0.953688
5,KNN,0.545483,0.94279,0.993632,0.93751,0.967543


In [42]:
# Max AUC Score is achieved by Naive Bayes
# So selecting Naive Bayes
import pickle

In [45]:
pickle.dump(GNB,open('insurance.pkl','wb'))