# Exploratory Data Analysis (EDA)

In [1]:
# Importing pandas
import pandas as pd

# Load dataset into a pandas dataframe 
df = pd.read_csv("car_insurance.csv")
df

Unnamed: 0,age,driving faults,brand,salary,experience,results
0,28,5,1,4000RM,10.0,0
1,62,2,3,4500RM,40.0,0
2,45,7,2,,20.0,1
3,34,3,2,3200RM,,1
4,45,1,3,,,1
5,19,0,3,RM2500,1.0,1
6,23,1,1,0,3.0,0
7,55,2,3,RM7000,30.0,0
8,48,3,3,RM10000,25.0,1
9,50,2,2,0,30.0,0


In [2]:
# Finding the shape of the dataset
df.shape

(25, 6)

In [3]:
# Finding the columns of the dataset
df.columns

Index(['age', 'driving faults', 'brand', 'salary', 'experience', 'results'], dtype='object')

In [4]:
# Finding the basic information about all columns in the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             25 non-null     int64  
 1   driving faults  25 non-null     int64  
 2   brand           25 non-null     int64  
 3   salary          20 non-null     object 
 4   experience      20 non-null     float64
 5   results         25 non-null     int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 1.3+ KB


In [5]:
# Checking for null values in each column
df.isna().sum()

age               0
driving faults    0
brand             0
salary            5
experience        5
results           0
dtype: int64

# Data Preprocessing

In [6]:
# Removing "RM" and "rm" units from "salary" column 
df["salary"] = df["salary"].str.replace('\D', '', regex = True)
df

Unnamed: 0,age,driving faults,brand,salary,experience,results
0,28,5,1,4000.0,10.0,0
1,62,2,3,4500.0,40.0,0
2,45,7,2,,20.0,1
3,34,3,2,3200.0,,1
4,45,1,3,,,1
5,19,0,3,2500.0,1.0,1
6,23,1,1,0.0,3.0,0
7,55,2,3,7000.0,30.0,0
8,48,3,3,10000.0,25.0,1
9,50,2,2,0.0,30.0,0


In [7]:
# Changing the "salary" column data type from object to int
df["salary"] = df["salary"].astype(float)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             25 non-null     int64  
 1   driving faults  25 non-null     int64  
 2   brand           25 non-null     int64  
 3   salary          20 non-null     float64
 4   experience      20 non-null     float64
 5   results         25 non-null     int64  
dtypes: float64(2), int64(4)
memory usage: 1.3 KB


In [8]:
# Importing numpy
import numpy as np

# Replacing null values in "salary" and "experience" columns with their respective mean values
df["salary"] = df["salary"].replace(np.nan, round(df["salary"].mean()))
df["experience"] = df["experience"].replace(np.nan, round(df["experience"].mean()))
df

Unnamed: 0,age,driving faults,brand,salary,experience,results
0,28,5,1,4000.0,10.0,0
1,62,2,3,4500.0,40.0,0
2,45,7,2,5098.0,20.0,1
3,34,3,2,3200.0,17.0,1
4,45,1,3,5098.0,17.0,1
5,19,0,3,2500.0,1.0,1
6,23,1,1,0.0,3.0,0
7,55,2,3,7000.0,30.0,0
8,48,3,3,10000.0,25.0,1
9,50,2,2,0.0,30.0,0


In [9]:
# Checking for null values in each column
df.isna().sum()

age               0
driving faults    0
brand             0
salary            0
experience        0
results           0
dtype: int64

In [10]:
# Creating a new dataset to store cleaned dataset
df.to_csv("clean_dataset.csv", index = False)

# Model Development

In [11]:
# Loading and formating cleaned dataset into an array
newdf = np.loadtxt("clean_dataset.csv", delimiter = ",", skiprows = 1)

# The variable x contains the feature matrix
x = newdf[:,0:5]

# The variable y contains the output variable
y = newdf[:,5]

In [12]:
# Import relevant classes and methods from keras
from keras.models import Sequential
from keras.layers import Dense

# This is the neural network
predictor = Sequential()

# First layer
predictor.add(Dense(4, input_dim = 5, activation = 'relu'))

# Second layer
predictor.add(Dense(2, activation = 'relu'))

# Third layer
predictor.add(Dense(1, activation = 'sigmoid'))

In [13]:
# Compile model with appropriate parameters
predictor.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [14]:
# Fitting the model
predictor.fit(x, y, epochs = 150, batch_size = 25)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

<keras.callbacks.History at 0x27f29091c70>

In [15]:
# Evaluate model accuracy
_,accuracy = predictor.evaluate(x, y)
print("Accuracy: ", accuracy)

Accuracy:  0.5600000023841858


# Model Testing

In [16]:
# Testing against original data
predictions = predictor.predict_on_batch(x)

# Prediction in decimal form
for i in range(5):
     print(x[i].tolist(), "predicts", predictions[i], " ACTUAL : ", y[i])

[28.0, 5.0, 1.0, 4000.0, 10.0] predicts [0.5210328]  ACTUAL :  0.0
[62.0, 2.0, 3.0, 4500.0, 40.0] predicts [0.5210328]  ACTUAL :  0.0
[45.0, 7.0, 2.0, 5098.0, 20.0] predicts [0.5210328]  ACTUAL :  1.0
[34.0, 3.0, 2.0, 3200.0, 17.0] predicts [0.5210328]  ACTUAL :  1.0
[45.0, 1.0, 3.0, 5098.0, 17.0] predicts [0.5210328]  ACTUAL :  1.0


In [17]:
# Prediction in rounded off form
for i in range(5):
     print(x[i].tolist(), "predicts", np.round(predictions[i],0), " ACTUAL : ", y[i])

[28.0, 5.0, 1.0, 4000.0, 10.0] predicts [1.]  ACTUAL :  0.0
[62.0, 2.0, 3.0, 4500.0, 40.0] predicts [1.]  ACTUAL :  0.0
[45.0, 7.0, 2.0, 5098.0, 20.0] predicts [1.]  ACTUAL :  1.0
[34.0, 3.0, 2.0, 3200.0, 17.0] predicts [1.]  ACTUAL :  1.0
[45.0, 1.0, 3.0, 5098.0, 17.0] predicts [1.]  ACTUAL :  1.0


In [18]:
# Loading prediction dataset
predict = np.loadtxt("predict.csv", delimiter = ",", skiprows = 1)

xx = predict[:,0:5]

In [19]:
# Testing and predicting outcome
predictions = predictor.predict_on_batch(xx)

# Prediction in decimal form
for i in range(5):
    print(xx[i].tolist(), "predicts", predictions[i])

[28.0, 5.0, 1.0, 4500.0, 10.0] predicts [0.5210328]
[62.0, 2.0, 3.0, 3500.0, 40.0] predicts [0.5210328]
[45.0, 7.0, 2.0, 5000.0, 32.0] predicts [0.5210328]
[34.0, 3.0, 2.0, 7800.0, 15.0] predicts [0.5210328]
[18.0, 1.0, 3.0, 0.0, 1.0] predicts [2.799986e-05]


In [20]:
# Prediction in rounded off form
for i in range(5):
    print(xx[i].tolist(), "predicts", np.round(predictions[i],0))

[28.0, 5.0, 1.0, 4500.0, 10.0] predicts [1.]
[62.0, 2.0, 3.0, 3500.0, 40.0] predicts [1.]
[45.0, 7.0, 2.0, 5000.0, 32.0] predicts [1.]
[34.0, 3.0, 2.0, 7800.0, 15.0] predicts [1.]
[18.0, 1.0, 3.0, 0.0, 1.0] predicts [0.]
