In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the insurance dataset; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv('insurance.csv')


# 1. Display Top 5 Rows of the Dataset.

In [3]:
# head() with no argument returns the first 5 rows.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


# 2. Display Last 5 Rows of the Dataset.

In [4]:
# tail() with no argument returns the last 5 rows.
df.tail()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1333,50,male,30.97,3,no,northwest,10600.5483
1334,18,female,31.92,0,no,northeast,2205.9808
1335,18,female,36.85,0,no,southeast,1629.8335
1336,21,female,25.8,0,no,southwest,2007.945
1337,61,female,29.07,0,yes,northwest,29141.3603


# 3. Find Shape of Our Dataset (Number of Rows And Number of Columns)

In [5]:
# shape is a (number of rows, number of columns) tuple.
df.shape

(1338, 7)

In [6]:
# Unpack the (rows, columns) tuple once and report each dimension.
n_rows, n_cols = df.shape
print("Number of Rows :", n_rows)
print("Number of Columns :", n_cols)


Number of Rows : 1338
Number of Columns : 7


# 4. Get Information About Our Dataset Like Total Number of Rows, Total Number of Columns, Datatypes of Each Column And Memory Requirement

In [7]:
# info() prints the index range, each column's non-null count and dtype,
# and the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


# 5. Check Null Values In The Dataset

In [8]:
# Count missing values per column: isnull() flags them, sum() totals each column.
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

# 6. Get Overall Statistics About The Dataset

In [9]:
# Summary statistics for the numeric columns; .T transposes the result so
# each column of the dataset becomes one row of the table.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,1338.0,39.207025,14.04996,18.0,27.0,39.0,51.0,64.0
bmi,1338.0,30.663397,6.098187,15.96,26.29625,30.4,34.69375,53.13
children,1338.0,1.094918,1.205493,0.0,0.0,1.0,2.0,5.0
charges,1338.0,13270.422265,12110.011237,1121.8739,4740.28715,9382.033,16639.912515,63770.42801


# 7. Convert Columns From String ['sex', 'smoker', 'region'] To Numerical Values

In [10]:
# List the column names.
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [11]:
# Distinct values of 'sex'; these are the keys its numeric mapping must cover.
df['sex'].unique()

array(['female', 'male'], dtype=object)

In [12]:
# Distinct values of 'smoker'; these are the keys its numeric mapping must cover.
df['smoker'].unique()

array(['yes', 'no'], dtype=object)

In [13]:
# Distinct values of 'region'; these are the keys its numeric mapping must cover.
df['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [14]:
# Integer codes for the three string-valued columns.
# NOTE(review): 'region' is encoded as an ordinal 1-4 although the regions
# have no natural order; kept as-is because later cells depend on it.
CATEGORY_CODES = {
    'sex': {'female': 0, 'male': 1},
    'smoker': {'yes': 1, 'no': 0},
    'region': {'southwest': 1, 'southeast': 2, 'northwest': 3, 'northeast': 4},
}

for column, codes in CATEGORY_CODES.items():
    # Re-running this cell must not re-map already-encoded integers (which
    # would turn every value into NaN), so numeric columns are left alone.
    if pd.api.types.is_numeric_dtype(df[column]):
        continue
    encoded = df[column].map(codes)
    # map() yields NaN for any value missing from the mapping; fail loudly
    # instead of letting NaN flow into model training.
    if encoded.isna().any():
        unexpected = sorted(set(df.loc[encoded.isna(), column].astype(str)))
        raise ValueError(f"Unmapped values in column {column!r}: {unexpected}")
    df[column] = encoded

In [15]:
# Re-display the first rows to check that 'sex', 'smoker' and 'region' now hold integer codes.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,1,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,3,21984.47061
4,32,1,28.88,0,0,3,3866.8552


# 8. Store Feature Matrix In X and Response(Target) In Vector y

In [16]:
# Target vector: the insurance charge to be predicted.
y = df['charges']
# Feature matrix: every remaining column.
X = df.drop('charges', axis=1)

# 9. Train/Test split
1. Split data into two parts: a training set and a testing set
2. Train the model(s) on training set
3. Test the Model(s) on Testing set


In [17]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Row counts of each piece, in the order (X_train, X_test, y_train, y_test).
tuple(len(part) for part in (X_train, X_test, y_train, y_test))

(936, 402, 936, 402)

# 10. Import the models

In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [19]:
# Fit four regressors on the same training split so they can be compared.

# NOTE: despite its name, `logreg` is an ordinary linear regression; the
# name is kept because later cells refer to it.
logreg = LinearRegression()
logreg.fit(X_train, y_train)

# NOTE(review): SVR is used with default settings on unscaled inputs; its
# weak test score may be a consequence of that - confirm before concluding
# that SVR is unsuitable for this data.
sv = SVR()
sv.fit(X_train, y_train)

# Both ensembles are randomized; a fixed seed makes the reported metrics
# reproducible on Restart & Run All.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

gr = GradientBoostingRegressor(random_state=42)
gr.fit(X_train, y_train)

GradientBoostingRegressor()

# 12. Prediction on Test Data

In [20]:
# Test-set predictions, in model order: linear regression, SVR,
# random forest, gradient boosting.
y_pred1, y_pred2, y_pred3, y_pred4 = (
    model.predict(X_test) for model in (logreg, sv, rf, gr)
)

# Side-by-side table of the true charges and each model's prediction.
df1 = pd.DataFrame({
    'Actual': y_test,
    'Lr': y_pred1,
    'svm': y_pred2,
    'rf': y_pred3,
    'gr': y_pred4,
})
df1.head(4)

Unnamed: 0,Actual,Lr,svm,rf,gr
764,9095.06825,8931.421164,9505.525492,9785.060526,10613.512496
887,5272.1758,7070.906703,9455.571398,5841.174208,5680.434246
890,29330.98315,36937.080496,9595.485762,28272.892197,29627.694391
1293,9301.89355,9596.992144,9511.612207,10045.181633,9541.852526


# 14. Evaluating the Algorithm

In [21]:
from sklearn import metrics

In [22]:
# R^2 on the test set for each model's predictions (same order as y_pred1..y_pred4).
score1, score2, score3, score4 = (
    metrics.r2_score(y_test, predicted)
    for predicted in (y_pred1, y_pred2, y_pred3, y_pred4)
)

In [23]:
# R^2 scores in order: linear regression, SVR, random forest, gradient boosting.
print(score1,score2,score3,score4)

0.7694415927057693 -0.08170471274983182 0.8531360663719876 0.8690793872279976


In [24]:
# Mean absolute error on the test set for each model's predictions
# (same order as y_pred1..y_pred4).
s1, s2, s3, s4 = (
    metrics.mean_absolute_error(y_test, predicted)
    for predicted in (y_pred1, y_pred2, y_pred3, y_pred4)
)

In [25]:
# Mean absolute errors in order: linear regression, SVR, random forest, gradient boosting.
print(s1,s2,s3,s4)

4155.239843059381 8229.804004726911 2573.4290833104487 2435.025139563308


# 15. Predict Charges For New Customer

In [26]:
# One hypothetical customer, using the same numeric codes as the training
# data: sex 1 = male, smoker 1 = yes, region 2 = southeast.
data = dict(
    age=40,
    sex=1,
    bmi=40.30,
    children=4,
    smoker=1,
    region=2,
)

In [27]:
# Wrap the single customer in a one-row DataFrame (index 0) so the model
# receives the same column names it was trained on.
# NOTE: this rebinds `df`, so the full dataset loaded at the top of the
# notebook is no longer reachable under that name from here on; X and y
# were extracted before this point.
df = pd.DataFrame(data,index=[0])
df

Unnamed: 0,age,sex,bmi,children,smoker,region
0,40,1,40.3,4,1,2


In [28]:
# `df` is the one-row new-customer frame here; predict() returns an array,
# so element 0 is that customer's predicted charge.
new_pred = gr.predict(df)
print("Medical Insurance cost for New Customer is : ",new_pred[0])

Medical Insurance cost for New Customer is :  44225.330260008544


# From above we found that GradientBoostingRegressor is the best model for this dataset. Before production, it is good practice to train our model on the entire dataset.

In [29]:
import joblib

# Refit the chosen model on every available row (features X, target y).
# The fixed seed makes the saved model reproducible across notebook runs.
gr = GradientBoostingRegressor(random_state=42)
gr.fit(X, y)

# Persist the fitted model; the GUI callback further down loads it back by
# this exact file name, so the name must stay in sync with that cell.
joblib.dump(gr, "insurance_price_prediction")

['insurance_price_prediction']

# Predict Charges For New Customer

In [30]:
# Same new-customer prediction as before, now using the model refit on the
# entire dataset.
new_pred = gr.predict(df)
print("Medical Insurance cost for New Customer is : ",new_pred[0])

Medical Insurance cost for New Customer is :  42148.36188800322


In [31]:
from tkinter import *
import joblib

In [38]:
def show_entry():
    """Predict the insurance charge from the six form fields and display it.

    Intended as the PREDICT button callback. Reads the module-level entry
    widgets ``e1``..``e6`` (age, sex, bmi, children, smoker, region) and
    draws its output into the module-level ``master`` window.

    Empty or non-numeric input is reported in the window instead of raising
    inside the Tk callback, where the user would never see the error.
    """
    # Remove whatever a previous click displayed so widgets do not pile up
    # on top of each other.
    for row in (8, 9):
        for stale in master.grid_slaves(row=row):
            stale.destroy()

    try:
        values = [float(entry.get()) for entry in (e1, e2, e3, e4, e5, e6)]
    except ValueError:
        Label(master, text='Please enter a number in every field').grid(
            row=8, columnspan=2)
        return

    # The model was fitted on a DataFrame, so pass the inputs with the same
    # column names and order rather than as a bare list.
    features = pd.DataFrame(
        [values],
        columns=['age', 'sex', 'bmi', 'children', 'smoker', 'region'],
    )

    # NOTE: joblib.load unpickles the file; only load model files you trust.
    model = joblib.load('insurance_price_prediction')
    result = model.predict(features)

    # Rows 8 and 9 are used so the output never shares a grid cell with the
    # PREDICT button, which sits below the six input rows.
    Label(master, text='Prediction').grid(row=8)
    # predict() returns an array; show its single element as a plain number.
    Label(master, text=f'{result[0]:.2f}').grid(row=9)
    
    
# Build the prediction form: a title banner, six prompt/entry rows, and a
# PREDICT button wired to show_entry.
master = Tk()
master.title("INSURANCE PRICE PREDICTOR")

# grid() returns None, so create the widget first and place it afterwards;
# that way `label` refers to the banner itself.
label = Label(master, text="INSURANCE PRICE PREDICTOR", bg="black", fg="white")
label.grid(row=0, columnspan=2)

# Prompts in the order the model expects its features; rows 1-6, column 0.
for row, prompt in enumerate(
    (
        'ENTER YOUR AGE',
        'MALE OR FEMALE[1/0]',
        'ENTER YOUR BMI VALUE',
        'ENTER NUMBER OF CHILDREN',
        'SMOKER[1/0]',
        'REGION[1-4]',
    ),
    start=1,
):
    Label(master, text=prompt).grid(row=row)

# The entries stay bound to e1..e6 because the button callback reads them
# by those names.
e1, e2, e3, e4, e5, e6 = (Entry(master) for _ in range(6))
for row, entry in enumerate((e1, e2, e3, e4, e5, e6), start=1):
    entry.grid(row=row, column=1)

# Explicit position under the entry column: keeping the button out of
# column 0 means labels the callback grids there cannot cover it.
Button(master, text="PREDICT", command=show_entry).grid(row=7, column=1)

# Blocks this cell until the window is closed.
master.mainloop()