##Gathering data

In [1]:
#importing packages
import numpy as np
import pandas as pd

In [None]:
data=pd.read_csv("../data/insurance.csv")

In [3]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   expenses  1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


##Data preprocessing 

In [5]:
#one hot encoding - will convert the categorical variables into numerical variables
data['sex']=data['sex'].apply({'male':1,'female':0}.get)

In [6]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,0,27.9,0,yes,southwest,16884.92
1,18,1,33.8,1,no,southeast,1725.55
2,28,1,33.0,3,no,southeast,4449.46
3,33,1,22.7,0,no,northwest,21984.47
4,32,1,28.9,0,no,northwest,3866.86


In [7]:
data['smoker']=data['smoker'].apply({'yes':1,'no':0}.get)
data['region']=data['region'].apply({'southeast':1,'southwest':0,'northeast':2,'northwest':3}.get)

In [8]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,0,27.9,0,1,0,16884.92
1,18,1,33.8,1,0,1,1725.55
2,28,1,33.0,3,0,1,4449.46
3,33,1,22.7,0,0,3,21984.47
4,32,1,28.9,0,0,3,3866.86


##Dividing the data into dependent and independent variables

In [9]:
x=data[['age','sex','bmi','children','smoker','region']] #independent variables
y=data['expenses'] #dependent variable

##Splitting the data into training and testing set

In [10]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2) 
#test_size=0.2 that means 20% of the data will be used for testing the accuracy of the model 
#and 80% of the data will be used for making the model learn

##Creating a ML model using Linear Regression algorithm

In [12]:
#import Linear Regression algorithm
from sklearn.linear_model import LinearRegression
linear_model=LinearRegression() #we are activating the algorithm for use 

In [13]:
#training the model using .fit()
linear_model.fit(x_train,y_train)

LinearRegression()

In [16]:
y_pred=linear_model.predict(x_test) #.predict() used for predicting the future values

In [17]:
print(y_pred) #y_pred is the amount of health insurance as calculated by the ML model

[35406.94378448  9671.99892553 10583.65465844 25678.70035255
  7518.94550925 30801.9757518   9395.00010306  8081.91659247
  6266.37093262 12068.05361395 26332.94411169  6526.81904675
  9412.25919032  6107.25487377  4279.05263589 35655.44213924
 34863.6927308  28158.63789677  9967.06313919   303.29900491
  3677.42118138 12786.40068487 11524.31506912  1929.17894614
  6300.84395409 35565.29404058 25715.04732382  5338.22142274
 11695.06236325 15087.25488756  9178.60445445 30104.78478632
  1905.38802873  5878.54784136 14200.42143926 13517.96022991
  7383.60364891 12541.66854101  3595.32871179 10060.66357654
  5930.14025235 29595.55789911 31412.82502592  3004.00677189
  3478.07549452 14275.89695487 14286.43936118  9304.96140297
  6025.6185733  10088.07761071  7582.68979577 31755.56326161
 13578.65637214  5587.87750386   974.5021149   9225.9074312
  8118.13980374  5665.92098675 38238.64468869 17630.96728431
 10373.3529661  12909.34314969  7082.15304064  3241.75329003
 24649.14253794 13978.628

In [18]:
#checking accuracy of the ML model
accuracy=linear_model.score(x,y)

In [19]:
print("The accuracy of the ML model using Linear Regression algorithm is",accuracy)

The accuracy of the ML model using Linear Regression algorithm is 0.7498107243343908


##Predicting the health insurance of a new customer

In [30]:
#creating data for a new customer
data_new={'age':65,'sex':0,'bmi':26,'children':3,'smoker':1,'region':3}

In [31]:
#converting the dict into a dataframe
index=[1] #serial number
my_data=pd.DataFrame(data_new,index)

In [32]:
my_data

Unnamed: 0,age,sex,bmi,children,smoker,region
1,65,0,26,3,1,3


In [33]:
#predicting the amount of health insurance the new customer is going to get
predictions=linear_model.predict(my_data)

In [34]:
print("The amount of health insurance for new customer is",predictions)

The amount of health insurance for new customer is [37907.49302769]
