In [1]:
import numpy as np
import pandas as pd

In [3]:
data=pd.read_csv("/content/insurance.csv")

In [4]:
data.head() # Only first 5 rows of the dataset will be displayed

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


# We will perfom One Hot Encoding where we will change the categorical variable into numerical variable.

In [5]:
data['sex']=data['sex'].apply({'male':1,'female':0}.get) 

In [6]:
data['smoker']=data['smoker'].apply({'yes':1, 'no':0}.get)

In [7]:
data['region']=data['region'].apply({'northeast':1,'northwest':2,'southeast':3,'southwest':4}.get)

In [8]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,4,16884.924
1,18,1,33.77,1,0,3,1725.5523
2,28,1,33.0,3,0,3,4449.462
3,33,1,22.705,0,0,2,21984.47061
4,32,1,28.88,0,0,2,3866.8552


# We will divide the data into dependent and independent columns. In this case charges is a dependent column and other columns are independent.

In [9]:
x=data[['age','sex','bmi','children','smoker','region']]

In [10]:
y=data['charges']

In [11]:
x.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,0,27.9,0,1,4
1,18,1,33.77,1,0,3
2,28,1,33.0,3,0,3
3,33,1,22.705,0,0,2
4,32,1,28.88,0,0,2


In [12]:
y.head()

0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: charges, dtype: float64

# Now we will split the data into training set and testing set

In [13]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3) # When test size is 0.3, only 30% of data will be used for testing purpose.

# Model building using Linear Regression

In [17]:
from sklearn.linear_model import LinearRegression

model=LinearRegression()  # We are creating an object named 'model' of LinearRegression

In [18]:
model.fit(x_train,y_train) # Learning or training of model is going to take place, fit is a fit function.

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [20]:
predictions=model.predict(x_test) # Prediction made by Machine learning model, For predicting purpose we use x_test

In [21]:
predictions[0:5] # First five values of predictions will be displayed

array([ 7628.5464344 , 24275.22357871,  5713.02877468, 32999.89388019,
        3725.51524739])

# To know how much accurate is our prediction we will use R2 score

In [22]:
model.score(x,y)

0.7499240648255651

# The accuracy of the model is 74.99 %

# Now we will predict the insurance cost for a new customer

In [23]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,4,16884.924
1,18,1,33.77,1,0,3,1725.5523
2,28,1,33.0,3,0,3,4449.462
3,33,1,22.705,0,0,2,21984.47061
4,32,1,28.88,0,0,2,3866.8552


In [25]:
data_new={'age':30,'sex':1,'bmi':25,'children':1,'smoker':0,'region':1} # We make a dictionary where we set the age of person as 30, sex as male, bmi as 25, children he is having as 1, smoker as no and region as northeast

index=[1]


In [26]:
new_customer_data=pd.DataFrame(data_new,index) # We create a data frame with values from data_new dictionary

In [27]:
new_customer_data 

Unnamed: 0,age,sex,bmi,children,smoker,region
1,30,1,25,1,0,1


In [28]:
prediction_new=model.predict(new_customer_data)          # We are prdicting the insurance cost of a new customer

print("Insurance cost for new customer is ", prediction_new)

Insurance cost for new customer is  [4423.34173923]


# So the the person having the age:30, sex: male, bmi: 25, children: 1 , smoker: no and region: northeast will get insurance of 4423.341 Rs 