In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the insurance dataset into a DataFrame.
# The path is written as a raw string so the backslashes of the Windows path are
# taken literally; in a normal string '\P' and '\i' are invalid escape sequences,
# which Python reports with a warning (and may reject in a future version).
dataset=pd.read_csv(r'D:\Python Data Analysis\insurance\insurance.csv')

In [3]:
# Preview the first five rows to see the column names and value formats.
dataset.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


The sex, smoker and region columns contain text, so we convert each of them to a categorical type and then replace every category with its numerical code.

In [4]:
# Replace the text values of 'sex' with integer category codes in one chained
# step (in the output below, female becomes 0 and male becomes 1).
dataset['sex'] = dataset['sex'].astype('category').cat.codes
dataset

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,0,27.9,0,yes,southwest,16884.92
1,18,1,33.8,1,no,southeast,1725.55
2,28,1,33.0,3,no,southeast,4449.46
3,33,1,22.7,0,no,northwest,21984.47
4,32,1,28.9,0,no,northwest,3866.86
...,...,...,...,...,...,...,...
1333,50,1,31.0,3,no,northwest,10600.55
1334,18,0,31.9,0,no,northeast,2205.98
1335,18,0,36.9,0,no,southeast,1629.83
1336,21,0,25.8,0,no,southwest,2007.95


In [5]:
# Apply the same category-code encoding to the two remaining text columns.
for column in ('smoker', 'region'):
    dataset[column] = dataset[column].astype('category').cat.codes
dataset

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,0,27.9,0,1,3,16884.92
1,18,1,33.8,1,0,2,1725.55
2,28,1,33.0,3,0,2,4449.46
3,33,1,22.7,0,0,1,21984.47
4,32,1,28.9,0,0,1,3866.86
...,...,...,...,...,...,...,...
1333,50,1,31.0,3,0,1,10600.55
1334,18,0,31.9,0,0,0,2205.98
1335,18,0,36.9,0,0,2,1629.83
1336,21,0,25.8,0,0,3,2007.95


In [6]:
# Count the missing entries in every column before modelling.
dataset.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
expenses    0
dtype: int64

There are no missing values.

#### The formula for multiple linear regression is $y = m_1X_1 + m_2X_2 + m_3X_3 + \dots + m_nX_n + c$

Since expenses is the dependent variable (y), we drop it from the dataset; the remaining columns form X (the independent variables).


In [8]:
# Feature matrix: every column except the target 'expenses'.
X = dataset.drop(labels='expenses', axis='columns')

In [9]:
# Display X to confirm that 'expenses' has been removed and only the feature
# columns remain.
X

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,0,27.9,0,1,3
1,18,1,33.8,1,0,2
2,28,1,33.0,3,0,2
3,33,1,22.7,0,0,1
4,32,1,28.9,0,0,1
...,...,...,...,...,...,...
1333,50,1,31.0,3,0,1
1334,18,0,31.9,0,0,0
1335,18,0,36.9,0,0,2
1336,21,0,25.8,0,0,3


In [10]:
# Target vector: the 'expenses' column on its own.
y = dataset.loc[:, 'expenses']
y

0       16884.92
1        1725.55
2        4449.46
3       21984.47
4        3866.86
          ...   
1333    10600.55
1334     2205.98
1335     1629.83
1336     2007.95
1337    29141.36
Name: expenses, Length: 1338, dtype: float64

#### Now let's split the dataset into training and testing sets

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. Fixing random_state (any integer works)
# makes the split reproducible, so the same train and test rows are produced
# every time the cell is run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [14]:
from sklearn.linear_model import LinearRegression

# Create the regressor, then fit it on the training split only; after this
# call the model is trained.
LRM = LinearRegression()
LRM.fit(X=X_train, y=y_train)

In [15]:
# Fitted slopes of the regression: one coefficient each for age, sex, bmi,
# children, smoker and region (the column order of X).
m=LRM.coef_
m

array([  256.54688301,   -49.5556959 ,   329.03829593,   479.33098055,
       23399.22241438,  -276.24306229])

In [17]:
# c is the fitted intercept (the constant term of the regression). The slopes
# are in LRM.coef_: one coefficient each for age, sex, bmi, children, smoker
# and region.
c=LRM.intercept_ 
c

-11827.690243440878

#### Let's predict the expenses for a 50-year-old female (0) with bmi 29.9, no children (0), non-smoker (0), living in the southwest region (3)

In [24]:
# Describe the applicant by feature name instead of passing a bare list.
# The model was fitted on a DataFrame, so predicting from an unnamed list makes
# scikit-learn warn about missing feature names and silently relies on the
# values being in the right order.
# Encodings follow the category codes assigned earlier: sex 0 = female,
# smoker 0 = no, region 3 = southwest.
applicant = pd.DataFrame([{
    'age': 50,
    'sex': 0,
    'bmi': 29.9,
    'children': 0,
    'smoker': 0,
    'region': 3,
}])
# Re-select the columns in the training order of X before predicting.
LRM.predict(applicant[X.columns])



array([10009.16976842])

In [25]:
# Load the already-encoded records whose insurance cost we want to predict.
# Raw string: the backslashes of the Windows path are taken literally, which
# avoids the invalid escape sequences '\P' and '\i'.
Ins_df=pd.read_csv(r'D:\Python Data Analysis\insurance\insurance_predict.csv')

In [26]:
# Predict using exactly the feature columns the model was trained on, in the
# training order. Selecting them explicitly keeps the prediction working when
# the CSV carries extra columns (for example a saved index or an earlier
# InsuranceCost column), and raises a clear KeyError if a feature is missing.
Insurance_cost=LRM.predict(Ins_df[X.columns])
Insurance_cost


array([24797.3622176 , 27464.40268976, 30498.61547785,  3229.24133066,
        5614.77370286,  3979.5055147 ])

In [27]:
# Attach the predictions to the input records as a new 'InsuranceCost' column.
Ins_df = Ins_df.assign(InsuranceCost=Insurance_cost)

In [28]:
# Display the records together with their predicted insurance cost.
Ins_df

Unnamed: 0,age,sex,bmi,children,smoker,region,InsuranceCost
0,19,0,27.9,0,1,3,24797.362218
1,18,1,33.8,1,1,1,27464.40269
2,28,0,33.0,3,1,2,30498.615478
3,33,1,22.7,0,0,3,3229.241331
4,32,0,28.9,0,0,1,5614.773703
5,31,1,25.7,0,0,2,3979.505515


In [29]:
# Save the predictions to a separate file, without the DataFrame index.
# Writing back to insurance_predict.csv (the file this notebook reads its input
# from) would add an unnamed index column and the InsuranceCost column to it, so
# re-reading it on the next run would no longer give only the six feature columns.
# Raw string: the backslashes of the Windows path are taken literally.
Ins_df.to_csv(r'D:\Python Data Analysis\insurance\insurance_predicted.csv', index=False)