## Multiple Linear Regression on Insurance Dataset for Predicting EMI Charges of a Customer

In [32]:
import pandas as pd 

In [34]:
df = pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [36]:
#Handling Missing Values
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

There are no missing values if they were present then  #df.dropna(inplace=True)

In [39]:
#Handling Duplicate Values
df.duplicated().sum()

1

In [41]:
#Deleting Duplicate Values
df.drop_duplicates(inplace=True)

In [43]:
df.duplicated().sum()

0

In [49]:
#Checking the data
df.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

In [47]:
#We need to change sex, smoker and region to numerical by one hot encoding

In [66]:
df2 = pd.get_dummies(df,drop_first='if_binary').astype('int')

In [68]:
df2

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27,0,16884,0,1,0,0,1
1,18,33,1,1725,1,0,0,1,0
2,28,33,3,4449,1,0,0,1,0
3,33,22,0,21984,1,0,1,0,0
4,32,28,0,3866,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...
1333,50,30,3,10600,1,0,1,0,0
1334,18,31,0,2205,0,0,0,0,0
1335,18,36,0,1629,0,0,0,1,0
1336,21,25,0,2007,0,0,0,0,1


In [70]:
#Now the complete dataset is converted into numerical format

In [72]:
#Checking again for null values
df2.isnull().sum()

age                 0
bmi                 0
children            0
charges             0
sex_male            0
smoker_yes          0
region_northwest    0
region_southeast    0
region_southwest    0
dtype: int64

In [94]:
#Splitting the data into X and Y variable 
x = df2.drop('charges', axis='columns')
y = df2[['charges']]

In [96]:
#Training and Testing Data

In [98]:
from sklearn.model_selection import train_test_split

In [100]:
#Splitting the data into training and testing data
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [102]:
#Model Training using Linear Regression
from sklearn.linear_model import LinearRegression

In [104]:
lr = LinearRegression()

In [106]:
lr.fit(x_train,y_train)

In [109]:
#Evaluation of Model on Training Data
lr.score(x_train,y_train)

0.7509409274756133

In [113]:
lr.score(x_test,y_test)

0.7479729927273205

In [117]:
#Predictions made by algorithm on testing data
predictions = lr.predict(x_test)
y_test['predicted_charges'] = predictions 

In [119]:
y_test

Unnamed: 0,charges,predicted_charges
575,12222,11856.088685
980,25517,10855.316555
1087,11353,12816.680204
542,13887,15715.885306
574,13224,15604.067136
...,...,...
351,8932,8760.386056
199,14901,18121.567485
425,9788,9260.702548
697,40273,33851.404353


In [121]:
#Saving the model using Joblib

In [123]:
import joblib

In [125]:
joblib.dump(lr,'linear_regression_insurance.lb')

['linear_regression_insurance.lb']