In [95]:
import numpy as np
import pandas as pd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
import warnings
warnings.filterwarnings('ignore')

import pickle
import json

# problem statement

In [96]:
# Goal: predict medical insurance charges using a supervised machine learning algorithm (linear regression).

# data gathering

In [97]:
# Load the raw insurance records from a CSV in the working directory.
df=pd.read_csv('medical_insurance.csv')
# Bare name as the last expression so the notebook renders the frame.
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [98]:
# Inspect column dtypes and non-null counts before any preprocessing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [99]:
# List the distinct labels in `sex` and how often each occurs.
df['sex'].value_counts()

male      676
female    662
Name: sex, dtype: int64

In [100]:
# Encode `sex` as integers (male -> 1, female -> 0).
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on `df['sex']`: that form mutates an intermediate
# Series and does not write through to `df` under pandas Copy-on-Write.
# astype pins an integer dtype regardless of how replace infers the result,
# and fails loudly if an unmapped or missing label is present.
df['sex']=df['sex'].replace({'male':1,'female':0}).astype('int64')

In [101]:
# Re-check the counts after encoding; only the integer codes should remain.
df['sex'].value_counts()

1    676
0    662
Name: sex, dtype: int64

In [102]:
# Label -> integer code lookup for `sex`; stored in the project JSON further down.
sex_value=dict(male=1,female=0)

In [103]:
# List the distinct labels in `smoker` and how often each occurs.
df['smoker'].value_counts()

no     1064
yes     274
Name: smoker, dtype: int64

In [104]:
# Encode `smoker` as integers. NOTE: 1 means non-smoker and 0 means smoker here;
# the same mapping is kept in `smoker_value`, so the two must stay in sync.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on `df['smoker']`: that form mutates an intermediate
# Series and does not write through to `df` under pandas Copy-on-Write.
# astype pins an integer dtype and fails loudly on unmapped or missing labels.
df['smoker']=df['smoker'].replace({'no':1,'yes':0}).astype('int64')


In [105]:
# Label -> integer code lookup for `smoker` (1 = non-smoker, 0 = smoker).
smoker_value=dict(no=1,yes=0)

In [106]:
# Re-check the counts after encoding; only the integer codes should remain.
df['smoker'].value_counts()

1    1064
0     274
Name: smoker, dtype: int64

In [107]:
# List the distinct region labels and their frequencies before one-hot encoding.
df['region'].value_counts()

southeast    364
southwest    325
northwest    325
northeast    324
Name: region, dtype: int64

In [108]:
# One-hot encode `region` into region_<name> indicator columns.
# dtype is pinned to uint8 so the indicators are numeric 0/1 on every pandas
# version (the default dummy dtype became bool in pandas 2.0).
# NOTE: all four indicators are kept because the single-prediction cell further
# down looks up a `region_<name>` column for whichever region the user gives.
# They always sum to 1 per row, so individual region coefficients are not
# uniquely determined alongside the intercept.
df=pd.get_dummies(df,columns=['region'],dtype='uint8')
df

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.900,0,0,16884.92400,0,0,0,1
1,18,1,33.770,1,1,1725.55230,0,0,1,0
2,28,1,33.000,3,1,4449.46200,0,0,1,0
3,33,1,22.705,0,1,21984.47061,0,1,0,0
4,32,1,28.880,0,1,3866.85520,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1333,50,1,30.970,3,1,10600.54830,0,1,0,0
1334,18,0,31.920,0,1,2205.98080,1,0,0,0
1335,18,0,36.850,0,1,1629.83350,0,0,1,0
1336,21,0,25.800,0,1,2007.94500,0,0,0,1


In [109]:
# Confirm the column set and dtypes after encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               1338 non-null   int64  
 1   sex               1338 non-null   int64  
 2   bmi               1338 non-null   float64
 3   children          1338 non-null   int64  
 4   smoker            1338 non-null   int64  
 5   charges           1338 non-null   float64
 6   region_northeast  1338 non-null   uint8  
 7   region_northwest  1338 non-null   uint8  
 8   region_southeast  1338 non-null   uint8  
 9   region_southwest  1338 non-null   uint8  
dtypes: float64(2), int64(4), uint8(4)
memory usage: 68.1 KB


# model building

In [110]:
# Separate the target (`charges`) from the predictors (every other column).
y=df['charges']
x=df.drop(columns='charges')

In [111]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible between runs.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=11)
x_train

Unnamed: 0,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
316,50,1,32.205,0,1,0,1,0,0
340,24,0,27.600,0,1,0,0,0,1
151,48,1,29.700,0,1,0,0,1,0
1282,18,0,21.660,0,0,1,0,0,0
557,34,1,34.210,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...
583,32,0,23.650,1,1,0,0,1,0
332,61,0,31.160,0,1,0,1,0,0
1293,46,1,25.745,3,1,0,1,0,0
1115,55,1,32.670,1,1,0,0,1,0


In [112]:
# Remember the feature order used for training; it is reused below to save the
# project JSON and to lay out a single-row input for prediction.
column_names=x.columns
column_names

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region_northeast',
       'region_northwest', 'region_southeast', 'region_southwest'],
      dtype='object')

# instance creation

In [113]:
# Create a linear regression estimator with default settings and fit it on the
# training split.
model1=LinearRegression()
model1.fit(x_train,y_train)

LinearRegression()

In [114]:
# Model evaluation on the held-out test split.
y_pred=model1.predict(x_test)

# Error metrics and goodness of fit.
mse=mean_squared_error(y_test,y_pred)
rmse=np.sqrt(mse)
mae=mean_absolute_error(y_test,y_pred)
r2score=r2_score(y_test,y_pred)

# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1),
# with n rows and p feature columns in the evaluated split.
n_rows,n_features=x_test.shape
adj_r2=1-(((1-r2score)*(n_rows-1))/(n_rows-n_features-1))

print('mse=',mse)
print('rmse=',rmse)
print('mae=',mae)
print('r2score=',r2score)
print('adj_r2=',adj_r2)

mse= 27781250.089006577
rmse= 5270.792169020381
mae= 3804.198416541934
r2score= 0.7996633930323263
adj_r2= 0.7926749067427562


In [115]:
# Model evaluation on the training split (compare with the test metrics to
# judge over- or under-fitting).
y_pred_train=model1.predict(x_train)

# Error metrics and goodness of fit.
mse=mean_squared_error(y_train,y_pred_train)
rmse=np.sqrt(mse)
mae=mean_absolute_error(y_train,y_pred_train)
r2score=r2_score(y_train,y_pred_train)

# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1),
# with n rows and p feature columns in the evaluated split.
n_rows,n_features=x_train.shape
adj_r2=1-(((1-r2score)*(n_rows-1))/(n_rows-n_features-1))

print('mse=',mse)
print('rmse=',rmse)
print('mae=',mae)
print('r2score=',r2score)
print('adj_r2=',adj_r2)

mse= 38755582.190372266
rmse= 6225.398155168251
mae= 4308.584412061907
r2score= 0.7390019992731405
adj_r2= 0.736785978512252


In [116]:
# Show the first feature row transposed, one feature per line, as a reference
# for the layout of a single input record.
x.head(1).T

Unnamed: 0,0
age,19.0
sex,0.0
bmi,27.9
children,0.0
smoker,0.0
region_northeast,0.0
region_northwest,0.0
region_southeast,0.0
region_southwest,1.0


In [117]:
# Persist the fitted estimator to disk in binary pickle format.
with open('linear_model.pkl','wb') as model_file:
    pickle.dump(model1,model_file)

In [118]:
# Bundle what is needed to encode a raw input record the same way as the
# training data: the two label mappings plus the feature column order.
json_data=dict(
    sex=sex_value,
    smoker=smoker_value,
    columns=list(column_names),
)
json_data

{'sex': {'male': 1, 'female': 0},
 'smoker': {'no': 1, 'yes': 0},
 'columns': ['age',
  'sex',
  'bmi',
  'children',
  'smoker',
  'region_northeast',
  'region_northwest',
  'region_southeast',
  'region_southwest']}

In [119]:
# Write the encoding metadata next to the pickled model.
with open('project_data.json','w') as json_file:
    json.dump(json_data,json_file)

In [120]:
# single user input testing: raw values as a caller would supply them
age=19
sex='male'
bmi=27
children=1
smoker='yes'

# The one-hot columns are named 'region_<name>', so build that label first.
region='northeast'
region='region_'+region

# Positions where the feature names equal the label; the first hit is the
# slot of this region's indicator column.
matching_positions=np.where(column_names==region)[0]
region_index=matching_positions[0]
region_index

5

In [121]:
# Same lookup done with plain Python: position of the region label in the
# feature list. Raises ValueError if the label is not a feature column.
region_index=list(column_names).index(region)
region_index

5

In [122]:
# Translate the raw `sex` label into its integer code via the saved mapping.
json_data['sex'][sex]

1

In [123]:
# Translate the raw `smoker` label into its integer code via the saved mapping.
json_data['smoker'][smoker]

0

In [124]:
# Build one feature row in the same order as `column_names`:
# the first five slots are age, sex, bmi, children, smoker; the region
# indicators follow and start at zero.
test_array=np.zeros(len(column_names))
test_array[:5]=[
    age,
    json_data['sex'][sex],
    bmi,
    children,
    json_data['smoker'][smoker],
]
# Switch on only the indicator for the chosen region.
test_array[region_index]=1

test_array

array([19.,  1., 27.,  1.,  0.,  1.,  0.,  0.,  0.])

In [125]:
# Predict charges for the single record built above.
# model1 was fitted on a DataFrame, so the row is wrapped in a one-row DataFrame
# carrying the training column names. Passing a bare list makes scikit-learn
# warn that X has no valid feature names (hidden here only by the global
# warnings filter) and skips its check that columns match the fitted ones.
user_row=pd.DataFrame([test_array],columns=column_names)
charges=round(model1.predict(user_row)[0],2)
print('predicted medical insurance charges is :',charges ,'/-Rs only')

predicted medical insurance charges is : 25809.17 /-Rs only
