In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import warnings 
warnings.filterwarnings('ignore')
import statsmodels.api as sm
import statsmodels.tsa.api as smt
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.compat import lzip
from statsmodels.compat import lzip
import statsmodels.stats.api as sms
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score as acc
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
import matplotlib


In [None]:
# Load the insurance dataset from the Kaggle input directory.
csv_path = '../input/insurance-premium-prediction/insurance.csv'
df = pd.read_csv(csv_path)

In [None]:
# Peek at the first five records.
df.head(n=5)

In [None]:
# (rows, columns) of the loaded frame.
n_rows, n_cols = df.shape
(n_rows, n_cols)

There are 1338 rows and 7 columns.

In [None]:
# Per-column dtypes, used to separate numerical from categorical variables.
column_types = df.dtypes
column_types

As per the dtype output:
Numerical variables: age, children, bmi and expenses.
Categorical variables: sex, smoker and region.


In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
numeric_summary = df.describe()
numeric_summary

The age of the primary beneficiary ranges between 18 and 64.
The average bmi of the people is 30.4
The number of children/dependants vary in the range of 0 to 5.
The expenses column values are right skewed since mean is greater than the median value.

In [None]:
# Summary (count, unique, top, freq) for the object-typed columns.
categorical_summary = df.describe(include='object')
categorical_summary

Most of the beneficiaries in the table are male.
1064 people don't smoke.
There are 676 males in the total number of records.
364 people are from the southeast region.


In [None]:
# Number of missing values per column.
df.isna().sum()

There is no missing data in the record.

In [None]:
#outliers for age
# Tukey fences: lower = Q1 - 1.5*IQR, upper = Q3 + 1.5*IQR.
# The original cell subtracted from Q3 for the lower fence, which placed it
# inside the data range instead of below Q1.
q1, q3 = 27, 51  # quartile values of age as hard-coded in this notebook
iqr = q3 - q1
ul = q3 + 1.5 * iqr
ll = q1 - 1.5 * iqr
print(ll, ul)

In [None]:
#listing outliers
# Derive the Tukey fences from the data itself rather than hard-coding them;
# the previous lower threshold (15) was Q3 - 1.5*IQR, not Q1 - 1.5*IQR.
age_q1, age_q3 = df['age'].quantile([0.25, 0.75])
age_iqr = age_q3 - age_q1
print(df[df['age'] < age_q1 - 1.5 * age_iqr])
print(df[df['age'] > age_q3 + 1.5 * age_iqr])

In [None]:
#outliers for charges
# Tukey fences: lower = Q1 - 1.5*IQR, upper = Q3 + 1.5*IQR.
# The original lower fence had a typo (1663 instead of 16639) and was based
# on Q3, which produced a "lower" bound above the third quartile.
q1, q3 = 4740, 16639  # quartile values of expenses as hard-coded in this notebook
iqr = q3 - q1
ul = q3 + 1.5 * iqr
ll = q1 - 1.5 * iqr
print(ll, ul)

In [None]:
#listing outliers
# Rows below the lower Tukey fence (Q1 - 1.5*IQR) of expenses.
# The previous threshold (21254) lay above the third quartile, so this cell
# listed most of the data set rather than low-end outliers.
exp_q1, exp_q3 = df['expenses'].quantile([0.25, 0.75])
exp_lower = exp_q1 - 1.5 * (exp_q3 - exp_q1)
print(df[df['expenses'] < exp_lower])


In [None]:
# Rows whose expenses exceed the upper fence.
high_expenses = df['expenses'] > 34487
print(df.loc[high_expenses])

There is a larger number of outliers in the expenses column.

In [None]:
#outliers for bmi
# Tukey fences: lower = Q1 - 1.5*IQR, upper = Q3 + 1.5*IQR.
# The original lower fence was computed from Q3, giving 22.09 instead of ~13.69.
q1, q3 = 26.29, 34.69  # quartile values of bmi as hard-coded in this notebook
iqr = q3 - q1
ul = q3 + 1.5 * iqr
ll = q1 - 1.5 * iqr
print(ll, ul)

In [None]:
#listing outliers
# Rows below the lower Tukey fence (Q1 - 1.5*IQR) of bmi.
# The previous threshold (22.09) was Q3 - 1.5*IQR and flagged ordinary values.
bmi_q1, bmi_q3 = df['bmi'].quantile([0.25, 0.75])
bmi_lower = bmi_q1 - 1.5 * (bmi_q3 - bmi_q1)
print(df[df['bmi'] < bmi_lower])

In [None]:
# Rows whose bmi exceeds the upper fence.
high_bmi = df['bmi'] > 47.289
print(df.loc[high_bmi])

We observe outliers for bmi as well.

In [None]:
# Treating outliers: replace expenses with its natural log, then re-check the box plot.
# NOTE: the column is overwritten in place, so re-running this cell applies the log again.
log_expenses = np.log(df['expenses'])
df['expenses'] = log_expenses
df['expenses'].plot.box()

We applied a log transform to the expenses column to treat the outliers.
We observe that there are no outliers any more and that the data is no longer heavily skewed.

In [None]:
# Box plot of bmi before any transformation.
df['bmi'].plot.box()

In [None]:
# Replace bmi with its square root, then re-check the box plot.
# NOTE: the column is overwritten in place, so re-running this cell applies the root again.
sqrt_bmi = np.sqrt(df['bmi'])
df['bmi'] = sqrt_bmi
df['bmi'].plot.box()

We applied a square root transform to the bmi column and observe a significant change in the number of outliers.

In [None]:
# Correlation heatmap of the numeric columns.
# sex / smoker / region are still string columns at this point (they are only
# dummy-encoded later), and DataFrame.corr() raises on non-numeric data in
# pandas >= 2.0, so restrict the frame to numeric columns explicitly.
corr = df.select_dtypes(include='number').corr()
ax = sns.heatmap(corr, annot=True)
# NOTE(review): presumably a workaround for heatmap rows being clipped on some
# matplotlib versions -- confirm it is still needed.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)

From the heat map we observe that our target variable (expenses) is most strongly correlated with the age parameter. bmi affects our target variable the least.

We can also use a scatter plot to observe the relationship between the target variable (expenses) and the age parameter.

In [None]:
# Scatter of (log) expenses on the x-axis against age on the y-axis.
sns.scatterplot(data=df, x='expenses', y='age')

We can observe from the above plot as well that the age parameter affects our target variable (expenses).

We can convert the categorical variables into numerical variables, as they might have a significant impact in determining the target variable (expenses).

In [None]:
# converting sex,region and smoker columns
# One-hot encode the categorical columns. The dtype is pinned to uint8 because
# pandas >= 2.0 defaults to bool dummies, and a frame mixing bool and float
# columns is cast to object dtype by the sm.OLS calls further down, which fails.
df = pd.get_dummies(data=df, columns=['sex', 'region', 'smoker'], dtype='uint8')

In [None]:
# Inspect the frame after one-hot encoding.
df.head(n=5)

In [None]:
# Independent snapshot of the encoded frame, used later to rebuild X and y.
df1 = df.copy(deep=True)

In [None]:
# Feature matrix (every column except the target) and target vector
# for the regression model.
y = df['expenses']
X = df.drop(columns=['expenses'])

In [None]:
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=1,
)

#  Model Building

In [None]:
# Ordinary least squares on the full feature set.
# NOTE: fitted on all of X / y, so the metrics reported below are in-sample.
lin_reg = LinearRegression()
lin_reg.fit(X=X, y=y)

In [None]:
# Fitted slope per feature, in the column order of X.
coefficients = lin_reg.coef_
print('Co-efficients:', coefficients)

For every one-unit increase in the (square-root-transformed) bmi, the log of the expenses increases by about 0.15, holding the other features fixed.


In [None]:
# Coefficient of determination on the data the model was fitted on.
r2_full = lin_reg.score(X, y)
print('r2:', r2_full)

The r2 value comes to 0.759, which implies that the model explains around 76% of the variance in our target variable (log-transformed expenses) on the data it was fitted on.

In [None]:
from sklearn.metrics import mean_squared_error as mse

In [None]:
# RMSE groundwork: predictions on X, then the mean squared error against y.
y_pred = lin_reg.predict(X)
mse(y_true=y, y_pred=y_pred)

In [None]:
# Root mean squared error, computed from the actual predictions instead of a
# hand-copied MSE literal, so the value stays in sync with the fitted model.
rmse = np.sqrt(mse(y, y_pred))

In [None]:
# Report the root mean squared error.
rmse_label = 'The rmse value is:'
print(rmse_label, rmse)

The RMSE value is low (note that it is measured on the log-transformed expenses scale), which implies that the prediction error of the model is small. Hence the created model is performing well.


In [None]:
from sklearn.metrics import mean_absolute_error as mae

In [None]:
# Mean absolute error of the predictions against y.
mae(y_true=y, y_pred=y_pred)

Mean absolute error is the average of the absolute differences between the predicted and the actual values.
We get a value of 0.27, which is very low. Hence the performance of the model is good.

In [None]:
# statsmodels OLS on the full feature set, with an explicit intercept column.
# Note that this rebinds lin_reg from the sklearn estimator to the OLS results.
X_constant = sm.add_constant(X)
ols_model = sm.OLS(endog=y, exog=X_constant)
lin_reg = ols_model.fit()
lin_reg.summary()

In [None]:
# Larger correlation heatmap, now including the one-hot encoded columns.
fig_d = (10, 10)
fig, ax = plt.subplots(figsize=fig_d)
corr = df.corr()
ax = sns.heatmap(data=corr, annot=True, ax=ax)
# Widen the y-limits by half a cell at each end.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)

In [None]:
# Keep the features whose absolute correlation with the target exceeds 0.5.
cor_target = corr['expenses'].abs()
strongly_correlated = cor_target > 0.5
relevant_features = cor_target.loc[strongly_correlated]
relevant_features

From the above output we can say that the independent variables age and smoker_no / smoker_yes affect the prediction of our target variable (expenses) the most.

We will use LASSO method as well to check for the best features.

In [None]:
# LASSO with a cross-validated penalty; plot the fitted coefficient per feature.
reg = LassoCV()
reg.fit(X, y)
coef = pd.Series(data=reg.coef_, index=X.columns)
imp_coef = coef.sort_values()
# Changes the default figure size for this and all later plots.
matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
imp_coef.plot.barh()
plt.title('Important features using LASSO')

The lasso model picked smoker_no, bmi, age, sex_female, region_northeast and children as the significant features in predicting the expenses (target variable).


By comparing the results from both methods we will finalise the variables to be taken.
We will take the variables sex_female, smoker_no, age, children and region_northeast for our further modelling.

In [None]:
# Rebuild the feature matrix from the encoded snapshot, without the target and
# the de-selected columns.
# NOTE(review): this keeps more columns than the five named in the text above
# -- confirm which selection is intended.
dropped_columns = ['expenses', 'bmi', 'region_southeast', 'region_southwest']
X = df1.drop(columns=dropped_columns)
y = df1['expenses']

We have created X and y again. Now we will create a model with them.

In [None]:
# statsmodels OLS on the reduced feature set, with an explicit intercept column.
X_constant = sm.add_constant(X)
ols_model = sm.OLS(endog=y, exog=X_constant)
lin_reg = ols_model.fit()
lin_reg.summary()

In [None]:
# 70/30 train/test split of the reduced feature set, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=1,
)

In [None]:
# Fit on the training split only. The model was previously fitted on all of
# X / y, so the "test" score reported below came from rows it had already seen
# and said nothing about generalisation.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

In [None]:
# Coefficient of determination over the complete reduced data set.
r2_all = lin_reg.score(X, y)
print('r2:', r2_all)

In [None]:
# Same 70/30 split as before (same X, y and random_state).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=1,
)

In [None]:
# R^2 of the model on the training rows and on the test rows.
r2_train = lin_reg.score(X_train, y_train)
r2_test = lin_reg.score(X_test, y_test)
print('r2 for train:', r2_train)
print('r2 for test:', r2_test)

The R2 value for the test data is greater than that of our train data.
Hence the model created is performing very well.
The training is done with optimal features, which in turn facilitates good performance on the test data, which is unknown to the model.

It explains about 77% of the variance in the test data.

# Implications of the model

The expenses are affected the most by age and smoking habits.
People from the northeast region tend to pay more in expenses (medical cost).
Some of the vital features affect the expenses the most.
So the client can focus more on customers who smoke and on older people.