In [2]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import sklearn

In [3]:
## reading data 

data=pd.read_csv('E:/datasets/insurance.csv')

In [4]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
from sklearn.linear_model import LinearRegression

In [6]:
## independent variables 
X=data.drop('charges',axis=1)

In [7]:
# traget variables
Y=data.charges

In [8]:
# dummy variable creation 
data_dummy=pd.get_dummies(data[['sex','smoker','region']])

In [9]:
X=X.drop(['sex','smoker','region'],axis=1)

In [10]:
data_dummy

Unnamed: 0,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,1,0,0,1,0,0,0,1
1,0,1,1,0,0,0,1,0
2,0,1,1,0,0,0,1,0
3,0,1,1,0,0,1,0,0
4,0,1,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...
1333,0,1,1,0,0,1,0,0
1334,1,0,1,0,1,0,0,0
1335,1,0,1,0,0,0,1,0
1336,1,0,1,0,0,0,0,1


In [11]:
## combining dummy variable to other i.v
X=pd.concat([X,data_dummy],axis=1)

In [12]:
X.shape

(1338, 11)

In [13]:
## trying model on whole data 
lm=LinearRegression()

In [14]:
model=lm.fit(X,Y) 

In [15]:
## predicting on whole data 
pred_charge=model.predict(X)

In [16]:
## r square 
model.score(X,Y)

0.7509130345985208

In [17]:
 pd.Series(model.coef_)

0       256.856353
1       339.193454
2       475.500545
3        65.657180
4       -65.657180
5    -11924.267271
6     11924.267271
7       587.009235
8       234.045336
9      -448.012814
10     -373.041756
dtype: float64

In [18]:
X.columns

Index(['age', 'bmi', 'children', 'sex_female', 'sex_male', 'smoker_no',
       'smoker_yes', 'region_northeast', 'region_northwest',
       'region_southeast', 'region_southwest'],
      dtype='object')

In [19]:
## variable with different cofficients 
pd.concat([pd.Series(X.columns),pd.Series(model.coef_)],axis=1)

Unnamed: 0,0,1
0,age,256.856353
1,bmi,339.193454
2,children,475.500545
3,sex_female,65.65718
4,sex_male,-65.65718
5,smoker_no,-11924.267271
6,smoker_yes,11924.267271
7,region_northeast,587.009235
8,region_northwest,234.045336
9,region_southeast,-448.012814


In [20]:
## building model using statsmodels 
import pandas as pd
import numpy as np
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats




X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

  return ptp(axis=axis, out=out, **kwargs)


                            OLS Regression Results                            
Dep. Variable:                charges   R-squared:                       0.751
Model:                            OLS   Adj. R-squared:                  0.749
Method:                 Least Squares   F-statistic:                     500.8
Date:                Sat, 07 Mar 2020   Prob (F-statistic):               0.00
Time:                        12:22:38   Log-Likelihood:                -13548.
No. Observations:                1338   AIC:                         2.711e+04
Df Residuals:                    1329   BIC:                         2.716e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const             -296.4168    430.507  

In [21]:
## using significant variables based on P values 
X1=X.loc[:,['age','bmi','children','smoker_no','smoker_yes']]
X21 = sm.add_constant(X1)
est = sm.OLS(Y, X21)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:                charges   R-squared:                       0.750
Model:                            OLS   Adj. R-squared:                  0.749
Method:                 Least Squares   F-statistic:                     998.1
Date:                Sat, 07 Mar 2020   Prob (F-statistic):               0.00
Time:                        12:22:38   Log-Likelihood:                -13551.
No. Observations:                1338   AIC:                         2.711e+04
Df Residuals:                    1333   BIC:                         2.714e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       -131.3796    629.912     -0.209      0.8

In [23]:
## dividing data in train and test 
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.3)

In [24]:
x_train

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
1154,48,27.930,4,1,0,1,0,0,1,0,0
332,61,31.160,0,1,0,1,0,0,1,0,0
122,20,28.975,0,1,0,1,0,0,1,0,0
860,37,47.600,2,1,0,0,1,0,0,0,1
1072,21,31.255,0,0,1,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
210,20,33.000,1,0,1,1,0,0,0,0,1
459,40,33.000,3,1,0,1,0,0,0,1,0
1178,23,34.865,0,1,0,1,0,1,0,0,0
573,62,36.860,1,1,0,1,0,1,0,0,0


In [25]:
## model building on train data 
X2_train = sm.add_constant(x_train)
est = sm.OLS(y_train, X2_train)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:                charges   R-squared:                       0.760
Model:                            OLS   Adj. R-squared:                  0.758
Method:                 Least Squares   F-statistic:                     366.8
Date:                Sat, 07 Mar 2020   Prob (F-statistic):          4.65e-281
Time:                        12:23:16   Log-Likelihood:                -9484.1
No. Observations:                 936   AIC:                         1.899e+04
Df Residuals:                     927   BIC:                         1.903e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const             -313.4539    516.578  

  return ptp(axis=axis, out=out, **kwargs)


In [26]:
## model using only signicant variables based on p values 
X2_train1=x_train.loc[:,['age','bmi','children','smoker_no','smoker_yes']]

In [27]:
X2_train2 = sm.add_constant(X2_train1)
est = sm.OLS(y_train, X2_train2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:                charges   R-squared:                       0.759
Model:                            OLS   Adj. R-squared:                  0.758
Method:                 Least Squares   F-statistic:                     732.2
Date:                Sat, 07 Mar 2020   Prob (F-statistic):          1.12e-285
Time:                        12:23:18   Log-Likelihood:                -9486.2
No. Observations:                 936   AIC:                         1.898e+04
Df Residuals:                     931   BIC:                         1.901e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       -111.5689    747.407     -0.149      0.8

In [28]:
## prediction on train data 
x_train_pred=est2.predict(X2_train2)

In [29]:
from sklearn.metrics import mean_squared_error
from math import sqrt

rms_train = sqrt(mean_squared_error(y_train, x_train_pred))

In [30]:
## error on train data 
rms_train

6099.287162745421

In [31]:
X2_test1=x_test.loc[:,['age','bmi','children','smoker_no','smoker_yes']]
X2_test2 = sm.add_constant(X2_test1)

  return ptp(axis=axis, out=out, **kwargs)


In [32]:
## prediction on test data 
x_test_pred=est2.predict(X2_test2)

In [33]:
from sklearn.metrics import mean_squared_error
from math import sqrt

rms_test = sqrt(mean_squared_error(y_test, x_test_pred))

In [34]:
## error on test data 
rms_test

5982.043875677294