# Imports

In [1]:
from statsmodels.regression.linear_model import OLS
import statsmodels.api as sm
import pandas as pd
import os 
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,StandardScaler

#### Function for Linear Regression

In [2]:
def linear_regression(y_train,x_train):
    return sm.OLS(y_train,x_train).fit()

#### Function for obtaining x_train, x_test, y_train, y_test By splitting 'Independent' variable and 'Target' variable

In [3]:
def train_test(independent, target):
    # Keeping test_size = 0.10 and random_state = 123
    return train_test_split(independent,target, test_size=0.10, random_state=123)

#### Function for splitting 'Data' into 'Independent' variable and 'Target' variable

In [4]:
def ind_and_target(df,dependent):
    independent = df.drop(dependent,axis=1)
    target = df[dependent]
    return (independent,target)

In [5]:
def runner(df):
    independent,target=ind_and_target(df,'claim')
    x_train, x_test, y_train, y_test=train_test(independent, target)
    return linear_regression(y_train,x_train)

#### Function for Normalization

In [6]:
def normalization_fun(df_num):
    return pd.DataFrame(MinMaxScaler().fit_transform(df_num.values), columns=df_num.columns, index=df_num.index)

#### Function for Standardization

In [7]:
def standardization_fun(df_num):
    return pd.DataFrame(StandardScaler().fit_transform(df_num.values), columns=df_num.columns, index=df_num.index)

### FullModel without scaling

In [8]:
df = pd.read_csv("insurance.csv")

In [9]:
df_cat = df.select_dtypes(include='object')
df_num = df.select_dtypes(exclude='object')

In [10]:
# Encoding the Categorical data using N-1 dummy encoding
df_cat_encoded=pd.get_dummies(df_cat,drop_first=True)

In [11]:
# Merging the encoded data and numerical data
df_merged = pd.concat([df_num,df_cat_encoded],axis=1)

In [12]:
df_merged=sm.add_constant(df_merged)

In [13]:
# Calling the runner function and getting the model as return.
model = runner(df_merged)

In [14]:
model.summary()

0,1,2,3
Dep. Variable:,claim,R-squared:,0.748
Model:,OLS,Adj. R-squared:,0.746
Method:,Least Squares,F-statistic:,353.5
Date:,"Tue, 19 Jan 2021",Prob (F-statistic):,0.0
Time:,15:41:33,Log-Likelihood:,-12189.0
No. Observations:,1204,AIC:,24400.0
Df Residuals:,1193,BIC:,24460.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.259e+04,1581.370,-7.960,0.000,-1.57e+04,-9484.619
age,260.0316,12.567,20.691,0.000,235.375,284.688
bmi,338.4297,30.701,11.023,0.000,278.196,398.664
bloodpressure,6.3434,9.997,0.635,0.526,-13.270,25.957
children,475.6105,144.592,3.289,0.001,191.927,759.294
gender_male,-98.6054,351.853,-0.280,0.779,-788.925,591.715
diabetic_Yes,-252.3240,351.365,-0.718,0.473,-941.687,437.039
smoker_Yes,2.363e+04,437.267,54.046,0.000,2.28e+04,2.45e+04
region_northwest,-469.6572,502.294,-0.935,0.350,-1455.136,515.821

0,1,2,3
Omnibus:,283.928,Durbin-Watson:,1.927
Prob(Omnibus):,0.0,Jarque-Bera (JB):,690.078
Skew:,1.259,Prob(JB):,1.42e-150
Kurtosis:,5.722,Cond. No.,1100.0


### FullModel with Normalization

In [15]:
df = pd.read_csv("insurance.csv")

In [16]:
df_cat = df.select_dtypes(include='object')
df_num = df.select_dtypes(exclude='object')

In [17]:
# Encoding the Categorical data using N-1 dummy encoding
df_cat_encoded=pd.get_dummies(df_cat,drop_first=True)

In [18]:
# Scaling numerical data using Normaliztion or Min_Max Scaling
df_num_normalized = normalization_fun(df_num)

In [19]:
print(df_num_normalized)

           age       bmi  bloodpressure  children     claim
0     0.021739  0.320755       0.450000       0.0  0.251611
1     0.000000  0.479784       0.883333       0.2  0.009636
2     0.217391  0.458221       0.133333       0.6  0.053115
3     0.326087  0.180593       0.650000       0.0  0.333010
4     0.304348  0.347709       0.183333       0.0  0.043816
...        ...       ...            ...       ...       ...
1333  0.695652  0.404313       0.300000       0.6  0.151299
1334  0.000000  0.428571       0.966667       0.0  0.017305
1335  0.000000  0.563342       0.566667       0.0  0.008108
1336  0.065217  0.264151       0.683333       0.0  0.014144
1337  0.934783  0.353100       0.983333       0.0  0.447249

[1338 rows x 5 columns]


In [20]:
# Merging the encoded data and normalized numerical data
df_merged = pd.concat([df_num_normalized,df_cat_encoded],axis=1)

In [21]:
# Adding beta 0 value as constant
df_merged=sm.add_constant(df_merged)

In [22]:
# Calling the runner function and getting the model as return.
model = runner(df_merged)

In [23]:
model.summary()

0,1,2,3
Dep. Variable:,claim,R-squared:,0.748
Model:,OLS,Adj. R-squared:,0.746
Method:,Least Squares,F-statistic:,353.5
Date:,"Tue, 19 Jan 2021",Prob (F-statistic):,0.0
Time:,15:41:38,Log-Likelihood:,1109.2
No. Observations:,1204,AIC:,-2196.0
Df Residuals:,1193,BIC:,-2140.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.0496,0.012,-4.226,0.000,-0.073,-0.027
age,0.1909,0.009,20.691,0.000,0.173,0.209
bmi,0.2004,0.018,11.023,0.000,0.165,0.236
bloodpressure,0.0061,0.010,0.635,0.526,-0.013,0.025
children,0.0380,0.012,3.289,0.001,0.015,0.061
gender_male,-0.0016,0.006,-0.280,0.779,-0.013,0.009
diabetic_Yes,-0.0040,0.006,-0.718,0.473,-0.015,0.007
smoker_Yes,0.3772,0.007,54.046,0.000,0.364,0.391
region_northwest,-0.0075,0.008,-0.935,0.350,-0.023,0.008

0,1,2,3
Omnibus:,283.928,Durbin-Watson:,1.927
Prob(Omnibus):,0.0,Jarque-Bera (JB):,690.078
Skew:,1.259,Prob(JB):,1.42e-150
Kurtosis:,5.722,Cond. No.,11.0


### FullModel with Standardization

In [24]:
df = pd.read_csv("insurance.csv")

In [25]:
df_cat = df.select_dtypes(include='object')
df_num = df.select_dtypes(exclude='object')

In [26]:
# Encoding the Categorical data using N-1 dummy encoding
df_cat_encoded=pd.get_dummies(df_cat,drop_first=True)

In [27]:
# Scaling numerical data using Standardization or z-score scaling
df_num_standardized = standardization_fun(df_num)

In [28]:
print(df_num_standardized)

           age       bmi  bloodpressure  children     claim
0    -1.438764 -0.453646      -0.136906 -0.908614  0.298583
1    -1.509965  0.514186       1.347719 -0.078767 -0.953689
2    -0.797954  0.382954      -1.221823  1.580926 -0.728675
3    -0.441948 -1.306650       0.548305 -0.908614  0.719843
4    -0.513149 -0.289606      -1.050521 -0.908614 -0.776802
...        ...       ...            ...       ...       ...
1333  0.768473  0.054876      -0.650814  1.580926 -0.220551
1334 -1.509965  0.202511       1.633223 -0.908614 -0.914002
1335 -1.509965  1.022707       0.262801 -0.908614 -0.961597
1336 -1.296362 -0.798128       0.662507 -0.908614 -0.930361
1337  1.551686 -0.256799       1.690324 -0.908614  1.311053

[1338 rows x 5 columns]


In [29]:
# Merging the encoded data and standardized numerical data
df_merged = pd.concat([df_num_standardized,df_cat_encoded],axis=1)

In [30]:
# Adding beta 0 value as constant
df_merged=sm.add_constant(df_merged)

In [31]:
# Calling the runner function and getting the model as return.
model = runner(df_merged)

In [32]:
model.summary()

0,1,2,3
Dep. Variable:,claim,R-squared:,0.748
Model:,OLS,Adj. R-squared:,0.746
Method:,Least Squares,F-statistic:,353.5
Date:,"Tue, 19 Jan 2021",Prob (F-statistic):,0.0
Time:,15:41:38,Log-Likelihood:,-870.0
No. Observations:,1204,AIC:,1762.0
Df Residuals:,1193,BIC:,1818.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.3362,0.037,-9.080,0.000,-0.409,-0.264
age,0.3017,0.015,20.691,0.000,0.273,0.330
bmi,0.1704,0.015,11.023,0.000,0.140,0.201
bloodpressure,0.0092,0.014,0.635,0.526,-0.019,0.038
children,0.0473,0.014,3.289,0.001,0.019,0.076
gender_male,-0.0081,0.029,-0.280,0.779,-0.065,0.049
diabetic_Yes,-0.0208,0.029,-0.718,0.473,-0.078,0.036
smoker_Yes,1.9522,0.036,54.046,0.000,1.881,2.023
region_northwest,-0.0388,0.041,-0.935,0.350,-0.120,0.043

0,1,2,3
Omnibus:,283.928,Durbin-Watson:,1.927
Prob(Omnibus):,0.0,Jarque-Bera (JB):,690.078
Skew:,1.259,Prob(JB):,1.42e-150
Kurtosis:,5.722,Cond. No.,6.2


### Model with insignificant variable removed and without Scaling

In [33]:
df = pd.read_csv("insurance.csv")

In [34]:
df_cat = df.select_dtypes(include='object')
df_num = df.select_dtypes(exclude='object')

In [35]:
# Encoding the Categorical data using N-1 dummy encoding
df_cat_encoded=pd.get_dummies(df_cat,drop_first=True)

In [36]:
# Merging the encoded data and numerical data
df_merged = pd.concat([df_num,df_cat_encoded],axis=1)

In [37]:
# Adding beta 0 value as constant
df_merged=sm.add_constant(df_merged)

In [38]:
df_merged

Unnamed: 0,const,age,bmi,bloodpressure,children,claim,gender_male,diabetic_Yes,smoker_Yes,region_northwest,region_southeast,region_southwest
0,1.0,19,27.9,107,0,16884.92,0,0,1,0,0,1
1,1.0,18,33.8,133,1,1725.55,1,0,0,0,1,0
2,1.0,28,33.0,88,3,4449.46,1,1,0,0,1,0
3,1.0,33,22.7,119,0,21984.47,1,1,0,1,0,0
4,1.0,32,28.9,91,0,3866.86,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1333,1.0,50,31.0,98,3,10600.55,1,0,0,1,0,0
1334,1.0,18,31.9,138,0,2205.98,0,0,0,0,0,0
1335,1.0,18,36.9,114,0,1629.83,0,0,0,0,1,0
1336,1.0,21,25.8,121,0,2007.95,0,1,0,0,0,1


In [39]:
# removing insignificant variables(gender_male, bloodpressure, diabetic_Yes)
df_merged = df_merged.drop(['gender_male','bloodpressure','diabetic_Yes'],axis=1)

In [40]:
df_merged

Unnamed: 0,const,age,bmi,children,claim,smoker_Yes,region_northwest,region_southeast,region_southwest
0,1.0,19,27.9,0,16884.92,1,0,0,1
1,1.0,18,33.8,1,1725.55,0,0,1,0
2,1.0,28,33.0,3,4449.46,0,0,1,0
3,1.0,33,22.7,0,21984.47,0,1,0,0
4,1.0,32,28.9,0,3866.86,0,1,0,0
...,...,...,...,...,...,...,...,...,...
1333,1.0,50,31.0,3,10600.55,0,1,0,0
1334,1.0,18,31.9,0,2205.98,0,0,0,0
1335,1.0,18,36.9,0,1629.83,0,0,1,0
1336,1.0,21,25.8,0,2007.95,0,0,0,1


In [41]:
# Calling the runner function and getting the model as return.
model = runner(df_merged)

In [42]:
model.summary()

0,1,2,3
Dep. Variable:,claim,R-squared:,0.747
Model:,OLS,Adj. R-squared:,0.746
Method:,Least Squares,F-statistic:,505.7
Date:,"Tue, 19 Jan 2021",Prob (F-statistic):,0.0
Time:,15:41:39,Log-Likelihood:,-12190.0
No. Observations:,1204,AIC:,24400.0
Df Residuals:,1196,BIC:,24440.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.204e+04,1047.558,-11.492,0.000,-1.41e+04,-9983.409
age,259.5403,12.520,20.730,0.000,234.977,284.104
bmi,338.4505,30.567,11.072,0.000,278.480,398.421
children,473.3447,144.332,3.280,0.001,190.172,756.517
smoker_Yes,2.361e+04,435.113,54.263,0.000,2.28e+04,2.45e+04
region_northwest,-463.9446,500.498,-0.927,0.354,-1445.896,518.007
region_southeast,-1081.0480,508.173,-2.127,0.034,-2078.058,-84.039
region_southwest,-1032.8288,507.441,-2.035,0.042,-2028.402,-37.256

0,1,2,3
Omnibus:,285.921,Durbin-Watson:,1.926
Prob(Omnibus):,0.0,Jarque-Bera (JB):,698.73
Skew:,1.266,Prob(JB):,1.87e-152
Kurtosis:,5.742,Cond. No.,313.0


### Model with insignificant variable removed and with Normalization

In [43]:
df = pd.read_csv("insurance.csv")

In [44]:
df_cat = df.select_dtypes(include='object')
df_num = df.select_dtypes(exclude='object')

In [45]:
# Encoding the Categorical data using N-1 dummy encoding
df_cat_encoded=pd.get_dummies(df_cat,drop_first=True)

In [46]:
# Scaling numerical data using Normaliztion or Min_Max Scaling
df_num_normalized = normalization_fun(df_num)

In [47]:
# Merging the encoded data and Normalized numerical data
df_merged = pd.concat([df_num_normalized,df_cat_encoded],axis=1)

In [48]:
# Adding beta 0 value as constant
df_merged=sm.add_constant(df_merged)

In [49]:
# removing insignificant variables(gender_male, bloodpressure, diabetic_Yes)
df_merged = df_merged.drop(['gender_male','bloodpressure','diabetic_Yes'],axis=1)

In [50]:
# Calling the runner function and getting the model as return.
model = runner(df_merged)

In [51]:
model.summary()

0,1,2,3
Dep. Variable:,claim,R-squared:,0.747
Model:,OLS,Adj. R-squared:,0.746
Method:,Least Squares,F-statistic:,505.7
Date:,"Tue, 19 Jan 2021",Prob (F-statistic):,0.0
Time:,15:41:39,Log-Likelihood:,1108.7
No. Observations:,1204,AIC:,-2201.0
Df Residuals:,1196,BIC:,-2161.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.0491,0.010,-5.072,0.000,-0.068,-0.030
age,0.1906,0.009,20.730,0.000,0.173,0.209
bmi,0.2004,0.018,11.072,0.000,0.165,0.236
children,0.0378,0.012,3.280,0.001,0.015,0.060
smoker_Yes,0.3769,0.007,54.263,0.000,0.363,0.390
region_northwest,-0.0074,0.008,-0.927,0.354,-0.023,0.008
region_southeast,-0.0173,0.008,-2.127,0.034,-0.033,-0.001
region_southwest,-0.0165,0.008,-2.035,0.042,-0.032,-0.001

0,1,2,3
Omnibus:,285.921,Durbin-Watson:,1.926
Prob(Omnibus):,0.0,Jarque-Bera (JB):,698.73
Skew:,1.266,Prob(JB):,1.87e-152
Kurtosis:,5.742,Cond. No.,8.99


### Model with insignificant variable removed and with Standardization

In [52]:
df = pd.read_csv("insurance.csv")

In [53]:
df_cat = df.select_dtypes(include='object')
df_num = df.select_dtypes(exclude='object')

In [54]:
# Encoding the Categorical data using N-1 dummy encoding
df_cat_encoded=pd.get_dummies(df_cat,drop_first=True)

In [55]:
# Scaling numerical data using Standardization or z-score scaling
df_num_standardized = standardization_fun(df_num)

In [56]:
# Merging the encoded data and standardized numerical data
df_merged = pd.concat([df_num_standardized,df_cat_encoded],axis=1)

In [57]:
# Adding beta 0 value as constant
df_merged=sm.add_constant(df_merged)

In [58]:
# removing insignificant variables(gender_male, bloodpressure, diabetic_Yes)
df_merged = df_merged.drop(['gender_male','bloodpressure','diabetic_Yes'],axis=1)

In [59]:
# Calling the runner function and getting the model as return.
model = runner(df_merged)

In [60]:
model.summary()

0,1,2,3
Dep. Variable:,claim,R-squared:,0.747
Model:,OLS,Adj. R-squared:,0.746
Method:,Least Squares,F-statistic:,505.7
Date:,"Tue, 19 Jan 2021",Prob (F-statistic):,0.0
Time:,15:41:39,Log-Likelihood:,-870.49
No. Observations:,1204,AIC:,1757.0
Df Residuals:,1196,BIC:,1798.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.3499,0.031,-11.378,0.000,-0.410,-0.290
age,0.3011,0.015,20.730,0.000,0.273,0.330
bmi,0.1704,0.015,11.072,0.000,0.140,0.201
children,0.0471,0.014,3.280,0.001,0.019,0.075
smoker_Yes,1.9504,0.036,54.263,0.000,1.880,2.021
region_northwest,-0.0383,0.041,-0.927,0.354,-0.119,0.043
region_southeast,-0.0893,0.042,-2.127,0.034,-0.172,-0.007
region_southwest,-0.0853,0.042,-2.035,0.042,-0.168,-0.003

0,1,2,3
Omnibus:,285.921,Durbin-Watson:,1.926
Prob(Omnibus):,0.0,Jarque-Bera (JB):,698.73
Skew:,1.266,Prob(JB):,1.87e-152
Kurtosis:,5.742,Cond. No.,5.07


### BMI Binning

In [61]:
df = pd.read_csv("insurance.csv")

In [62]:
# Adding new variable 'bmi bin' using pd.cut function on 'bmi' variable
df['bmi bin']=pd.cut(df['bmi'], bins=[0,18.5,25,30,max(df['bmi'])],labels=['Underweight', 'Healthy', 'Overweight', 'Obese'])

In [63]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   age            1338 non-null   int64   
 1   gender         1338 non-null   object  
 2   bmi            1338 non-null   float64 
 3   bloodpressure  1338 non-null   int64   
 4   diabetic       1338 non-null   object  
 5   children       1338 non-null   int64   
 6   smoker         1338 non-null   object  
 7   region         1338 non-null   object  
 8   claim          1338 non-null   float64 
 9   bmi bin        1338 non-null   category
dtypes: category(1), float64(2), int64(3), object(4)
memory usage: 95.7+ KB


In [64]:
df_cat = df.select_dtypes(include=['object','category'])
df_num = df.select_dtypes(exclude=['object','category'])

In [65]:
# Removing bmi from numerical data
df_num = df_num.drop('bmi',axis=1)

In [66]:
# Encoding the Categorical data using N-1 dummy encoding
df_cat_encoded=pd.get_dummies(df_cat,drop_first=True)

In [67]:
# Merging the encoded data and numerical data
df_merged = pd.concat([df_num,df_cat_encoded],axis=1)

In [68]:
df_merged=sm.add_constant(df_merged)

In [69]:
# Calling the runner function and getting the model as return.
model = runner(df_merged)

In [70]:
model.summary()

0,1,2,3
Dep. Variable:,claim,R-squared:,0.753
Model:,OLS,Adj. R-squared:,0.75
Method:,Least Squares,F-statistic:,302.3
Date:,"Tue, 19 Jan 2021",Prob (F-statistic):,0.0
Time:,15:41:41,Log-Likelihood:,-12177.0
No. Observations:,1204,AIC:,24380.0
Df Residuals:,1191,BIC:,24450.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-6484.3620,1896.634,-3.419,0.001,-1.02e+04,-2763.247
age,260.6975,12.448,20.943,0.000,236.275,285.120
bloodpressure,6.4535,9.924,0.650,0.516,-13.018,25.925
children,478.7354,143.327,3.340,0.001,197.533,759.938
gender_male,-79.8393,348.552,-0.229,0.819,-763.683,604.005
diabetic_Yes,-268.7355,348.139,-0.772,0.440,-951.770,414.299
smoker_Yes,2.36e+04,433.326,54.463,0.000,2.27e+04,2.45e+04
region_northwest,-527.3607,497.967,-1.059,0.290,-1504.350,449.629
region_southeast,-696.9309,495.714,-1.406,0.160,-1669.501,275.639

0,1,2,3
Omnibus:,305.1,Durbin-Watson:,1.93
Prob(Omnibus):,0.0,Jarque-Bera (JB):,763.717
Skew:,1.34,Prob(JB):,1.45e-166
Kurtosis:,5.836,Cond. No.,1980.0


### BMI Binning with Normalization

In [71]:
df = pd.read_csv("insurance.csv")

In [72]:
# Adding new variable 'bmi bin' using pd.cut function on 'bmi' variable
df['bmi bin']=pd.cut(df['bmi'], bins=[0,18.5,25,30,max(df['bmi'])],labels=['Underweight', 'Healthy', 'Overweight', 'Obese'])

In [73]:
df_cat = df.select_dtypes(include=['object','category'])
df_num = df.select_dtypes(exclude=['object','category'])

In [74]:
# Removing bmi from numerical data
df_num = df_num.drop('bmi',axis=1)

In [75]:
# Encoding the Categorical data using N-1 dummy encoding
df_cat_encoded=pd.get_dummies(df_cat,drop_first=True)

In [76]:
# Scaling numerical data using Normaliztion or Min_Max Scaling
df_num_normalized = pd.DataFrame(MinMaxScaler().fit_transform(df_num.values), columns=df_num.columns, index=df_num.index)

In [77]:
# Merging the encoded data and normalized numerical data
df_merged = normalization_fun(df_num)

In [78]:
df_merged=sm.add_constant(df_merged)

In [79]:
# Calling the runner function and getting the model as return.
model = runner(df_merged)

In [80]:
model.summary()

0,1,2,3
Dep. Variable:,claim,R-squared:,0.097
Model:,OLS,Adj. R-squared:,0.095
Method:,Least Squares,F-statistic:,42.99
Date:,"Tue, 19 Jan 2021",Prob (F-statistic):,2.18e-26
Time:,15:41:42,Log-Likelihood:,341.76
No. Observations:,1204,AIC:,-675.5
Df Residuals:,1200,BIC:,-655.1
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0948,0.014,6.732,0.000,0.067,0.122
age,0.1915,0.017,11.079,0.000,0.158,0.225
bloodpressure,0.0018,0.018,0.102,0.919,-0.034,0.037
children,0.0392,0.022,1.806,0.071,-0.003,0.082

0,1,2,3
Omnibus:,365.068,Durbin-Watson:,1.9
Prob(Omnibus):,0.0,Jarque-Bera (JB):,796.406
Skew:,1.747,Prob(JB):,1.16e-173
Kurtosis:,4.916,Cond. No.,5.52


### BMI Binning with Standardization

In [81]:
df = pd.read_csv("insurance.csv")

In [82]:
# Adding new variable 'bmi bin' using pd.cut function on 'bmi' variable
df['bmi bin']=pd.cut(df['bmi'], bins=[0,18.5,25,30,max(df['bmi'])],labels=['Underweight', 'Healthy', 'Overweight', 'Obese'])

In [83]:
df_cat = df.select_dtypes(include=['object','category'])
df_num = df.select_dtypes(exclude=['object','category'])

In [84]:
# Removing bmi from numerical data
df_num = df_num.drop('bmi',axis=1)

In [108]:
# Encoding the Categorical data using One-Hot encoding
df_cat_encoded=pd.get_dummies(df_cat)

In [110]:
df_cat_encoded

Unnamed: 0,gender_female,gender_male,diabetic_No,diabetic_Yes,smoker_No,smoker_Yes,region_northeast,region_northwest,region_southeast,region_southwest
0,1,0,1,0,0,1,0,0,0,1
1,0,1,1,0,1,0,0,0,1,0
2,0,1,0,1,1,0,0,0,1,0
3,0,1,0,1,1,0,0,1,0,0
4,0,1,1,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1333,0,1,1,0,1,0,0,1,0,0
1334,1,0,1,0,1,0,1,0,0,0
1335,1,0,1,0,1,0,0,0,1,0
1336,1,0,0,1,1,0,0,0,0,1


In [100]:
# Scaling numerical data using Standardization or z-score scaling
df_num_standardized = standardization_fun(df_num)

In [101]:
# Merging the encoded data and standardized numerical data
df_merged = pd.concat([df_num_standardized,df_cat_encoded],axis=1)

In [102]:
df_merged=sm.add_constant(df_merged)

In [103]:
# Calling the runner function and getting the model as return.
model = runner(df_merged)

In [104]:
model.summary()

0,1,2,3
Dep. Variable:,claim,R-squared:,0.748
Model:,OLS,Adj. R-squared:,0.746
Method:,Least Squares,F-statistic:,353.5
Date:,"Tue, 19 Jan 2021",Prob (F-statistic):,0.0
Time:,15:42:26,Log-Likelihood:,-870.0
No. Observations:,1204,AIC:,1762.0
Df Residuals:,1193,BIC:,1818.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.2080,0.007,31.709,0.000,0.195,0.221
age,0.3017,0.015,20.691,0.000,0.273,0.330
bmi,0.1704,0.015,11.023,0.000,0.140,0.201
bloodpressure,0.0092,0.014,0.635,0.526,-0.019,0.038
children,0.0473,0.014,3.289,0.001,0.019,0.076
gender_female,0.1081,0.015,7.157,0.000,0.078,0.138
gender_male,0.0999,0.015,6.803,0.000,0.071,0.129
diabetic_No,0.1144,0.015,7.738,0.000,0.085,0.143
diabetic_Yes,0.0936,0.015,6.254,0.000,0.064,0.123

0,1,2,3
Omnibus:,283.928,Durbin-Watson:,1.927
Prob(Omnibus):,0.0,Jarque-Bera (JB):,690.078
Skew:,1.259,Prob(JB):,1.42e-150
Kurtosis:,5.722,Cond. No.,4.22e+16


### Q7. Z = bmi * smoker_Yes

In [105]:
df = pd.read_csv("insurance.csv")

In [106]:
df_cat = df.select_dtypes(include=['object','category'])
df_num = df.select_dtypes(exclude=['object','category'])

In [107]:
# Encoding the Categorical data using N-1 dummy encoding
df_cat_encoded=pd.get_dummies(df_cat,drop_first=True)

In [94]:
# Merging the encoded data and numerical data
df_merged = pd.concat([df_num,df_cat_encoded],axis=1)

In [95]:
df_merged=sm.add_constant(df_merged)

In [96]:
# Adding new variable Z = bmi * smoker_Yes
df_merged['z']= df_merged['bmi']*df_merged['smoker_Yes']

In [97]:
# Calling the runner function and getting the model as return.
model = runner(df_merged)

In [98]:
model.summary()

0,1,2,3
Dep. Variable:,claim,R-squared:,0.838
Model:,OLS,Adj. R-squared:,0.836
Method:,Least Squares,F-statistic:,559.1
Date:,"Tue, 19 Jan 2021",Prob (F-statistic):,0.0
Time:,15:41:42,Log-Likelihood:,-11924.0
No. Observations:,1204,AIC:,23870.0
Df Residuals:,1192,BIC:,23930.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2581.3869,1327.328,-1.945,0.052,-5185.546,22.772
age,264.5344,10.086,26.228,0.000,244.746,284.323
bmi,29.1299,27.417,1.062,0.288,-24.662,82.922
bloodpressure,2.3162,8.024,0.289,0.773,-13.426,18.058
children,567.0424,116.082,4.885,0.000,339.295,794.790
gender_male,-591.1028,282.993,-2.089,0.037,-1146.322,-35.883
diabetic_Yes,-155.8563,281.977,-0.553,0.581,-709.083,397.370
smoker_Yes,-2.122e+04,1779.753,-11.921,0.000,-2.47e+04,-1.77e+04
region_northwest,-620.5513,403.107,-1.539,0.124,-1411.429,170.327

0,1,2,3
Omnibus:,667.475,Durbin-Watson:,1.956
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4321.743
Skew:,2.586,Prob(JB):,0.0
Kurtosis:,10.707,Cond. No.,1610.0


Featured Engineering