### Suppress Warnings and import all the relevant packages and libraries

In [None]:
# Supress Warnings

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Import the necessary packages and libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', 50)

from sklearn.model_selection import train_test_split

import statsmodels.api as sm

from statsmodels.stats.outliers_influence import variance_inflation_factor

plt.style.use('ggplot')

### Reading and preparing the data

In [None]:
# Load the insurance dataset from the input directory and preview the first rows

DATA_PATH = '../input/insurance-premium-prediction/insurance.csv'
insurance = pd.read_csv(DATA_PATH)
insurance.head()

In [None]:
# Checking the shape of the dataframe as a (rows, columns) tuple

insurance.shape

So there are 1338 records and 7 columns (6 predictors plus the target, `expenses`).

In [None]:
# Checking the info (dtype and non-null count) for all the columns

insurance.info()

### Identifying the categorical and continuous variables

In [None]:
# Number of distinct values per column, sorted in ascending order
insurance.nunique().sort_values()

### Visualising the numeric variables

In [None]:
# Pairplot restricted to the numeric columns

numeric_cols = ['age','bmi','expenses']
sns.pairplot(insurance, vars=numeric_cols)
plt.show()

### Visualising the categorical variables

In [None]:
# Box plots of the target variable 'expenses' against each categorical variable,
# drawn side by side in the first row of a 2x4 grid

plt.figure(figsize=(20, 12))
for position, column in enumerate(['sex', 'children', 'smoker', 'region'], start=1):
    plt.subplot(2, 4, position)
    sns.boxplot(x=column, y='expenses', data=insurance)

plt.show()

### Analysis between the target variable - expenses, and the other variables

In [None]:
# Analysis between sex and expenses

# x and y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally and raises a TypeError for barplot('sex', 'expenses', ...).
plt.figure(figsize=(10,4))
sns.barplot(x='sex', y='expenses', data=insurance)
plt.title('Expenses among the genders',fontsize=12)
plt.show()

In [None]:
# Analysis between children and expenses

# x and y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally and raises a TypeError for barplot('children', 'expenses', ...).
plt.figure(figsize=(10,4))
sns.barplot(x='children', y='expenses', data=insurance)
plt.title('Expenses with respect to the number of children',fontsize=12)
plt.show()

In [None]:
# Analysis between smoker and expenses

# x and y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally and raises a TypeError for barplot('smoker', 'expenses', ...).
plt.figure(figsize=(10,4))
sns.barplot(x='smoker', y='expenses', data=insurance)
plt.title('Expenses with respect to smoker',fontsize=12)
plt.show()

In [None]:
# Analysis between region and expenses

# x and y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally and raises a TypeError for barplot('region', 'expenses', ...).
plt.figure(figsize=(10,4))
sns.barplot(x='region', y='expenses', data=insurance)
plt.title('Expenses with respect to region',fontsize=12)
plt.show()

Observation:-
Expenses are highest for those individuals who belong to the south-east region.

In [None]:
# Scatter plot of expenses against age

ax = sns.scatterplot(data=insurance, x='age', y='expenses')
ax.set_title('Expenses vs Age')
plt.show()

In [None]:
# Scatter plot of expenses against bmi

ax = sns.scatterplot(data=insurance, x='bmi', y='expenses')
ax.set_title('BMI vs Expenses')
plt.show()

### Correlation between the variables

In [None]:
# Heatmap to visualise the correlation between the numeric variables

# Select the numeric columns explicitly: pandas 2.0+ raises on the string
# columns (sex, smoker, region) instead of silently skipping them, and
# select_dtypes works on older pandas versions as well.
numeric_corr = insurance.select_dtypes(include='number').corr()

plt.figure(figsize=(10, 5))
sns.heatmap(numeric_corr, cmap="YlGnBu", annot = True)
plt.title("Correlation between the variables")
plt.show()

In [None]:
# Replace the numeric child counts with descriptive labels so that 'children'
# can be handled as a categorical variable

children_labels = {
    0: 'No Children',
    1: 'One Child',
    2: 'Two Children',
    3: 'Three Children',
    4: 'Four Children',
    5: 'Five Children',
}
insurance['children'] = insurance['children'].map(children_labels)
insurance.head()

### Creating Dummy Variables

Creating dummy variables for - children, region, sex and smoker

In [None]:
# One dummy column per category level, dropping the first level of each variable.
# dtype=int forces 0/1 integer columns: newer pandas versions return bool dummies
# by default, which make the mixed float/bool design matrix object-dtype and
# break the statsmodels OLS fits and the VIF computation further down.
children_dummy = pd.get_dummies(insurance.children, drop_first=True, dtype=int)
region_dummy = pd.get_dummies(insurance.region, drop_first=True, dtype=int)
sex_dummy = pd.get_dummies(insurance.sex, drop_first=True, dtype=int)
smoker_dummy = pd.get_dummies(insurance.smoker, prefix='smoker', drop_first=True, dtype=int)

In [None]:
# Append the dummy columns to the right of the original dataframe

dummy_frames = [children_dummy, region_dummy, sex_dummy, smoker_dummy]
insurance = pd.concat([insurance, *dummy_frames], axis=1)
insurance.head()

In [None]:
# The dummy columns now carry this information, so the original categorical
# columns - children, region, sex and smoker - are removed

insurance.drop(columns=['children','region','sex','smoker'], inplace=True)
insurance.shape

### Splitting the Data into Training and Testing Sets

In [None]:
# 70/30 train/test split with a fixed random_state so the split is reproducible
np.random.seed(0)

insurance_train, insurance_test = train_test_split(
    insurance,
    train_size=0.7,
    random_state=100,
)

for split in (insurance_train, insurance_test):
    print(split.shape)

### Rescaling the Features using MinMax Scaling

In [None]:
# Min-max scaler instance; it is fitted on the training data in a later cell
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [None]:
# Creating a list of the numeric variables to be rescaled (the target 'expenses' is included)

num_vars=['age','bmi','expenses']

In [None]:
# Fit the scaler on the training rows and overwrite the numeric columns with
# their scaled values

scaled_train = scaler.fit_transform(insurance_train[num_vars])
insurance_train[num_vars] = scaled_train
insurance_train.head()

In [None]:
# Checking the summary statistics of the training columns after scaling the features
insurance_train.describe()

### Dividing into X and y sets for the model building

In [None]:
# Separate the target from the predictors. pop() removes 'expenses' in place,
# so X_train (the same object as insurance_train) keeps only the feature columns.
X_train = insurance_train
y_train = X_train.pop('expenses')

In [None]:
# Names of all the predictor columns, used to build the first model
col = X_train.columns
col

### Building our model

We will use statsmodels to build our model, starting with all the features and then removing them manually, one at a time, based on their p-values and VIF.

In [None]:
# Creating the linear model

# LinearRegression is not imported anywhere else in the notebook, so it is
# imported here; without it this cell raises a NameError on a fresh kernel.
from sklearn.linear_model import LinearRegression

# Note: `lm` is rebound to the statsmodels OLS results in the Model 1 cells
# below, so this scikit-learn fit is not used by the rest of the analysis.
lm = LinearRegression()
lm.fit(X_train, y_train)

### Model 1

In [None]:
# Selecting all the predictor columns (the constant is added in the next cell)

X_train_1 = X_train[col]

In [None]:
# Adding the constant column ('const') to the Model 1 design matrix
X_train_1 = sm.add_constant(X_train_1)

In [None]:
# Running the linear model: ordinary least squares of y_train on the Model 1 design matrix

lm = sm.OLS(y_train,X_train_1).fit()

In [None]:
# Summary of our linear model (its p-values drive the feature elimination below)
print(lm.summary())

In [None]:
# Remove the 'const' column again so the VIF table covers the predictors only

X_train_1_ = X_train_1.drop(columns=['const'])

In [None]:
# VIF of every predictor in Model 1, rounded to 2 decimals, highest first

X = X_train_1_
vif_values = [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])]
vif = pd.DataFrame({'Features': X.columns, 'VIF': vif_values})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

We will drop the variables one at a time, in the following order of priority:

* First, check the model summary and the VIF table.
* A variable with a high p-value (> 0.05) and a high VIF (> 5) is dropped first.
* A variable with a high p-value (> 0.05) but a low VIF (< 5) is dropped next.
* A variable with a low p-value (< 0.05) but a high VIF (> 5) is dropped last.

### Model 2

In [None]:
# Model 2 uses the Model 1 predictors minus 'male'

X_train_2 = X_train_1_.drop(columns=['male'])

In [None]:
# Adding the constant column ('const') to the Model 2 design matrix

X_train_2 = sm.add_constant(X_train_2)

In [None]:
# Running the linear model: ordinary least squares of y_train on the Model 2 design matrix

lm = sm.OLS(y_train,X_train_2).fit() 

In [None]:
# Summary of Model 2 (its p-values are used to pick the next variable to drop)
print(lm.summary())

In [None]:
# Remove the 'const' column again so the VIF table covers the predictors only

X_train_2_ = X_train_2.drop(columns=['const'])

In [None]:
# VIF of every predictor in Model 2, rounded to 2 decimals, highest first

X = X_train_2_
vif_values = [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])]
vif = pd.DataFrame({'Features': X.columns, 'VIF': vif_values})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

### Model 3

In [None]:
# Model 3 uses the Model 2 predictors minus 'Two Children'

X_train_3 = X_train_2_.drop(columns=['Two Children'])

In [None]:
# Adding the constant column ('const') to the Model 3 design matrix

X_train_3 = sm.add_constant(X_train_3)

In [None]:
# Running the linear model: ordinary least squares of y_train on the Model 3 design matrix

lm = sm.OLS(y_train,X_train_3).fit()

In [None]:
# Summary of Model 3 (its p-values are used to pick the next variable to drop)
print(lm.summary())

In [None]:
# Remove the 'const' column again so the VIF table covers the predictors only

X_train_3_ = X_train_3.drop(columns=['const'])

In [None]:
# VIF of every predictor in Model 3, rounded to 2 decimals, highest first

X = X_train_3_
vif_values = [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])]
vif = pd.DataFrame({'Features': X.columns, 'VIF': vif_values})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

### Model 4

In [None]:
# Model 4 uses the Model 3 predictors minus 'Four Children'

X_train_4 = X_train_3_.drop(columns=['Four Children'])

In [None]:
# Adding the constant column ('const') to the Model 4 design matrix

X_train_4 = sm.add_constant(X_train_4)

In [None]:
# Running the linear model: ordinary least squares of y_train on the Model 4 design matrix

lm = sm.OLS(y_train,X_train_4).fit()

In [None]:
# Summary of Model 4 (its p-values are used to pick the next variable to drop)
print(lm.summary())

In [None]:
# Remove the 'const' column again so the VIF table covers the predictors only

X_train_4_ = X_train_4.drop(columns=['const'])

In [None]:
# VIF of every predictor in Model 4, rounded to 2 decimals, highest first

X = X_train_4_
vif_values = [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])]
vif = pd.DataFrame({'Features': X.columns, 'VIF': vif_values})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

### Model 5

In [None]:
# Model 5 uses the Model 4 predictors minus 'Three Children'

X_train_5 = X_train_4_.drop(columns=['Three Children'])

In [None]:
# Adding the constant column ('const') to the Model 5 design matrix

X_train_5 = sm.add_constant(X_train_5)

In [None]:
# Running the linear model: ordinary least squares of y_train on the Model 5 design matrix

lm = sm.OLS(y_train,X_train_5).fit()

In [None]:
# Summary of Model 5 (its p-values are used to pick the next variable to drop)
print(lm.summary())

In [None]:
# Remove the 'const' column again so the VIF table covers the predictors only

X_train_5_ = X_train_5.drop(columns=['const'])

In [None]:
# VIF of every predictor in Model 5, rounded to 2 decimals, highest first

X = X_train_5_
vif_values = [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])]
vif = pd.DataFrame({'Features': X.columns, 'VIF': vif_values})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

### Model 6

In [None]:
# Model 6 uses the Model 5 predictors minus 'northwest'

X_train_6 = X_train_5_.drop(columns=['northwest'])

In [None]:
# Adding the constant column ('const') to the Model 6 design matrix

X_train_6 = sm.add_constant(X_train_6)

In [None]:
# Running the linear model: ordinary least squares of y_train on the Model 6 design matrix

lm = sm.OLS(y_train,X_train_6).fit()

In [None]:
# Summary of Model 6 (its p-values are used to pick the next variable to drop)
print(lm.summary())

In [None]:
# Remove the 'const' column again so the VIF table covers the predictors only

X_train_6_ = X_train_6.drop(columns=['const'])


In [None]:
# VIF of every predictor in Model 6, rounded to 2 decimals, highest first

X = X_train_6_
vif_values = [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])]
vif = pd.DataFrame({'Features': X.columns, 'VIF': vif_values})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

### Model 7

In [None]:
# Model 7 uses the Model 6 predictors minus 'southeast'

X_train_7 = X_train_6_.drop(columns=['southeast'])

In [None]:
# Adding the constant column ('const') to the Model 7 design matrix

X_train_7 = sm.add_constant(X_train_7)

In [None]:
# Running the linear model: ordinary least squares of y_train on the Model 7 design matrix

lm = sm.OLS(y_train,X_train_7).fit()

In [None]:
# Summary of Model 7, the final model used for the residual analysis and the test predictions
print(lm.summary())

In [None]:
# Remove the 'const' column again so the VIF table covers the predictors only

X_train_7_ = X_train_7.drop(columns=['const'])

In [None]:
# VIF of every predictor in Model 7, rounded to 2 decimals, highest first

X = X_train_7_
vif_values = [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])]
vif = pd.DataFrame({'Features': X.columns, 'VIF': vif_values})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

### Residual Analysis of the train data

To check whether the error terms are normally distributed (which is, in fact, one of the major assumptions of linear regression), let us plot a histogram of the error terms and see what it looks like.

In [None]:
# Refit the final model (Model 7) and compute its predictions on the training data
lm = sm.OLS(y_train,X_train_7).fit()  #As obtained previously
y_train_count = lm.predict(X_train_7)  # predicted (min-max scaled) expenses for the training rows

In [None]:
# Distribution of the training residuals (actual minus predicted expenses)

residuals = y_train - y_train_count
fig = plt.figure()
sns.distplot(residuals, bins=20)
fig.suptitle('Error Terms', fontsize=20)    # figure heading
plt.xlabel('Errors', fontsize=18)           # x-axis label
plt.show()

### Applying the scaling on the test sets

In [None]:
# Only transform() is applied to the test data: the scaler keeps the parameters
# it learnt from the training data, so it is not fitted again here

num_vars = ['age','bmi','expenses']
scaled_test = scaler.transform(insurance_test[num_vars])
insurance_test[num_vars] = scaled_test
insurance_test.describe()

### Dividing into X_test and y_test

In [None]:
# Separate the target from the predictors. pop() removes 'expenses' in place,
# so X_test (the same object as insurance_test) keeps only the feature columns.
X_test = insurance_test
y_test = X_test.pop('expenses')

In [None]:
# Creating the X_test_m7 dataframe for the final Model 7: start from all the test
# predictors plus the constant column; the eliminated features are removed in a later cell

X_test_m7 = sm.add_constant(X_test)

In [None]:
# Inspect the columns of the test design matrix
X_test_m7.columns

In [None]:
# Keep exactly the columns (and column order) that Model 7 was trained on.
# Selecting by X_train_7.columns replaces a hard-coded list of the six eliminated
# features, so the test matrix cannot drift from the fitted model and the cell
# can be re-run without a KeyError.
X_test_m7 = X_test_m7[X_train_7.columns]

In [None]:
# Making predictions on the test data using the seventh (final) model

y_pred_m7 = lm.predict(X_test_m7)

### Model Evaluation

In [None]:
# Scatter of the actual test values against the predictions to see their spread

fig = plt.figure()
ax = plt.gca()
ax.scatter(y_test, y_pred_m7)
fig.suptitle('y_test vs y_pred', fontsize=20)   # figure heading
ax.set_xlabel('y_test', fontsize=18)            # x-axis label
ax.set_ylabel('y_pred', fontsize=16)            # y-axis label
plt.show()

In [None]:
# Actual vs predicted test values with a fitted regression line

point_style = {"color": "blue"}
line_style = {"color": "red"}
ax = sns.regplot(x=y_test, y=y_pred_m7, fit_reg=True, scatter_kws=point_style, line_kws=line_style)

ax.set_title('y_test vs y_pred', fontsize=20)   # plot heading
ax.set_xlabel('y_test', fontsize=18)            # x-axis label
ax.set_ylabel('y_pred', fontsize=16)            # y-axis label
plt.show()

### Calculation of R-square and Adjusted R-square values

In [None]:
# Evaluate R-square for the test dataset (y_test and the predictions are both in min-max scaled units)

from sklearn.metrics import r2_score
r2_score(y_test,y_pred_m7)

In [None]:
# Adjusted R^2 for the test dataset
# adj r2 = 1 - ((1 - R2) * (n - 1) / (n - p - 1))
#
# n = number of test observations
# p = number of independent variables in the final model ('const' is not counted)

# Imported here as well so that this cell does not depend on the previous one.
from sklearn.metrics import r2_score

# Every input is derived from the data instead of being pasted from earlier
# output, so the value stays correct if the split or the model changes.
r2_test = r2_score(y_test, y_pred_m7)
n_obs = X_test_m7.shape[0]
n_predictors = X_test_m7.shape[1] - 1   # X_test_m7 still contains the 'const' column

Adj_r2 = 1 - ((1 - r2_test) * (n_obs - 1) / (n_obs - n_predictors - 1))
print(Adj_r2)

For the training dataset, the R^2 value was 0.738 and adjusted R^2 value was 0.737.

For the testing dataset, the R^2 value obtained is 0.773 and adjusted R^2 value obtained is 0.769.




Hence the equation of our best fitted line is:-

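$ \text{expenses} = 0.1912 \times \text{age} + 0.1644 \times \text{bmi} - 0.0214 \times \text{No Children} - 0.0278 \times \text{One Child} - 0.0160 \times \text{southwest} + 0.3817 \times \mathrm{smoker\_yes} $

(Here age, bmi and expenses are on the min-max scaled range; the intercept reported in the Model 7 summary is not shown in this equation.)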


Overall we have a decent model, but we also acknowledge that we could do better. 

### Interpretations

* We have arrived at a reasonably good model for insurance expenses using the significant variables.

* The smoker_yes variable has the highest coefficient, 0.3817: with the other variables held fixed, being a smoker (smoker_yes = 1) raises the predicted (min-max scaled) expenses by 0.3817 units.

* The other significant variables having positive coefficients are age and bmi.

* There are some variables with negative coefficients too, like No Children, One Child and southwest. A negative coefficient suggests that, as the independent variable increases, the dependent variable tends to decrease, and vice-versa.