In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of
# every file found, one per line.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from scipy import stats
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
from scipy import stats
import statsmodels.stats.api as sms
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import accuracy_score, roc_auc_score, mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso, LassoCV, Ridge, RidgeCV, ElasticNet, ElasticNetCV
from sklearn.metrics import r2_score

### Importing Dataset

In [None]:
# Read the insurance CSV from the Kaggle input directory into a DataFrame
# and preview its first five rows (the default for head()).
df = pd.read_csv(filepath_or_buffer='/kaggle/input/insurance/insurance.csv')
df.head(5)

In [None]:
# Print a summary of the frame: column names, dtypes and non-null counts.
df.info()

### EDA

In [None]:
# One-hot encode the three categorical columns, dropping the first level of
# each so the dummies are not perfectly collinear with the intercept.
# - `columns` is given explicitly so the prefix list is always matched to the
#   intended columns instead of relying on dtype auto-detection.
# - `dtype=int` keeps the dummies numeric: pandas >= 2.0 returns bool columns by
#   default, which turns `Xc.values` into an object array and breaks the
#   statsmodels OLS / VIF cells further down.
df = pd.get_dummies(
    df,
    columns=["sex", "smoker", "region"],
    prefix=["sex", "smoker", "region"],
    drop_first=True,
    dtype=int,
)

In [None]:
# Preview the frame again to inspect the dummy columns created above.
df.head()

#### Charges for Smokers and Non-Smokers

In [None]:
# Side-by-side distributions of charges for smokers (left) and non-smokers (right).
# `sns.distplot` is deprecated, so use `histplot` with a KDE overlay and a
# density-normalised histogram, which is its documented replacement.
# Titles are set on the explicit Axes objects rather than through pyplot's
# implicit "current axes".
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

smoker_charges = df.loc[df['smoker_yes'] == 1, 'charges']
non_smoker_charges = df.loc[df['smoker_yes'] == 0, 'charges']

sns.histplot(smoker_charges, kde=True, stat='density', color='r', ax=axes[0])
axes[0].set_title('Distribution of Charges for Smokers')

sns.histplot(non_smoker_charges, kde=True, stat='density', color='g', ax=axes[1])
axes[1].set_title('Distribution of Charges for Non-Smokers')

plt.show()

For most smokers the charges lie between 15000 and 50000, revealing that smokers face higher charges than non-smokers.

For non-smokers the graph shows that the target variable is skewed to the right, i.e. most non-smokers face low charges. The graph also shows that there are many outliers in the data.

#### Charges for Males vs Females

In [None]:
# Box plot of charges split by the `sex_male` dummy (0 = female, 1 = male).
# x / y are passed by keyword: seaborn >= 0.12 only accepts `data` positionally,
# so the positional form raises a TypeError there.
plt.figure(figsize=(10, 8))
sns.boxplot(x='sex_male', y='charges', data=df)
plt.grid()
plt.show()

There is not much difference between the charges of men and women. We can also observe a lot of outliers, especially among females.

#### BMI vs Charges

In [None]:
# Scatter plot of charges against BMI on an explicitly created 12x8 figure.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=df, x='bmi', y='charges', ax=ax)
plt.show()

BMI above 30 is considered obese and we can see that as the BMI increases above 30 the charges rate shoots up.

#### Variation of Charges with number of Children

In [None]:
# Violin plot of the charges distribution for each number of children.
fig, ax = plt.subplots(figsize=(15, 7))
sns.violinplot(data=df, x='children', y='charges', ax=ax)
plt.show()

The violin plot clearly shows that people with 5 children have the lowest charges, but it is difficult to say anything about the highest charges. Let's plot a bar chart to clarify things.

In [None]:
# Bar chart of the mean charges for each number of children.
# The group means are computed once, and only for the `charges` column,
# instead of averaging every column twice.
mean_charges_by_children = df.groupby('children')['charges'].mean()

plt.figure(figsize=(10, 5))
# x / y are passed by keyword: seaborn >= 0.12 rejects positional x / y.
sns.barplot(x=mean_charges_by_children.index, y=mean_charges_by_children.values)
plt.grid()
plt.show()

Bingo.. The barchart clearly shows that people with 3 children have the highest charges. It also validates the result of the violinplot saying that people with 5 children have the lowest charges.

#### Correlation Matrix

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df.
fig, ax = plt.subplots(figsize=(15, 10))
correlations = df.corr()
sns.heatmap(correlations, annot=True, ax=ax)
plt.show()

The highest correlation with Charges is with Smokers and the lowest correlation is with different regions.

### Pre-processing the Data

The categorical features were converted into dummy features as the first step towards pre-processing.

The next step would be towards defining the features as independent and dependent in the form of X and y respectively, and finally scale the independent features X.

In [None]:
# Separate the predictors (X) from the target (y = charges).
y = df['charges']
X = df.drop(columns='charges')

# Hold out 30% of the rows for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=3
)

# Standardise the predictors: the scaler is fitted on the training split only
# and then applied unchanged to the test split.
ss = StandardScaler()
ss.fit(X_train)
X_trains = ss.transform(X_train)
X_tests = ss.transform(X_test)

## Regression -  Statistics Approach 

In [None]:
# Fit an ordinary least squares model of charges on all predictors.
# statsmodels does not add an intercept automatically, so a constant column
# is prepended to the design matrix first.
Xc = sm.add_constant(X)
model = sm.OLS(endog=y, exog=Xc)
lr = model.fit()
lr.summary()

#### Conclusions from the Statistical Summary

1. The p-value for the sex feature is 0.692, hence it is not statistically significant. We had deduced the same from the visualization: the charges are not biased towards any gender.
2. The charges depend very much on smoking, hence the p-value for that feature is 0.00.
3. The p-value for children is also 0, hence we can confidently say that the charges vary with the number of children a person has. The same goes for age and BMI.
4. Coming to the region columns, as the p-values are large it can again be concluded that the variation in charges does not depend on the region.
5. All the conclusions from the visualizations align with the statistical summary, but it should be kept in mind that visualizations can sometimes be misleading and may not always tell us what is really going on. We should always perform statistical analysis to confirm our beliefs.

### Assumptions of Linear Regression

#### 1. Multicollinearity

In [None]:
# Variance inflation factor for every column of the design matrix Xc
# (including the constant column).
# The result is stored under its own name: assigning it to `vif` would overwrite
# the imported `variance_inflation_factor` alias, so the cell would fail with
# "'list' object is not callable" when run a second time.
vif_values = [vif(Xc.values, col_idx) for col_idx in range(Xc.shape[1])]
pd.DataFrame(vif_values, index=Xc.columns, columns=['VIF'])

All the features have a VIF below 5, so none of them shows problematic multicollinearity and no feature needs to be dropped on that account.

#### 2. Linearity of the relationship

In [None]:
# Fitted values of the OLS model (predict() is called without new data).
pred = lr.predict()

# Actual charges against fitted values, with the regression line drawn in red.
fig, ax = plt.subplots()
sns.regplot(x=pred, y=y, line_kws={'color': 'red'}, ax=ax)
plt.show()

In [None]:
# Rainbow test for linearity on the fitted OLS results; it returns the
# F statistic and the corresponding p-value.
rainbow_result = sm.stats.diagnostic.linear_rainbow(lr)
fstat, pvalue = rainbow_result
print("The p-value is: ",pvalue)

The p-value is above 0.05, so we fail to reject the null hypothesis that the fit of the model using the full sample is the same as the fit using a central subset of the data. The linearity assumption is therefore reasonable and a linear regression model can be built.

#### 3. Normality of the residuals

In [None]:
# Q-Q plot of the OLS residuals against the normal distribution
# ('norm' is probplot's default reference distribution).
stats.probplot(lr.resid, dist='norm', plot=plt)
plt.show()

In [None]:
# Distribution of the OLS residuals: density-normalised histogram with a KDE
# overlay. `sns.distplot` is deprecated; `histplot(kde=True, stat='density')`
# is its documented replacement.
sns.histplot(lr.resid, kde=True, stat='density')
plt.show()

In [None]:
# Jarque-Bera test on the OLS residuals; the result unpacks into the test
# statistic and its p-value.
jb_result = stats.jarque_bera(lr.resid)
stat, pvalue = jb_result

print("The p-value is: ",pvalue)

The pvalue is much below the significance level. Hence the residuals are not normally distributed and we need to normalize it before proceeding further.

#### 4. Homoscedasticity

In [None]:
# Residuals against fitted values with a LOWESS trend line in red, used to
# eyeball whether the error variance is constant.
# x / y are passed by keyword: seaborn >= 0.12 only accepts `data` positionally,
# so the positional form raises a TypeError there.
sns.residplot(x=lr.predict(), y=lr.resid, lowess=True, line_kws={'color': 'red'})
plt.show()

In [None]:
# Goldfeld-Quandt heteroscedasticity test, called with the OLS residuals and
# the design matrix Xc. The result unpacks into three values, of which only
# the p-value is reported.
gq_result = sms.het_goldfeldquandt(lr.resid, Xc)
fval, pval, res = gq_result

print("The p-value is: ",pval)

As the p-value is above the significance level, we fail to reject the null hypothesis that the variance of the errors is constant across the range of the data.

#### 5. Autocorrelation

The Durbin-Watson value of 2.088 in the model summary shows that there is no autocorrelation in the residuals.

#### Removing insignificant Features
Now we try to remove the insignificant features whose pvalue is greater than 0.05 in the t-test performed in the statistical summary.

In [None]:
# Backward elimination: refit OLS repeatedly and, on each pass, drop the single
# predictor with the largest p-value until every remaining predictor is
# significant at the 0.05 level (or no predictors are left).
# Note: this reassigns the notebook-level X to the reduced feature set.
while len(X.columns) > 0:
    Xc1 = sm.add_constant(X)
    ols = sm.OLS(y, Xc1)
    model = ols.fit()

    # Skip the first entry, which is the p-value of the intercept.
    feature_pvalues = model.pvalues.iloc[1:]
    f = feature_pvalues.idxmax()

    if not feature_pvalues.max() > 0.05:
        break
    X = X.drop(f, axis=1)

print("The final features are:",X.columns)

In [None]:
# Refit OLS on the current X (the features kept by the elimination loop),
# with an intercept column added, and show the summary table.
Xc2 = sm.add_constant(X)
ols = sm.OLS(endog=y, exog=Xc2)
lr = ols.fit()
lr.summary()

#### The Rsquare value is 0.75.

In [None]:
# In-sample root mean squared error of the fitted OLS model, computed from
# its residuals.
error = lr.resid
mse = np.mean(np.square(error))
rmse = np.sqrt(mse)
rmse

#### The Root Mean Squared Error is 6056.

### Regression - Machine Learning Approach

#### Linear Regression

In [None]:
# Rebuild the full feature matrix and target from df.
y = df['charges']
X = df.drop(columns=['charges'])

# Default train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Fit a plain linear regression on the training split.
lr = LinearRegression()
lr.fit(X_train, y_train)

y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

# Report the test-set score and the RMSE on both splits.
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

print("The score is:",lr.score(X_test,y_test))
print("The RMSE for the training set is:",train_rmse)
print("The RMSE for the testing set is:",test_rmse)

#### A little bit of Preprocessing

In [None]:
# Expand the predictors to all degree-2 polynomial terms.
quad = PolynomialFeatures(degree=2)
x_quad = quad.fit_transform(X)

# Same split settings as before, now on the expanded features.
X_train, X_test, y_train, y_test = train_test_split(x_quad, y, random_state=0)

plr = LinearRegression()
plr.fit(X_train, y_train)

y_train_pred = plr.predict(X_train)
y_test_pred = plr.predict(X_test)

# The test RMSE is kept under `rmseLinear` for the final comparison table.
rmseTrain = np.sqrt(mean_squared_error(y_train, y_train_pred))
rmseLinear = np.sqrt(mean_squared_error(y_test, y_test_pred))

print("The score is:",plr.score(X_test,y_test))
print("The RMSE for the training set is:",rmseTrain)
print("The RMSE for the testing set is:",rmseLinear)

A little step of preprocessing can give us wonderful results. This shows the importance of preprocessing.

#### Feature Selection
Recursive Feature Selection

In [None]:
# Recursive feature elimination with a linear model, keeping 4 features.
# The table lists the RFE ranking of each column of X.
lr = LinearRegression()
rfe = RFE(estimator=lr, n_features_to_select=4)
rfe.fit(X, y)
pd.DataFrame({'Select': rfe.ranking_}, index=X.columns)

Hyperparameter Tuning is required as the number of features to select is not known. We will use GridSearchCV to tune the hyperparameters.

In [None]:
# Tune the number of features kept by RFE with a 3-fold grid search.
lr = LinearRegression()

# Candidate counts run from 1 to the number of predictors in X.
# (`len(df.columns)` would also count the target column `charges` and so
# request one more feature than X actually has.)
n_features = X.shape[1]
param_grid = [{'n_features_to_select': list(range(1, n_features + 1))}]

rfe = RFE(lr)
gsearch = GridSearchCV(rfe, param_grid=param_grid, cv=3, return_train_score=True)
gsearch.fit(X, y)

In [None]:
# Show the winning parameter setting, then the full cross-validation results
# as a table.
print(gsearch.best_params_)
cv_results = pd.DataFrame(gsearch.cv_results_)
cv_results

Putting the n_features_to_select value as best_params and building the model.

In [None]:
# Refit RFE with the number of features chosen by the grid search, instead of
# a hard-coded value that can silently drift from the search result.
best_n_features = gsearch.best_params_['n_features_to_select']

lr = LinearRegression()
rfe = RFE(lr, n_features_to_select=best_n_features)
rfe.fit(X, y)

# RFE ranking of each column of X.
pd.DataFrame(rfe.ranking_, index=X.columns, columns=['Rank'])

Finally after finding the best features, we move towards regularization methods.

### Regularization

#### Lasso:

In [None]:
# Split the degree-2 polynomial features with the same seed as before.
X_train, X_test, y_train, y_test = train_test_split(x_quad, y, random_state=0)

# Lasso regression with the default alpha and a raised iteration cap.
lassoModel = Lasso(max_iter=5000)
lasso = lassoModel.fit(X_train, y_train)
lassoPred = lasso.predict(X_test)

# Test-set error; rmseLasso feeds the final comparison table.
mseLasso = mean_squared_error(y_true=y_test, y_pred=lassoPred)
rmseLasso = mseLasso ** 0.5

print("The RMSE for the model is:",rmseLasso)
print("The Rsquare for the model is:",lasso.score(X_test, y_test))

The rmse is 4267 for Lasso Regression.

#### Ridge:

In [None]:
# Ridge regression with the default alpha, fitted on the current train split.
ridgeModel = Ridge(max_iter=5000)
ridge = ridgeModel.fit(X_train, y_train)
ridgePred = ridge.predict(X_test)

# Test-set error; rmseRidge feeds the final comparison table.
mseRidge = mean_squared_error(y_true=y_test, y_pred=ridgePred)
rmseRidge = mseRidge ** 0.5

print("The RMSE for the model is:",rmseRidge)
print("The Rsquare for the model is:",ridge.score(X_test, y_test))

The RMSE is 4278 for Ridge Regression.

#### ElasticNet

In [None]:
# ElasticNet regression (alpha=0.01, 90% L1 / 10% L2 mix) fitted on the
# current train split.
# The fitted estimator is stored as `elasticNet`, not `ElasticNet`: reusing the
# class name would overwrite the imported class, so re-running this cell would
# fail because the instance is not callable.
elasticNetModel = ElasticNet(alpha=0.01, l1_ratio=0.9, max_iter=5000)
elasticNet = elasticNetModel.fit(X_train, y_train)
elasticNetPred = elasticNet.predict(X_test)

# Test-set error; rmseElasticNet feeds the final comparison table.
mseElasticNet = mean_squared_error(y_test, elasticNetPred)
rmseElasticNet = mseElasticNet**(1/2)

print("The RMSE for the model is:",rmseElasticNet)
print("The Rsquare for the model is:",elasticNet.score(X_test, y_test))

The RMSE for ElasticNet Regression is approximately the same as for Ridge Regression (4278).

#### It is to be seen that there is almost negligible difference between the results of Lasso, Ridge and ElasticNet Regressions.

In [None]:
# Test-set RMSE of each model side by side.
# `rmseLinear` is the value stored by the polynomial-features linear regression cell.
# The first column header was misspelled "Regrssion"; it is corrected here.
performanceData = pd.DataFrame({
    "Regression": ["Linear", "Lasso", "Ridge", "Elasticnet"],
    "RMSE": [rmseLinear, rmseLasso, rmseRidge, rmseElasticNet],
})
performanceData