In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# To enable plotting graphs in Jupyter notebook
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

# To create linear regression model
from sklearn.linear_model import LinearRegression

# To check model performance
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

## Data import and initial EDA

In [None]:
df = pd.read_csv('../input/insurance-premium-prediction/insurance.csv')

First, we will attempt to understand the structural details of the data

In [None]:
df.sample(10)

In [None]:
df.info()

In [None]:
df.shape

## Missing Values 

In [None]:
df.isna().sum()

There don't seem to be any rows with null data.

In [None]:
df[df.duplicated()] #Check for Duplicated values

In [None]:
# Remove the one row
df = df.drop_duplicates()

# Confirm duplicates are gone
df.duplicated().sum()

### Understanding high-level details of the features

In [None]:
pd.set_option('display.float_format', lambda x: '%.3f' % x) # to display numbers in digits
df.describe(include='all').T

1. The age seems to vary from 18 to 64 years with the mean and median falling at roughly 39 years
2. The bmi information seems to be on the higher side for the given dataset with the mean and median at 30
3. The expenses feature seems to have a right skew, i.e. a tail of unusually high values stretches the distribution towards the higher side

In [None]:
df.describe(include='object').T

## Univariate Analysis

In [None]:
# While doing uni-variate analysis of numerical variables we want to study their central tendency 
# and dispersion.
# Let us write a function that will help us create boxplot and histogram for any input numerical 
# variable.
# This function takes the numerical column as the input and returns the boxplots 
# and histograms for the variable.
# Let us see if this help us write faster and cleaner code.
def histogram_boxplot(feature, figsize=(15,10), bins = None):
    """ Boxplot and histogram combined, stacked on a shared x-axis

    feature: 1-d feature array (e.g. one numerical DataFrame column)
    figsize: size of fig (default (15,10))
    bins: number of bins (default None / auto)

    The boxplot (top, 25% of the height) marks the mean via showmeans; the
    histogram (bottom, 75%) shows the mean as a green dashed line and the
    median as a solid black line. Nothing is returned; the figure is drawn
    on the current matplotlib backend.
    """
    sns.set(font_scale=2) # setting the font scale of the seaborn
    f2, (ax_box2, ax_hist2) = plt.subplots(nrows = 2, # Number of rows of the subplot grid= 2
                                           sharex = True, # x-axis will be shared among all subplots
                                           gridspec_kw = {"height_ratios": (.25, .75)},
                                           figsize = figsize
                                           ) # creating the 2 subplots
    # Pass the data as x= explicitly so the box is laid out along the shared x-axis
    # instead of relying on the meaning of the first positional argument.
    sns.boxplot(x=feature, ax=ax_box2, showmeans=True, color='red')
    # A falsy `bins` (None or 0) falls back to the automatic bin choice. The original
    # bins-branch referenced an undefined name `F` and raised NameError.
    sns.distplot(feature, kde=False, ax=ax_hist2, bins=bins or None) # For histogram
    ax_hist2.axvline(np.mean(feature), color='g', linestyle='--') # Add mean to the histogram
    ax_hist2.axvline(np.median(feature), color='black', linestyle='-') # Add median to the histogram

In [None]:
histogram_boxplot(df['age'])

In [None]:
histogram_boxplot(df['bmi'])

In [None]:
histogram_boxplot(df['children'])

In [None]:
histogram_boxplot(df['expenses'])

## Distribution of numerical variables

In [None]:
# lets plot histogram of all plots
from scipy.stats import norm
# Every numeric column gets its own slot in an 18x3 subplot grid.
all_col = df.select_dtypes(include=np.number).columns.tolist()
plt.figure(figsize=(17,75))

for slot, col_name in enumerate(all_col, start=1):
    plt.subplot(18,3,slot)
    plt.hist(df[col_name])
    plt.tight_layout()
    plt.title(col_name,fontsize=25)

plt.show()

## Outlier Analysis

In [None]:
# Names of all numeric columns; also used by the correlation analysis further down.
numeric_columns = df.select_dtypes(include=np.number).columns.tolist()

# Outlier detection: one boxplot per numeric column, whiskers at 1.5 * IQR.
plt.figure(figsize=(20,30))

for position, column in enumerate(numeric_columns, start=1):
    plt.subplot(5,4,position)
    plt.boxplot(df[column],whis=1.5)
    plt.tight_layout()
    plt.title(column)

plt.show()

Though there seem to be some outliers in the bmi and the expenses, these seem to be logical values, and it may make sense to keep them for further analysis.

## Bivariate Analysis

In [None]:
# Pairwise correlations between the numeric columns.
corr = df[numeric_columns].corr()

# Wide canvas so the annotated cells stay readable.
f, ax = plt.subplots(figsize=(28, 15))

# Annotated heatmap on a diverging palette centred at 0 and pinned to [-1, 1].
sns.heatmap(
    corr,
    ax=ax,
    cmap='seismic',
    annot=True,
    fmt=".1f",
    vmin=-1,
    vmax=1,
    center=0,
    square=False,
    linewidths=.7,
    cbar_kws={"shrink": .5},
);

In [None]:
corr['expenses'].sort_values(ascending = False)

## Let us look at the graph of those variables that are highly correlated with expenses

### Expenses vs. Age (coloured by BMI)

In [None]:
plt.figure(figsize=(15,13))
sns.scatterplot(y='expenses', x='age', hue='bmi', data=df);

## Model Building

In [None]:
X = df.drop('expenses',axis = 1)
y = df['expenses']

### Create Dummy Variables

In [None]:
X = pd.get_dummies(X, columns=['sex', 'smoker','region'], drop_first=True)
X.head()

In [None]:
#split the data into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Choose Model, train and evaluate

In [None]:
#Fitting linear model
lin_reg_model = LinearRegression()                                    
lin_reg_model.fit(X_train, y_train)                                  

#pred = linearregression.predict(X_test)    

In [None]:
# let us check the coefficients and intercept of the model

coef_df = pd.DataFrame(np.append(lin_reg_model.coef_.flatten(), lin_reg_model.intercept_), \
                       index=X_train.columns.tolist()+['Intercept'], columns=['Coefficients'])
print(coef_df)

### Evaluating Model performance

In [None]:
# MAPE
def mape(targets, predictions):
    """Mean absolute percentage error, expressed in percent.

    targets: array-like of actual values (must support element-wise arithmetic)
    predictions: array-like of predicted values, same shape as targets

    Each absolute error is divided by the magnitude of its target, so negative
    targets add to the score instead of cancelling positive terms. Targets
    equal to zero are not handled specially and yield inf/nan.
    """
    return np.mean(np.abs(targets - predictions) / np.abs(targets)) * 100

# Adjusted R^2
def adj_r2(ind_vars, targets, predictions):
    """Adjusted R^2: R^2 penalised for the number of predictors.

    ind_vars: 2-d feature matrix; rows are observations (n), columns are predictors (k)
    targets: actual values
    predictions: predicted values

    Returns 1 - (1 - R^2) * (n - 1) / (n - k - 1).
    """
    unexplained = 1 - r2_score(targets, predictions)
    n_obs, n_predictors = ind_vars.shape[0], ind_vars.shape[1]
    return 1 - (unexplained * (n_obs - 1) / (n_obs - n_predictors - 1))

# Model performance check
def model_perf(model, inp, out):
    """Score a fitted model on one data split.

    model: fitted estimator exposing .predict
    inp: feature matrix to predict on
    out: actual target values (object exposing .values, e.g. a pandas Series)

    Returns a one-row DataFrame with the columns RMSE, MAE, MAPE, R^2 and
    Adjusted R^2.
    """
    predicted = model.predict(inp)
    actual = out.values

    scores = {
        "RMSE": np.sqrt(mean_squared_error(actual, predicted)),
        "MAE": mean_absolute_error(actual, predicted),
        "MAPE": mape(actual, predicted),
        "R^2": r2_score(actual, predicted),
        "Adjusted R^2": adj_r2(inp, actual, predicted),
    }
    return pd.DataFrame(scores, index=[0])

In [None]:
# Checking model performance on train set
print('Training Performance\n')
print(model_perf(lin_reg_model, X_train, y_train))

In [None]:
# Checking model performance on test set
print('Test Performance\n')
print(model_perf(lin_reg_model, X_test, y_test))

### Multicollinearity Test

In [None]:
# to compute VIF, we first have to add a constant column having value 1 to our input variables

X1 = X.copy()
X1['const'] = 1
X1.head()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif_series1 = pd.Series([variance_inflation_factor(X1.values,i) for i in range(X1.shape[1])],index=X1.columns)
print('Series before feature selection: \n\n{}\n'.format(vif_series1))

In [None]:
# Method to drop all the multicollinear columns and choose which one we should drop
def treating_multicollinearity(high_vif_columns, x_train, x_test, y_train, y_test):
    """
    Compare the effect of dropping each candidate column on a linear model.

    For every name in high_vif_columns, all columns whose name starts with that
    name are removed from x_train / x_test, a LinearRegression without intercept
    is fitted on the reduced training data, and two figures are recorded:
    the adjusted R-squared on the training data and the RMSE on the test data.

    The models are fitted with fit_intercept=False, so the inputs are expected
    to carry their own constant column.

    Prints a table (one row per candidate) sorted by adjusted R-squared,
    best first. Returns nothing.
    """
    train_adj_rsq = []
    test_rmse = []

    # Refit once per candidate, leaving that candidate's columns out
    for prefix in high_vif_columns:
        keep_in_train = ~x_train.columns.str.startswith(prefix)
        keep_in_test = ~x_test.columns.str.startswith(prefix)
        reduced_train = x_train.loc[:, keep_in_train]
        reduced_test = x_test.loc[:, keep_in_test]

        candidate_model = LinearRegression(fit_intercept=False)
        candidate_model.fit(reduced_train, y_train)

        # Adjusted R-squared on the training data
        train_adj_rsq.append(
            adj_r2(reduced_train, y_train, candidate_model.predict(reduced_train))
        )

        # RMSE on the test data
        test_predictions = candidate_model.predict(reduced_test)
        test_rmse.append(np.sqrt(mean_squared_error(test_predictions, y_test)))

    # One row per dropped column, highest adjusted R-squared first
    comparison = pd.DataFrame(
        {
            "col": high_vif_columns,
            "Adj_rsq_after_dropping_col": train_adj_rsq,
            "Test RMSE": test_rmse,
        }
    )
    comparison = comparison.sort_values(by="Adj_rsq_after_dropping_col", ascending=False)

    print(comparison)

In [None]:
high_vif_columns = ["children"]

X_train1 = X_train.copy()
X_train1['constant'] = 1

X_test1 = X_test.copy()
X_test1['constant'] = 1

treating_multicollinearity(high_vif_columns, X_train1, X_test1, y_train, y_test)

In [None]:
lin_reg_model1 = LinearRegression(fit_intercept=False)
lin_reg_model1.fit(X_train1,y_train)

print('Training Performance\n')
print(model_perf(lin_reg_model1, X_train1, y_train))

In [None]:
# predicted values
fitted = lin_reg_model1.predict(X_train1)
residual = fitted - y_train.values

np.mean(residual)

In [None]:
print('Training Performance\n')
print(model_perf(lin_reg_model1, X_train1, y_train))

In [None]:
print('Test Performance\n')
print(model_perf(lin_reg_model1, X_test1, y_test))

In [None]:
lin_reg_model1 = LinearRegression(fit_intercept=False)
lin_reg_model1.fit(X_train1,y_train)

# let us check the coefficients and intercept of the model

coef_df = pd.DataFrame(lin_reg_model1.coef_.flatten(), \
                       index=X_train1.columns.tolist(), columns=['Coefficients'])
print(coef_df)

# model performance
print('\n\nTraining Performance\n')
print(model_perf(lin_reg_model1, X_train1, y_train))

In [None]:
import xgboost as xgb
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
import statsmodels.api as sm
from sklearn import preprocessing

### Select features and Find the best hyper-parameters for the regressors


In [None]:
# Linear Regression
# Reference OLS fit on the training split (inspect with lr.summary()).
# The features are cast to float because newer pandas versions emit boolean dummy
# columns, which statsmodels may reject when they are mixed with numeric columns.
X1 = sm.add_constant(X_train.astype(float))
ols = sm.OLS(y_train,X1)
lr = ols.fit()

# Backward elimination by p-value, fitted on the TRAINING split only so that the
# held-out test rows do not influence which features are kept.
# Repeatedly refit OLS and drop the feature with the largest p-value until every
# remaining feature is significant at the 5% level.
selected_features = list(X_train.columns)
while (len(selected_features)>0):
    X_1 = sm.add_constant(X_train[selected_features].astype(float))
    model = sm.OLS(y_train,X_1).fit()
    # The first p-value belongs to the added constant and is never a drop candidate
    p = pd.Series(model.pvalues.values[1:],index = selected_features)
    pmax = max(p)
    feature_with_p_max = p.idxmax()
    if(pmax>0.05):
        selected_features.remove(feature_with_p_max)
    else:
        break

print('The selected features are :', selected_features)

In [None]:
print(model.pvalues)

In [None]:
# XGboost
# Exhaustive 4-fold cross-validated search over depth, learning rate and tree count.
tuned_parameters = [{'max_depth': [5,10, 15, 20, 25, 30],'learning_rate':[0.001, 0.01, 0.1, 0.5], 'n_estimators': [100,150,200, 250, 300]}]
# verbosity=0 is the supported way to silence XGBoost (the old `silent` flag is
# obsolete); random_state pins the estimator so the search is repeatable.
regr = GridSearchCV(xgb.XGBRegressor(verbosity = 0, random_state = 0), tuned_parameters, cv=4)
regr.fit(X_train, y_train)
# Test-set predictions of the best estimator found by the search
y_true, y_pred2 = y_test, regr.predict(X_test)

print('The best hyper-parameters for XGBBoost are: ',regr.best_params_)

In [None]:
# AdaBoost
# Exhaustive 4-fold cross-validated search over learning rate and number of estimators.
tuned_parameters = [{'learning_rate': [0.1,1,2,3,4,5], 'n_estimators': [100,200,300, 400, 500]}]
# random_state pins the boosting randomness so the reported best parameters are repeatable
adaregr = GridSearchCV(AdaBoostRegressor(random_state=0), tuned_parameters, cv=4)
adaregr.fit(X_train, y_train)
# Test-set predictions of the best estimator found by the search
y_true, y_pred3 = y_test, adaregr.predict(X_test)

print('The best hyper-parameters for AdaBoost are: ', adaregr.best_params_ )

In [None]:
# Decision Tree
# Exhaustive 4-fold cross-validated search over the maximum tree depth.
tuned_parameters = [{'max_depth': [1,2,3,4,5,10, 15, 20, 25, 50, 100,200]}]
# random_state pins tie-breaking between equally good splits so the result is repeatable
regressor_dt = GridSearchCV(DecisionTreeRegressor(random_state=0), tuned_parameters, cv=4)
regressor_dt.fit(X_train, y_train)
# Test-set predictions of the best estimator found by the search
y_true, y_pred4 = y_test, regressor_dt.predict(X_test)

print('The optimum max_depth for Decision Tree is: ', regressor_dt.best_params_ )

In [None]:
# Random Forests
# Exhaustive 4-fold cross-validated search over tree depth and forest size.
tuned_parameters = [{'max_depth': [5,10, 15, 20, 50, 70], 'n_estimators': [10, 25, 50, 100,150, 200, 250]}]
# random_state pins bootstrap/feature sampling so the reported best parameters are repeatable
regr_rf = GridSearchCV(RandomForestRegressor(random_state=0), tuned_parameters, cv=4)
regr_rf.fit(X_train, y_train)
# Test-set predictions of the best estimator found by the search
y_true, y_pred5 = y_test, regr_rf.predict(X_test)

print('The best hyper-parameters for Random Forests are: ',regr_rf.best_params_)

In [None]:
# KNN
# Feature scaling is required for distance-based algorithms.
# The scaler is fitted on the training split only and then re-used on the test
# split: both splits share the same centre/scale, and no test-set statistics
# leak into preprocessing.
scaler = preprocessing.RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Exhaustive 4-fold cross-validated search over neighbour count and Minkowski power
tuned_parameters = [{'n_neighbors': [1,2,3,4,5,10,15,20], 'p': [1,2]}]
# NOTE: this rebinds `model`, which held the statsmodels OLS result in an earlier cell
model = GridSearchCV(KNeighborsRegressor(), tuned_parameters, cv=4)
model.fit(X_train_scaled, y_train)
# Test-set predictions of the best estimator found by the search
y_true, y_pred6 = y_test, model.predict(X_test_scaled)

print('The best hyper-parameters for KNN are: ', model.best_params_)

In [None]:
# SVM
# Exhaustive 4-fold cross-validated search over kernel, C and gamma, run on the
# scaled features prepared in the KNN cell.
tuned_parameters = [{'kernel': ['linear', 'rbf', 'poly'], 'C':[1, 2, 3, 5, 6, 7, 10], 'gamma': [0.0001, 0.001, 0.01, 0.1, 1]}]
svr_regr = GridSearchCV(SVR(), tuned_parameters, cv=4)
svr_regr.fit(X_train_scaled, y_train)
# Test-set predictions of the best estimator found by the search
y_true, y_pred7 = y_test, svr_regr.predict(X_test_scaled)

print('The best hyper-parameters for SVR are: ', svr_regr.best_params_)

In [None]:
# Build the models using the tuned hyper-parameters and fit
# NOTE(review): the hyper-parameters below are hard-coded rather than read from the
# grid searches' best_params_ - presumably copied from an earlier run; confirm they
# still match the search output.
# These assignments replace the GridSearchCV objects of the same names.

# Linear Regression (fitted on the scaled features, and predicted on them later)
regressor = LinearRegression()
regressor.fit(X_train_scaled, y_train)

# XGBoost - verbosity=0 replaces the obsolete `silent` flag
regr = xgb.XGBRegressor(learning_rate=0.1, max_depth=5, n_estimators=150, random_state = 0, verbosity = 0)
regr.fit(X_train, y_train)

# AdaBoost
adaregr = AdaBoostRegressor(random_state=0, learning_rate = 3, n_estimators=200)
adaregr.fit(X_train, y_train)

# Decision Tree
regressor_dt = DecisionTreeRegressor(random_state=0, max_depth = 10)
regressor_dt.fit(X_train,y_train)

# Random Forests
regr_rf = RandomForestRegressor(max_depth=15, random_state=0,
                             n_estimators=25)
regr_rf.fit(X_train, y_train)

# KNN (distance-based, so it uses the scaled features)
neigh = KNeighborsRegressor(n_neighbors = 3, metric = 'minkowski', p = 1)
neigh.fit(X_train_scaled, y_train)

# SVR (uses the scaled features)
svr_regr = SVR(gamma=0.0001, kernel = 'linear', C =10)
svr_regr.fit(X_train_scaled, y_train)

In [None]:
# Predict price using the models above

y_pred1 = regressor.predict(X_test_scaled)    # Linear regression
y_pred2 = regr.predict(X_test)                # XGBoost
y_pred3 = adaregr.predict(X_test)             # AdaBoost
y_pred4 = regressor_dt.predict(X_test)        # Decision Tree
y_pred5 = regr_rf.predict(X_test)             # Random Forests
y_pred6 = neigh.predict(X_test_scaled)        # KNN
y_pred7 = svr_regr.predict(X_test_scaled)     # SVR

In [None]:
# Find R Squared (R^2) Values
# Test-set R^2 for every model, rounded to three decimals.
r2_report = [
    ('The R^2 value for Linear Regression is        :', y_pred1),
    ('The R^2 value for XGBoost Regressor is        :', y_pred2),
    ('The R^2 value for AdaBoost Regressor is       :', y_pred3),
    ('The R^2 value for Decision Tree Regressor is  :', y_pred4),
    ('The R^2 value for Random Forests Regressor is :', y_pred5),
    ('The R^2 value for KNN Regressor is            :', y_pred6),
    ('The R^2 value for SVM Regressor is            :', y_pred7),
]
for label, preds in r2_report:
    print(label, round((r2_score(y_test, preds)), 3))

In [None]:
# Find adjusted R Squared (Adj R2) Values
# The predictions are for the test split, so the adjustment must use the number of
# test observations (and the test feature count), not the size of the full dataset.
# adj_r2(X_test, ...) takes both from X_test.shape.
adj_r2_report = [
    ('The Adj R2 value for Linear Regression is        :', y_pred1),
    ('The Adj R2 value for XGBoost Regressor is        :', y_pred2),
    ('The Adj R2 value for AdaBoost Regressor is       :', y_pred3),
    ('The Adj R2 value for Decision Tree Regressor is  :', y_pred4),
    ('The Adj R2 value for Random Forests Regressor is :', y_pred5),
    ('The Adj R2 value for KNN Regressor is            :', y_pred6),
    ('The Adj R2 value for SVM Regressor is            :', y_pred7),
]
for label, preds in adj_r2_report:
    print(label, round(adj_r2(X_test, y_test, preds), 3))

In [None]:
# Find Mean Squared Errors (MSE) and Root Mean Squared Errors (RMSE)
# Test-set MSE per model, in the same order as y_pred1 .. y_pred7.
MSE_lr, MSE_xgb, MSE_ada, MSE_dt, MSE_rf, MSE_knn, MSE_svr = (
    mean_squared_error(y_test, preds)
    for preds in (y_pred1, y_pred2, y_pred3, y_pred4, y_pred5, y_pred6, y_pred7)
)

# RMSE is the square root of the MSE, reported rounded to the nearest whole unit.
rmse_report = [
    ('The RMSE value for Linear Regression is        :', MSE_lr),
    ('The RMSE value for XGBoost Regressor is        :', MSE_xgb),
    ('The RMSE value for AdaBoost Regressor is       :', MSE_ada),
    ('The RMSE value for Decision Tree Regressor is  :', MSE_dt),
    ('The RMSE value for Random Forests Regressor is :', MSE_rf),
    ('The RMSE value for KNN Regressor is            :', MSE_knn),
    ('The RMSE value for SVM Regressor is            :', MSE_svr),
]
for label, mse in rmse_report:
    print(label, round(np.sqrt(mse)))

In [None]:
## Plot the actual vs predicted prices

In [None]:
plt.scatter(y_test, y_pred1)
plt.xlabel("Actual price")
plt.ylabel("Predicted price")
plt.title("Linear Regression")

In [None]:
plt.scatter(y_test, y_pred2)
plt.xlabel("Actual price")
plt.ylabel("Predicted price")
plt.title("Xgboost")

In [None]:
plt.scatter(y_test, y_pred3)
plt.xlabel("Actual price")
plt.ylabel("Predicted price")
plt.title("Adaboost")

In [None]:
plt.scatter(y_test, y_pred4)
plt.xlabel("Actual price")
plt.ylabel("Predicted price")
plt.title("Decision Tree")

In [None]:
plt.scatter(y_test, y_pred5)
plt.xlabel("Actual price")
plt.ylabel("Predicted price")
plt.title("Random Forest")

In [None]:
plt.scatter(y_test, y_pred6)
plt.xlabel("Actual price")
plt.ylabel("Predicted price")
plt.title("KNN")

In [None]:
plt.scatter(y_test, y_pred7)
plt.xlabel("Actual price")
plt.ylabel("Predicted price")
plt.title("SVM")