In [None]:
import pandas as pd
import numpy as np
import io
import math
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score,make_scorer, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV ,GridSearchCV
from sklearn.svm import SVR

In [None]:
# Read the insurance CSV into the working DataFrame.
DATA_PATH = '../input/insurance-premium-prediction/insurance.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Summarise the frame: column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics, transposed so that each feature is one row.
df.describe().transpose()

In [None]:
#check for outliers using IQR method
def outlier(col, data=None):
    """Print an IQR-based outlier summary for one numeric column.

    A value counts as an outlier when it lies more than 1.5 * IQR below the
    first quartile or above the third quartile.

    Parameters
    ----------
    col : str
        Name of the column to inspect.
    data : pandas.DataFrame, optional
        Frame to inspect. Defaults to the notebook-level ``df`` so that
        existing ``outlier(col)`` calls keep working.

    Returns
    -------
    None
        The summary (outlier percentage and count, mean, min, max) is printed.
    """
    frame = df if data is None else data
    values = frame[col]

    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - (iqr * 1.5)
    upper = q3 + (iqr * 1.5)

    # Count with a boolean mask instead of feeding np.where positions to .loc:
    # .loc is label-based, so positions only line up on a default RangeIndex.
    n = int(((values > upper) | (values < lower)).sum())
    l = len(frame)
    perc = (n / l) * 100 if l else 0.0  # guard against an empty frame
    print(f'{col}\n percentage= {perc}\n number={n}\n mean= {values.mean()}\n min={values.min()}\n max={values.max()}')

In [None]:
# Run the IQR outlier report on every numeric column.
# Selecting include='number' (rather than exclude='object') keeps string,
# categorical and datetime columns away from the quantile arithmetic.
for i in df.select_dtypes(include='number').columns:
    outlier(i)

In [None]:
#box plots of expenses per category, split by smoker status, to inspect the outliers visually
sns.set(font_scale=1.2)
plt.figure(figsize=(25, 25))

categorical_features = ['sex', 'region', 'children']
for position in range(1, len(categorical_features) + 1):
    feature = categorical_features[position - 1]
    plt.subplot(2, 2, position)
    box_ax = sns.boxplot(x=feature, y='expenses', hue='smoker', data=df)
    box_ax.set_xticklabels(box_ax.get_xticklabels())
    plt.ylabel('expenses')
    plt.xlabel(feature)

In [None]:
# Rows for the southeast region, highest expenses first.
df.loc[df['region'] == 'southeast'].sort_values(by='expenses', ascending=False)

In [None]:
# Rows for the southwest region, highest expenses first.
df.loc[df['region'] == 'southwest'].sort_values(by='expenses', ascending=False)

Observation:

The tables above suggest that, along with the smoking habit, the region a person lives in plays a role in the expenses they bear.

In [None]:
#per-feature variance of the numeric columns
# numeric_only=True is needed because df still holds string columns at this
# point (e.g. smoker, region); pandas >= 2.0 raises TypeError on them otherwise.
(df.var(numeric_only=True)).plot(kind='bar',ylim= (0,2))

In [None]:
# Pairwise relationships between the features, coloured by sex.
sns.pairplot(data=df, hue='sex')

In [None]:
# Age against expenses, coloured by smoker status.
plt.figure(figsize=(15, 15))
sns.scatterplot(data=df, x='age', y='expenses', hue='smoker')

Observation:
Smokers overall have higher expenses than non-smokers, and expenses have a roughly linear relationship with age, i.e. as age increases, expenses increase.

In [None]:
#manually encoding the smoker feature to 0 and 1
# Map explicitly and cast to int instead of relying on replace() to downcast a
# string column: a non-numeric smoker column would later be one-hot encoded by
# get_dummies instead of staying a single 0/1 feature.
# The identity entries (1 -> 1, 0 -> 0) keep the cell safe to re-run; any
# unexpected value becomes NaN and makes the int cast fail loudly.
smoker_codes = {"yes": 1, "no": 0, 1: 1, 0: 0}
df["smoker"] = df["smoker"].map(smoker_codes).astype("int64")

In [None]:
#one-hot encode the categorical features
df_dummy = pd.get_dummies(data=df)

In [None]:
# Preview the encoded frame.
df_dummy.head(n=5)

In [None]:
#annotated heatmap of the pairwise correlations among the encoded features
plt.figure(figsize=(15, 10))
feature_corr = df_dummy.corr()
sns.heatmap(data=feature_corr, annot=True)

Observation:
- no probable multicollinearity
- the smoker feature has the highest correlation with the target

In [None]:
from sklearn.preprocessing import StandardScaler
from scipy.stats import norm

In [None]:
#divide the dataset into independent (x) and dependent (y) variables
x = df_dummy.drop(columns='expenses')
# Keep the target as a 1-D Series: scikit-learn estimators expect y of shape
# (n_samples,), and several of them emit a DataConversionWarning when handed a
# single-column DataFrame.
y = df_dummy['expenses']

In [None]:
# Scaler instance used to standardise the continuous features.
scale= StandardScaler()

In [None]:
#standardise age and bmi, which are measured in different units
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics influence the scaling. Consider fitting it on
# the training rows only.
continuous_cols = ['age', 'bmi']
x[continuous_cols] = scale.fit_transform(x[continuous_cols])

In [None]:
#hold out 30% of the rows as the test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=101,
)

# Polynomial Regression

In [None]:
#degree-2 polynomial features let the linear model capture non-linear relations between the features and the target
poly = PolynomialFeatures(degree=2)
x_poly = poly.fit(x).transform(x)
X_train1, X_test1, y_train1, y_tes1 = train_test_split(
    x_poly, y, test_size=0.3, random_state=101
)

Observation:
Overfitting was observed at degrees above 2, so the optimum degree was chosen as 2.

In [None]:
# Least-squares fit on the polynomial features, then predictions for the training rows.
LR = LinearRegression(fit_intercept=True)
y_pred_poly = LR.fit(X_train1, y_train1).predict(X_train1)

In [None]:
# Training-set errors of the polynomial regression.
poly_train_mae = mean_absolute_error(y_train1, y_pred_poly)
poly_train_rmse = np.sqrt(mean_squared_error(y_train1, y_pred_poly))
print(f'MAE:{poly_train_mae}')
print(f'RMSE: {poly_train_rmse}')

In [None]:
# Test-set errors of the polynomial regression.
y_pred_poly_t = LR.predict(X_test1)
poly_test_mae = mean_absolute_error(y_tes1, y_pred_poly_t)
poly_test_rmse = np.sqrt(mean_squared_error(y_tes1, y_pred_poly_t))
print(f'MAE:{poly_test_mae}')
print(f'RMSE: {poly_test_rmse}')

# Hyperparameter Tuning of Non-Linear Models using R2 Score

In [None]:
# Estimators and the hyper-parameter grids searched for each of them.
# SEARCH_SEED pins the stochastic estimators so repeated runs of the search give
# the same result; it reuses the value passed to train_test_split.
SEARCH_SEED = 101

RFR = RandomForestRegressor(random_state=SEARCH_SEED)
params_RFR = {'n_estimators': [100, 500, 1000],
              'max_features': ['log2', 'sqrt'],
              'max_depth': [2, 3, 5],
              'min_samples_split': [2, 5, 10, 15, 20, 25],
              'min_samples_leaf': [1, 2, 4, 6, 8, 10]}

KNNR = KNeighborsRegressor()
params_KNNR = {'n_neighbors': list(range(2, 20, 2)),
               'weights': ['uniform', 'distance'],
               'leaf_size': list(range(1, 5)),
               'p': [1, 2]}

DTR = DecisionTreeRegressor(random_state=SEARCH_SEED)
params_DTR = {'max_depth': [2, 3, 5],
              'min_samples_split': [2, 5, 10, 15, 20, 25]}

GBR = GradientBoostingRegressor(random_state=SEARCH_SEED)
params_GBR = {'n_estimators': [10, 50, 100, 500],
              'max_depth': [2, 3, 5, 7],
              'min_samples_split': [2, 5, 10, 15, 20, 25],
              'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0],
              'min_samples_leaf': [1, 2, 4, 6, 8, 10],
              'max_features': ['log2', 'sqrt']}

Svr = SVR()
# 'degree' is only read by the polynomial kernel, so it is varied for that
# kernel alone. Crossing it with the other kernels would refit identical models
# seven times each. GridSearchCV accepts a list of grids and searches their union.
_svr_gamma = ['scale', 'auto']
_svr_C = [1, 10, 100, 1000, 10000]
params_SVR = [
    {'kernel': ['poly'],
     'degree': [1, 2, 3, 4, 5, 6, 7],
     'gamma': _svr_gamma,
     'C': _svr_C},
    {'kernel': ['linear', 'rbf', 'sigmoid'],
     'gamma': _svr_gamma,
     'C': _svr_C},
]

In [None]:
# Exhaustive 5-fold grid search for every estimator, scored by R^2.
# Each fitted search is kept in grid_results (keyed by the model label) so the
# best estimator and parameters stay available after the loop.
grid_results = {}
for estimator, param_grid, label in zip(
        [Svr, DTR, RFR, KNNR, GBR],
        [params_SVR, params_DTR, params_RFR, params_KNNR, params_GBR],
        ['SVR', 'DTR', 'RFR', 'KNNR', 'GBR']):
    # scoring='r2' is the built-in equivalent of make_scorer(r2_score).
    grid = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=5,
                        n_jobs=-1, scoring='r2')
    # np.ravel hands the estimators a 1-D target whether y_train is a Series or
    # a single-column DataFrame, avoiding a column-vector warning on every fit.
    grid.fit(X_train, np.ravel(y_train))
    grid_results[label] = grid
    print(f'{label}= ')
    print('-'.center(20,'-'))
    print(f'r2 score:{grid.best_score_}')
    print('Best Parameter=')
    print(grid.best_params_)
    print('xxx'.center(100,'-'))
    print('xxx'.center(100,'-'))

In [None]:
# Final gradient boosting model.
# NOTE(review): the hyper-parameters are presumably the best ones reported by
# the grid search above — verify against its output.
# random_state pins the per-split feature subsampling (max_features='sqrt') so
# the reported errors are reproducible between runs.
reg = GradientBoostingRegressor(
    learning_rate=0.1,
    max_depth=3,
    max_features='sqrt',
    min_samples_leaf=4,
    min_samples_split=15,
    n_estimators=100,
    random_state=101,
)

In [None]:
# Fit on the training rows. np.ravel passes a 1-D target whether y_train is a
# Series or a single-column DataFrame, which avoids the column-vector warning.
reg.fit(X_train, np.ravel(y_train))

In [None]:
# Predictions for the training rows, used for the training-set errors.
y_pred = reg.predict(X=X_train)

In [None]:
# Training-set errors of the gradient boosting model.
gbr_train_rmse = np.sqrt(mean_squared_error(y_train, y_pred))
gbr_train_mae = mean_absolute_error(y_train, y_pred)
print(f'RMSE: {gbr_train_rmse}')
print(f'MAE: {gbr_train_mae}')

In [None]:
# Test-set errors of the gradient boosting model.
y_pred1 = reg.predict(X_test)
gbr_test_rmse = np.sqrt(mean_squared_error(y_test, y_pred1))
gbr_test_mae = mean_absolute_error(y_test, y_pred1)
print(f'RMSE: {gbr_test_rmse}')
print(f'MAE: {gbr_test_mae}')

Conclusion:
It was found that the GradientBoosting Regressor gives a better model compared to the other models for this dataset, using the RMSE and MAE scoring metrics.