In [None]:
#pip install ngboost
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
#from ngboost import NGBRegressor
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler 
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, LassoCV, ElasticNetCV, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, SCORERS
import matplotlib.pyplot as plt
import xgboost as xgb

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in files):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the insurance CSV from the Kaggle input directory into a DataFrame.
insurance = pd.read_csv(
    filepath_or_buffer='/kaggle/input/insurance/insurance.csv',
)

In [None]:
# Display summary statistics for the loaded insurance DataFrame.
insurance.describe()

In [None]:
# Preview the first five rows of the dataset.
insurance.iloc[:5]

In [None]:
# Pairwise relationships between the columns of the dataset.
sns.pairplot(data=insurance)

In [None]:
# Box plot of the charges column to show its spread and extreme values.
# The series is passed as `x=` explicitly: seaborn 0.11 warns on a positional
# vector and seaborn >= 0.12 interprets a lone positional argument as `data`,
# which changes the plot orientation.
sns.boxplot(x=insurance['charges'])

In [None]:
# Exclude outliers: keep only rows whose charges are strictly below the 90th percentile.
# `.copy()` makes the result an independent frame, so later column assignments
# on `insurance` do not raise SettingWithCopyWarning.
# NOTE: this rebinds `insurance`; re-running the cell without reloading the CSV
# trims a further 10% of the already-filtered rows.
charges_cutoff = insurance['charges'].quantile(0.90)
insurance = insurance.loc[insurance['charges'] < charges_cutoff].copy()

In [None]:
# Distribution of charges after outlier removal: density-normalised histogram with a KDE overlay.
# `sns.distplot` is deprecated; `histplot(kde=True, stat='density')` is its documented replacement.
sns.histplot(x=insurance['charges'], kde=True, stat='density')

In [None]:
# Print column names, dtypes and non-null counts of the filtered DataFrame.
insurance.info()

In [None]:
# Split the column names by dtype: object-typed columns are treated as
# categorical features, everything else as numeric variables.
categories = insurance.select_dtypes(include="object").columns.tolist()
variables = insurance.select_dtypes(exclude="object").columns.tolist()

# Exploratory Data Analysis

In [None]:
# Grid of scatterplots: one row per categorical column (used as hue) and one
# column per numeric variable (x-axis), always with 'charges' on the y-axis.
# 'charges' itself is dropped from the x-axis variables, since plotting it
# against itself carries no information.
x_variables = [col for col in variables if col != 'charges']
n_rows, n_cols = len(categories), len(x_variables)

# Width scales with the number of columns and height with the number of rows.
# squeeze=False keeps `axes` two-dimensional even for a single row or column.
f, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 8, n_rows * 4), squeeze=False)

for n, i in enumerate(categories):
    for m, j in enumerate(x_variables):
        ax = axes[n, m]
        sns.scatterplot(x=j, y='charges', hue=i, data=insurance, ax=ax)
        # Rotate tick labels without freezing them to pre-draw values.
        ax.tick_params(axis='x', labelrotation=45)
        # Titles are numbered (column, row) so they match the references in the text below.
        ax.set_title('Fig (' + str(m) + ',' + str(n) + '): ' + str(j).upper() + ' vs CHARGES scatterplot by ' + str(i).upper())

f.tight_layout()
    



It can be seen that older people tend to pay higher insurance charges (Fig (0,0), (0,1), (0,2)).
At every age or number of children, smokers are likely to pay higher insurance charges than non-smokers (Fig (0,1)).
For every BMI, smokers are likely to pay higher insurance charges than non-smokers, and the higher the BMI, the larger the difference in charges between smokers and non-smokers (Fig (1,1)).

In [None]:
# Integer-encode the categorical columns on an independent copy, so the
# assignment never targets a slice of another frame (avoids SettingWithCopyWarning).
insurance = insurance.copy()
for col in categories:
    insurance[col] = insurance[col].factorize()[0]

# Feature matrix and regression target.
X = insurance.drop(columns='charges')
y = insurance['charges']

# Mutual information between each feature and the target.
# - discrete_features marks the integer-encoded categorical columns, which would
#   otherwise be treated as continuous by the estimator.
# - random_state fixes the small noise the estimator adds to continuous
#   features, so the scores are reproducible between runs.
mi_scores = mutual_info_regression(
    X,
    y,
    discrete_features=X.columns.isin(categories),
    random_state=42,
)
feature_importance = pd.DataFrame(mi_scores, index=X.columns, columns=['Scores']).sort_values(by=['Scores'], ascending=False)

In [None]:
# Display the mutual-information scores per feature, highest first.
feature_importance

In [None]:
# Horizontal bar chart of the mutual-information score of each feature.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=feature_importance['Scores'], y=feature_importance.index, color='lightblue')

In [None]:
# Candidate regressors, all with default hyperparameters.
LR = LinearRegression()
LCV = LassoCV()
Rdg = Ridge()
base_elastic_net_model = ElasticNet()  # base estimator tuned by the grid search
# Seeded so that the gradient-boosting fit is reproducible between runs.
GBR = GradientBoostingRegressor(random_state=42)

The feature-importance scores above show that age is the most important factor in determining the insurance charge, followed by smoker status.


In [None]:
# Hold out 30% of the rows as a test set; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [None]:
# Standardise the features: the scaler is fitted on the training split only and
# then applied to both splits, so no test-set statistics leak into training.
SC = StandardScaler().fit(X_train)
X_train, X_test = SC.transform(X_train), SC.transform(X_test)

# Model Using ElasticNet 

In [None]:
# Hyperparameter grid for ElasticNet: regularisation strength and L1/L2 mix.
elasticnet_parameter_grid = dict(
    alpha=[0.1, 1.5, 10, 50, 100],
    l1_ratio=[0.1, 0.2, 0.5, 0.7, 0.95, 0.99],
)

# 5-fold cross-validated grid search, scored by negative mean squared error.
elasticnetgrid_model = GridSearchCV(
    base_elastic_net_model,
    elasticnet_parameter_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
elasticnetgrid_model.fit(X_train, y_train)

# Show the top-ranked parameter combination from the cross-validation results.
pd.DataFrame(elasticnetgrid_model.cv_results_).sort_values(by=['rank_test_score']).iloc[:1]

# Model Using XGBoost

In [None]:
# Predict charges for the test split with the fitted ElasticNet grid-search model.
y_pred=elasticnetgrid_model.predict(X_test)

In [None]:
# Root mean squared error (RMSE) of the current predictions on the test split.
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

In [None]:
# XGBoost regressor with squared-error loss.
# "reg:squarederror" replaces the deprecated objective name "reg:linear".
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)

xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

# RMSE on the test split, with arguments in scikit-learn's (y_true, y_pred) order.
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
# Residual plot: prediction error (predicted - actual) against the actual charges.
# `y_pred` is whatever the most recently executed model cell produced
# (the XGBoost cell directly above, in notebook order).
# x and y are passed by keyword because seaborn >= 0.12 rejects positional vectors.
residuals = y_pred - y_test
ax = sns.scatterplot(x=y_test, y=residuals)
ax.set_xlabel('Actual charges')
ax.set_ylabel('Residual (predicted - actual)')
# Zero-error reference line.
ax.axhline(0, color='red')

# Model Using GBR 

In [None]:


# Fit the gradient-boosting regressor on the training split and predict the test split.
y_pred = GBR.fit(X_train, y_train).predict(X_test)

# RMSE on the test split.
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))


In [None]:
# Residual plot: prediction error (predicted - actual) against the actual charges.
# `y_pred` is whatever the most recently executed model cell produced
# (the gradient-boosting cell directly above, in notebook order).
# x and y are passed by keyword because seaborn >= 0.12 rejects positional vectors.
residuals = y_pred - y_test
ax = sns.scatterplot(x=y_test, y=residuals)
ax.set_xlabel('Actual charges')
ax.set_ylabel('Residual (predicted - actual)')
# Zero-error reference line.
ax.axhline(0, color='red')