In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [None]:
# Load the insurance dataset from the Kaggle input directory.
df = pd.read_csv(filepath_or_buffer="/kaggle/input/insurance/insurance.csv")

In [None]:
# Preview the first five rows of the raw frame.
df.head(n=5)

That's great — we don't have any null values in our data.

## EDA

Let's check the distribution of age


<img src="https://media2.giphy.com/media/8JrcyXvpOaFbFIatkm/giphy.gif?cid=ecf05e470s9jkirkdnrg7dmysp4jotghq8hkkrfu6niiword&rid=giphy.gif&ct=g">

In [None]:
# Density histogram + KDE of policy-holder age.
# A single figure is created via plt.subplots and the axes are passed
# explicitly; histplot(kde=True, stat='density') is the supported replacement
# for the deprecated sns.distplot.
f, ax = plt.subplots(1, 1, figsize=(12, 8))
sns.histplot(df['age'], kde=True, stat='density', kde_kws=dict(cut=3),
             color='r', ax=ax)
ax.set_title('Distribution of Age');

In [None]:
# Density histogram + KDE of the target variable (insurance charges).
# A single figure is created via plt.subplots and the axes are passed
# explicitly; histplot(kde=True, stat='density') is the supported replacement
# for the deprecated sns.distplot.
f, ax = plt.subplots(1, 1, figsize=(12, 8))
sns.histplot(df['charges'], kde=True, stat='density', kde_kws=dict(cut=3),
             color='c', ax=ax)
ax.set_title('Distribution of Insurance Charges');

Here is the distribution of our dependent variable "y"; it is right-skewed. To make it closer to normal we can apply a log transform (base 10 in the cell below).

In [None]:
# Density histogram + KDE of log10(charges) to inspect the skew after a log
# transform. Uses histplot (distplot is deprecated) on explicitly passed axes
# and labels the figure so it stands alone.
f, ax = plt.subplots(1, 1, figsize=(12, 8))
sns.histplot(np.log10(df['charges']), kde=True, stat='density',
             kde_kws=dict(cut=3), color='r', ax=ax)
ax.set_title('Distribution of log10(Insurance Charges)');

In [None]:
# Total charges per region, sorted ascending, drawn as horizontal bars.
charges = df['charges'].groupby(df.region).sum().sort_values(ascending=True)

# Turn the Series into a two-column frame ('region', 'charges') so the plot
# can be specified with keyword arguments; positional x/y vectors are rejected
# by recent seaborn releases.
region_totals = charges.head().reset_index()

f, ax = plt.subplots(1, 1, figsize=(8, 6))
sns.barplot(data=region_totals, x='charges', y='region', palette='Blues', ax=ax);

So the Southeast has the highest total medical charges and the Southwest has the lowest. Taking certain factors into account (sex, smoking, having children), let's see how charges change by region.

In [None]:
# Charges per region, with one bar per sex within each region.
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 8))
ax = sns.barplot(
    data=df,
    x='region',
    y='charges',
    hue='sex',
    palette='cool',
)

Let's check the smoker data. Smokers are paying higher charges.


<img src="https://64.media.tumblr.com/30e4148d71908c9d41c5888808e8b00b/tumblr_mzoql331JV1tohycao1_400.gifv">

In [None]:
# Charges per region, with one bar per smoker status within each region.
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 8))
ax = sns.barplot(
    data=df,
    x='region',
    y='charges',
    hue='smoker',
    palette='Reds_r',
)

In [None]:
# Charges per region, with one bar per number of children within each region.
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 8))
ax = sns.barplot(
    data=df,
    x='region',
    y='charges',
    hue='children',
    palette='Set1',
)

As we can see from these barplots, the highest charges due to smoking are still in the Southeast, but the lowest are in the Northeast. People in the Southwest generally smoke more than people in the Northeast, but people in the Northeast have higher charges by gender than those in the Southwest and Northwest overall. People with children also tend to have higher medical costs overall.

Now let's analyze the medical charges by age, bmi and children according to the smoking factor


In [None]:
# One regression plot per numeric feature against charges, coloured by smoker
# status. Each sns.lmplot call creates its own figure; `ax` ends up bound to
# the object returned by the last call.
for feature, palette_name in (('age', 'Set1'), ('bmi', 'Set2'), ('children', 'Set3')):
    ax = sns.lmplot(data=df, x=feature, y='charges', hue='smoker', palette=palette_name)

Smoking has the highest impact on medical costs, even though the costs also grow with age, BMI and number of children. Also, people who have children generally smoke less, which the following violin plots show too.

In [None]:
# Correlation heatmap of the numeric columns.
# The frame also holds string columns (sex, smoker, region); restricting to
# numeric dtypes first keeps .corr() from raising on pandas versions that no
# longer silently drop non-numeric columns.
f, ax = plt.subplots(1, 1, figsize=(10, 10))
numeric_corr = df.select_dtypes(include='number').corr()
ax = sns.heatmap(numeric_corr, annot=True, cmap='cool', ax=ax)

# Feature scaling

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder

In [None]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones (categories unseen at fit time are ignored at transform time).
numeric_features = ["age", "bmi", "children"]
categorical_features = ["sex", "smoker", "region"]

ct = make_column_transformer(
    (StandardScaler(), numeric_features),
    (OneHotEncoder(handle_unknown="ignore"), categorical_features),
)

In [None]:
# Features are every column except the target; the target is `charges`.
X, y = df.drop(columns="charges"), df["charges"]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit the preprocessing on the training rows only.
ct.fit(X=X_train)

In [None]:
# Apply the fitted transformer to both splits (train first, then test).
X_train_normal, X_test_normal = (
    ct.transform(split) for split in (X_train, X_test)
)

To solve this regression problem we have multiple models to choose from, such as:

LinearRegression, 
Ridge, 
Lasso,
DecisionTreeRegressor,
KNeighborsRegressor,
SVR,
random_forest, 
AdaBoostRegressor, 
XGBRegressor,
LGBMRegressor,

We also have deep learning models. How do we decide which is best among them?


<img src="https://64.media.tumblr.com/3d343789f4fea02d39f249fa9c3703f0/tumblr_n156u4CNjE1tstqaho1_500.gifv">

We will use grid search to help us decide.

In [None]:
# Candidate regressors and the hyper-parameter grid searched for each.
# Layout: {display name: {'model': unfitted estimator, 'param': param_grid}}.
model_param = {
    'LinearRegression':{
        'model' : LinearRegression(),
        'param' : {}
    },
    'Ridge':{
        'model' : Ridge(),
        'param' : {
            'alpha': [0.5,0.75]
        }
    },
    'Lasso':{
        # `normalize` is not passed: the argument no longer exists in current
        # scikit-learn, and scaling is already handled by the column transformer.
        'model' : Lasso(fit_intercept=True, precompute=False, warm_start=False, positive=False, random_state=None, selection='cyclic'),
        'param' : {
            'alpha': [0.2, 0.3,0.5],
            'tol':[0.0001,0.001, 0.01, 0.1]
        }
    },
    'DecisionTreeRegressor':{
        'model':DecisionTreeRegressor(),
        'param':{
            # Current scikit-learn criterion names (formerly 'mse' / 'mae').
            'criterion': ['squared_error', 'friedman_mse', 'absolute_error']
        }
    },
    'KNeighborsRegressor':{
        'model': KNeighborsRegressor(),
        'param':{
            'n_neighbors':[5,10,15,20,25]
        }
    },
    'SVR':{
        'model': SVR(),
        'param': {
            'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
            'C':[1,5,10,20,30,50,80,100]
        }
    },
    'random_forest': {
        'model': RandomForestRegressor(),
        'param' : {
            # Current scikit-learn criterion names (formerly 'mse' / 'mae').
            'criterion': ['squared_error','absolute_error'],
            'n_estimators': [1000,1200,1500,2000,3000]
        }
    },
    'AdaBoostRegressor':{
        'model': AdaBoostRegressor(),
        'param':{
            "n_estimators": [1, 10,100],
            'learning_rate':[.001,0.01,.1]
        }
    },
    'XGBRegressor':{
        'model' : XGBRegressor(),
        'param':{
            'booster': ['gbtree','dart'],
            'gamma': [0.5, 1, 1.5, 2, 5],
            'max_depth': [3, 4, 5]
        }
    },
    'lgb':{
        'model':LGBMRegressor(),
        'param':{'learning_rate': [0.01], 'n_estimators': [8, 24],
                   'num_leaves': [6, 8, 12, 16], 'boosting_type': ['gbdt'],
                   # The target (charges) is continuous, so use the regression
                   # objective rather than a binary-classification one.
                   'objective': ['regression'], 'seed': [500],
                   'colsample_bytree': [0.65, 0.75, 0.8],
                   'subsample': [0.7, 0.75], 'reg_alpha': [1, 2, 6],
                   'reg_lambda': [1, 2, 6]
                 }
    }
}

In [None]:
import warnings

# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Run a 5-fold grid search for every candidate and record the best
# cross-validated score and the parameters that achieved it.
scores = []
for model_name, spec in model_param.items():
    search = GridSearchCV(
        estimator=spec['model'],
        param_grid=spec['param'],
        cv=5,
        return_train_score=False,
    )
    search.fit(X_train_normal, y_train)
    scores.append(
        dict(
            model=model_name,
            best_score=search.best_score_,
            best_params=search.best_params_,
        )
    )

In [None]:
# Tabulate the grid-search results, one row per candidate model.
score_columns = ['model', 'best_score', 'best_params']
df_model_score = pd.DataFrame(data=scores, columns=score_columns)
df_model_score

In [None]:
# Refit AdaBoost on the full training split with the selected hyper-parameters
# and predict on the held-out test split.
# The fitted estimator gets its own name so the AdaBoostRegressor class is not
# shadowed by an instance (which would make this cell fail when re-run).
ada_model = AdaBoostRegressor(learning_rate=0.001, n_estimators=100)
ada_model.fit(X_train_normal, y_train)
y_pred_ada = ada_model.predict(X_test_normal)

In [None]:
# Distribution of AdaBoost prediction errors (predicted - actual) on the test
# split. histplot(kde=True, stat='density') replaces the deprecated distplot.
ada_residuals = y_pred_ada - y_test
ax = sns.histplot(ada_residuals, kde=True, stat='density', kde_kws=dict(cut=3))
ax.set(title='AdaBoost residuals (predicted - actual)', xlabel='charges error');

In [None]:
# Mean absolute error of the AdaBoost predictions on the test split.
Abd_mae = mean_absolute_error(y_true=y_test, y_pred=y_pred_ada)
Abd_mae

### Applying a deep learning approach

In [None]:
import tensorflow as tf
import numpy as np

In [None]:
# Fix TensorFlow's global random seed before building the network.
tf.random.set_seed(seed=42)

In [None]:
# Three stacked Dense layers (100 -> 10 -> 1 units), compiled with MAE as both
# the loss and the reported metric, optimised with Adam.
# NOTE(review): no `activation` is passed to any layer; if Keras applies no
# non-linearity by default, this stack is equivalent to a linear model —
# confirm that is intended.
model = tf.keras.Sequential()
for units in (100, 10, 1):
    model.add(tf.keras.layers.Dense(units))

model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.mae,
    metrics=['mae'],
)



In [None]:
# Train for 200 epochs on the preprocessed training split.
history = model.fit(
    x=X_train_normal,
    y=y_train,
    epochs=200,
)

In [None]:
# Plot every series recorded during training against the epoch index.
history_ax = pd.DataFrame(history.history).plot()
history_ax.set_ylabel("loss")
history_ax.set_xlabel("epochs")
plt.show()

In [None]:
# Evaluate on the held-out split; returns the loss followed by the MAE metric.
model_loss, model_mae = model.evaluate(x=X_test_normal, y=y_test)
(model_loss, model_mae)

In [None]:
# Network predictions for the held-out split.
y_pred_tensor = model.predict(x=X_test_normal)

In [None]:
# Plain NumPy view of the test targets (drops the pandas index).
y_test_tensor = np.asarray(a=y_test)

In [None]:
# Distribution of the network's prediction errors (predicted - actual).
# The final layer is Dense(1), so predictions come back with a trailing axis of
# length 1; flatten them first so the subtraction is element-wise instead of
# broadcasting against the 1-D targets into an (n, n) matrix.
nn_residuals = np.ravel(y_pred_tensor) - y_test_tensor
ax = sns.histplot(nn_residuals, kde=True, stat='density', kde_kws=dict(cut=3))
ax.set(title='Neural network residuals (predicted - actual)', xlabel='charges error');

### AdaBoostRegressor wins with MAE of 2897