In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the insurance dataset from the Kaggle input directory into `df`
# and show its first rows as the cell output.
df=pd.read_csv('../input/insurance/insurance.csv')
df.head()

In [None]:
# Dimensions of the raw frame as a (rows, columns) tuple.
df.shape

In [None]:
# Summary statistics, transposed so that each described column becomes one row.
df.describe().transpose()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Correlation heatmap of the numeric columns of the raw frame.
# `sex`, `smoker` and `region` are only label-encoded later (on the copy `dff`),
# so they are presumably still string columns here. Since pandas 2.0,
# DataFrame.corr() raises a ValueError on non-numeric columns instead of
# silently dropping them, so the numeric columns are selected explicitly.
numeric_corr=df.select_dtypes(include='number').corr()
plt.figure(figsize=(15,7))
sns.heatmap(numeric_corr,annot=True,fmt=".2f")
plt.title('Correlation between numeric columns')
plt.show()

In [None]:
# Draw the transposed summary statistics as an annotated heatmap
# (values formatted to two decimals).
fig,ax=plt.subplots(figsize=(15,7))
summary_stats=df.describe().T
sns.heatmap(summary_stats,annot=True,fmt=".2f",ax=ax)
plt.show()

In [None]:
# Print pandas' per-column summary of the raw frame (dtypes and non-null counts).
df.info()

In [None]:
# Total number of missing cells in the whole frame:
# the first sum is per column, the second adds the columns together.
df.isna().sum().sum()

In [None]:
# Work on an independent copy so the raw frame `df` stays untouched
# by the encoding steps that follow.
dff=df.copy(deep=True)

In [None]:
from sklearn import preprocessing

In [None]:
# Single LabelEncoder instance; it is re-fitted for each categorical column it is
# applied to, so after use it only remembers the mapping of the last column.
lbe=preprocessing.LabelEncoder()

In [None]:
# Replace each categorical column of the working copy with integer codes.
# The shared encoder is re-fitted per column, in the order sex -> smoker -> region.
for column in ('sex','smoker','region'):
    dff[column]=lbe.fit_transform(dff[column])

In [None]:
# Preview the working copy after sex/smoker/region were label-encoded.
dff.head()

In [None]:
# Number of rows per encoded `sex` value.
dff['sex'].value_counts()

In [None]:
# Number of rows per encoded `smoker` value.
dff['smoker'].value_counts()

In [None]:
# Number of rows per encoded `region` value.
dff['region'].value_counts()

In [None]:
# Preview the encoded frame once more before the target column is split off.
dff.head()

In [None]:
# Target: the `charges` column. Features: every other column of the encoded frame.
y=dff['charges']
x=dff.drop(columns=['charges'])

In [None]:
# Preview the feature matrix (the encoded frame without `charges`).
x.head()

In [None]:
# First five target values (positional slice).
y.iloc[:5]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts=train_test_split(x,y,test_size=0.25,random_state=42)
x_train,x_test,y_train,y_test=split_parts

In [None]:
# Report the shape of each piece of the train/test split, one per line.
for label,part in (('x_train shape:',x_train),
                   ('x_test shape:',x_test),
                   ('y_train shape:',y_train),
                   ('y_test shape:',y_test)):
    print(label,part.shape)

## Gradient Boosting Machines:

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
#model:
# Baseline gradient boosting regressor with the library's default hyperparameters.
# random_state is pinned (same seed as the train/test split) so that a
# Restart & Run All reproduces the same fitted model and the same metrics.
gbm_model=GradientBoostingRegressor(random_state=42)
gbm_model.fit(x_train,y_train)

In [None]:
#predict:
# Predict `charges` for the held-out test features with the baseline GBM.
y_pred=gbm_model.predict(x_test)

In [None]:
from sklearn.metrics import mean_squared_error,r2_score

In [None]:
# Test-set RMSE (square root of the MSE) followed by R^2 for the current y_pred.
gbm_rmse=np.sqrt(mean_squared_error(y_test,y_pred))
gbm_r2=r2_score(y_test,y_pred)
print(gbm_rmse)
print(gbm_r2)

## GBM hyperparameter tuning with cross-validation:

In [None]:
# Search grid for the gradient boosting regressor: 2 * 3 * 2 * 2 = 24 combinations.
# NOTE(review): both learning rates are well below scikit-learn's default of 0.1
# while n_estimators is capped at 100 -- confirm this range is intended, since the
# "tuned" model cannot reach the baseline's configuration.
gbm_params=dict(
    learning_rate=[0.001,0.01],
    max_depth=[3,5,8],
    n_estimators=[50,100],
    subsample=[1,0.5],
)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Unfitted estimator handed to the grid search (GridSearchCV clones it per candidate).
# The seed is fixed because the search grid contains subsample=0.5, which makes
# each fit stochastic; without it the selected "best" parameters can change
# from one run of the notebook to the next.
gbm=GradientBoostingRegressor(random_state=42)

In [None]:
# Exhaustive 10-fold cross-validated search over gbm_params on the training split.
# verbose=2 logs every individual fit.
gbm_cv=GridSearchCV(estimator=gbm,param_grid=gbm_params,cv=10,verbose=2).fit(X=x_train,y=y_train)

In [None]:
#best parameters:
# Parameter combination from gbm_params that the grid search selected.
gbm_cv.best_params_

In [None]:
#final model:
# Refit on the training split with the hyperparameters the grid search actually
# selected, instead of values copied by hand from an earlier run (those go stale
# whenever the grid, the data or the CV outcome changes).
# random_state is pinned because the grid allows subsample < 1, which makes
# fitting stochastic.
gbm_tuned_model=GradientBoostingRegressor(**gbm_cv.best_params_,
                                         random_state=42).fit(x_train,y_train)

In [None]:
# Predict on the test split with the tuned GBM; this overwrites the previous y_pred.
y_pred=gbm_tuned_model.predict(x_test)

In [None]:
# Test-set RMSE, then R^2, for whatever y_pred currently holds.
print(np.sqrt(mean_squared_error(y_test,y_pred)))
print(r2_score(y_test,y_pred))

## Extreme Gradient Boosting:

In [None]:
# Use the %pip magic rather than !pip: it installs into the environment of the
# running kernel, whereas a shell `pip` may resolve to a different interpreter.
%pip install -q xgboost

In [None]:
import xgboost as xgb
from xgboost import XGBRegressor

In [None]:
#model:
# Baseline XGBoost regressor with default hyperparameters:
# fit on the training split, predict the test split, then report RMSE and R^2.
xgb_model=XGBRegressor()
xgb_model.fit(x_train,y_train)
y_pred=xgb_model.predict(x_test)
xgb_rmse=np.sqrt(mean_squared_error(y_test,y_pred))
xgb_r2=r2_score(y_test,y_pred)
print(xgb_rmse)
print(xgb_r2)

In [None]:
# Search grid for the XGBoost regressor: 2 * 2 * 2 * 2 = 16 combinations.
xgb_params=dict(
    colsample_bytree=[0.4,0.5],
    n_estimators=[50,100],
    max_depth=[2,3],
    learning_rate=[0.1,0.5],
)

In [None]:
# 10-fold cross-validated grid search over xgb_params on the training split.
# The already-fitted baseline xgb_model is passed as the estimator template.
xgb_cv_model=GridSearchCV(estimator=xgb_model,param_grid=xgb_params,cv=10,verbose=2).fit(X=x_train,y=y_train)

In [None]:
# Parameter combination from xgb_params that the grid search selected.
xgb_cv_model.best_params_

In [None]:
#final model:
# Refit on the training split with the hyperparameters the grid search actually
# selected, rather than values hard-coded from an earlier run that may no longer
# match xgb_cv_model.best_params_.
xgb_tuned_model=XGBRegressor(**xgb_cv_model.best_params_).fit(x_train,y_train)

In [None]:
# Score the tuned XGBoost model on the test split.
# The predictions must be regenerated first: otherwise y_pred still holds the
# baseline XGBoost predictions and the numbers below would simply repeat the
# untuned model's scores.
y_pred=xgb_tuned_model.predict(x_test)
print(np.sqrt(mean_squared_error(y_test,y_pred)))
print(r2_score(y_test,y_pred))

## CatBoost:

In [None]:
from catboost import CatBoostRegressor

In [None]:
# Baseline CatBoost regressor with default settings, fitted on the training split.
# catb_model is bound to whatever fit() returns.
catb=CatBoostRegressor()
catb_model=catb.fit(X=x_train,y=y_train)

In [None]:
# Predict on the test split with the baseline CatBoost model; overwrites y_pred.
y_pred=catb_model.predict(x_test)

In [None]:
# Test-set RMSE (square root of the MSE) followed by R^2 for the current y_pred.
catb_rmse=np.sqrt(mean_squared_error(y_test,y_pred))
catb_r2=r2_score(y_test,y_pred)
print(catb_rmse)
print(catb_r2)

In [None]:
#model tuning:
# Search grid for the CatBoost regressor: 2 * 2 * 3 = 12 combinations.
catb_params=dict(
    iterations=[40,50],
    learning_rate=[0.1,0.2],
    depth=[3,4,5],
)

In [None]:
# Rebind `catb` to a fresh, unfitted CatBoost estimator to hand to the grid search.
catb=CatBoostRegressor()

In [None]:
# 5-fold cross-validated grid search over catb_params on the training split,
# using all available cores (n_jobs=-1); verbose=2 logs every individual fit.
catb_cv_model=GridSearchCV(estimator=catb,param_grid=catb_params,cv=5,n_jobs=-1,verbose=2).fit(X=x_train,y=y_train)

In [None]:
# Parameter combination from catb_params that the grid search selected.
catb_cv_model.best_params_

In [None]:
#final model:
# Refit on the training split with the hyperparameters the grid search actually
# selected, rather than values hard-coded from an earlier run that may no longer
# match catb_cv_model.best_params_.
catb_tuned_model=CatBoostRegressor(**catb_cv_model.best_params_).fit(x_train,y_train)

In [None]:
# Predict on the test split with the tuned CatBoost model; overwrites y_pred.
y_pred=catb_tuned_model.predict(x_test)

In [None]:
# Test-set RMSE, then R^2, for whatever y_pred currently holds.
print(np.sqrt(mean_squared_error(y_test,y_pred)))
print(r2_score(y_test,y_pred))