# Import Required Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.stats import shapiro
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor, StackingRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBRegressor

# Load Data

In [None]:
# Read the insurance dataset from a hard-coded relative path (looks like a
# Kaggle-style input directory — adjust if the notebook runs elsewhere),
# then preview the first rows.
df = pd.read_csv('../input/insurance/insurance.csv')
df.head()

# Get Data Information

In [None]:
# Per-column dtype and non-null count summary of the loaded frame.
df.info()

In [None]:
# Summary statistics of the numeric columns, one row per column.
df.describe().transpose()

In [None]:
# Count / unique / top / freq for the object-dtype columns, one row per column.
df.describe(include='O').transpose()

# Check null Values

In [None]:
# Number of missing entries in each column.
df.isna().sum()

There are no null values in our dataset, which means it is clean with respect to missing data.

# Data Analysis

# Check the data distribution using the Shapiro-Wilk test

In [None]:
def col_dist(frame):
    """Print a Shapiro-Wilk normality verdict for each numeric column.

    Only columns whose dtype is exactly int64 or float64 are tested; all
    other columns are skipped silently. A column is reported as normally
    distributed when the test's p-value is above 0.05.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data whose columns are examined one by one.

    Returns
    -------
    None
        The verdicts are written to stdout, one line per tested column.
    """
    tested_dtypes = ('int64', 'float64')
    for name in frame.columns:
        series = frame[name]
        if series.dtype not in tested_dtypes:
            continue
        _, p_value = shapiro(series)
        verdict = 'is normally distributed' if p_value > 0.05 else 'is not normally distributed'
        print(name, verdict)

In [None]:
# Print the Shapiro verdict for every int64/float64 column of df.
col_dist(df)

# Univariate Analysis

In [None]:
def univariant(frame):
    """Show one distribution plot per column of ``frame``.

    int64/float64 columns get a density-scaled histogram with a KDE overlay;
    object-dtype columns get a bar chart of value counts. Columns of any
    other dtype are skipped.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data whose columns are plotted one figure at a time.
    """
    for i in frame.columns:
        column = frame[i]
        if column.dtype in ('int64', 'float64'):
            # distplot is deprecated in seaborn; histplot with a KDE overlay
            # and density scaling draws the same kind of figure.
            sns.histplot(x=column, kde=True, stat='density')
        elif column.dtype == 'O':
            # Data must be passed by keyword; current seaborn no longer
            # treats a positional Series as the x variable.
            sns.countplot(x=column)
        else:
            continue
        plt.show()


univariant(df)

# Bivariate Analysis

In [None]:
def bivariant(frame, target='charges'):
    """Draw a joint plot of the target against every other numeric column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to plot. The target is read from this frame, not from a
        notebook-level global, so the function works on any frame passed in.
    target : str, default 'charges'
        Column placed on the x-axis of every joint plot. It is not plotted
        against itself.

    Only int64/float64 columns are plotted; other dtypes are skipped.
    """
    for i in frame.columns:
        if i == target:
            continue
        if frame[i].dtype in ('int64', 'float64'):
            # x/y are passed by keyword: current seaborn rejects positional
            # data vectors for jointplot.
            sns.jointplot(x=frame[target], y=frame[i])
            plt.show()


bivariant(df)

# Detecting Outliers

In [None]:
def det_outlier(frame):
    """Draw a horizontal box plot for each int64/float64 column of ``frame``.

    Points beyond the whiskers are the outlier candidates. Columns of any
    other dtype are skipped.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data whose numeric columns are plotted one figure at a time.
    """
    for i in frame.columns:
        if frame[i].dtype in ('int64', 'float64'):
            # Pass the column as x explicitly; a positional Series is no
            # longer interpreted as the x variable by current seaborn.
            sns.boxplot(x=frame[i])
            plt.show()


det_outlier(df)

# Encode Categorical Variables

In [None]:
def encode(dataframe):
    """Replace every text column of ``dataframe`` with integer codes, in place.

    Codes are assigned in sorted order of each column's distinct values
    (0 for the smallest value, 1 for the next, ...), which is the numbering
    a label encoder fitted on that column produces. Both object-dtype
    columns and pandas string-dtype columns are encoded. Missing values,
    if any, receive the code -1.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    dict
        Maps each encoded column name to the list of its original values;
        the position of a value in the list is its integer code, so the
        encoding can be inspected or reversed later.
    """
    mappings = {}
    for j in dataframe.columns:
        column = dataframe[j]
        if column.dtype == 'object' or isinstance(column.dtype, pd.StringDtype):
            # sort=True makes the codes follow sorted value order rather
            # than order of first appearance.
            codes, uniques = pd.factorize(column, sort=True)
            dataframe[j] = codes
            mappings[j] = list(uniques)
    return mappings
            
# Encodes df in place: every object-dtype column is overwritten with integer codes.
encode(df)

# Split Data Into Train and Test

In [None]:
# Target is the charges column; every remaining column is a predictor.
y = df['charges']
x = df.drop(columns='charges')

In [None]:
# 70/30 hold-out split; the fixed seed keeps the partition repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)

# Convert charges into log

In [None]:
# Natural log of the target. np.log returns a new Series, so y itself is left
# untouched and no defensive copy is needed beforehand.
new_y = np.log(y)
# Same test_size and random_state as the raw-charges split, so the rows in
# Xl_train / Xl_test line up with X_train / X_test.
Xl_train, Xl_test, Yl_train, Yl_test = train_test_split(x, new_y, test_size = 0.3, random_state = 1)

# Lets Build Models

# 1. Linear Regression

In [None]:
# Baseline linear model fitted on the raw-charges training split.
lr = LinearRegression()
lr.fit(X=X_train, y=Y_train)

In [None]:
lr_pred = lr.predict(X_test)
# RMSE taken explicitly as sqrt(MSE): the `squared` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(Y_test, lr_pred))

# 2. Random Forest Regressor

In [None]:
# Seeded so the reported RMSE is the same on every Restart & Run All.
rf = RandomForestRegressor(random_state=1)
rf.fit(X_train, Y_train)

In [None]:
rf_pred = rf.predict(X_test)
# RMSE taken explicitly as sqrt(MSE): the `squared` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(Y_test, rf_pred))

# 3. Ada Boost Regressor

In [None]:
# Seeded so the reported RMSE is the same on every Restart & Run All.
ada = AdaBoostRegressor(random_state=1)
ada.fit(X_train, Y_train)

In [None]:
ada_pred = ada.predict(X_test)
# RMSE taken explicitly as sqrt(MSE): the `squared` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(Y_test, ada_pred))

# 4. XGBoost Regressor

In [None]:
# Default-configured XGBoost regressor fitted on the raw-charges training split.
xgb = XGBRegressor()
xgb.fit(X=X_train, y=Y_train)

In [None]:
xgb_pred = xgb.predict(X_test)
# RMSE taken explicitly as sqrt(MSE): the `squared` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(Y_test, xgb_pred))

# 5. Gradient Boosting Regressor

In [None]:
# Seeded so the RMSE and the feature importances read from gb later in the
# notebook are the same on every Restart & Run All.
gb = GradientBoostingRegressor(random_state=1)
gb.fit(X_train, Y_train)

In [None]:
gb_pred = gb.predict(X_test)
# RMSE taken explicitly as sqrt(MSE): the `squared` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(Y_test, gb_pred))

# After Hyper Parameter Tuning

In [None]:
# max_features=None considers every feature at each split, which is what the
# removed 'auto' option meant for this estimator. Seeded for repeatable RMSE.
hgb = GradientBoostingRegressor(learning_rate=0.03, max_depth=2, max_features=None,
                          n_estimators=300, random_state=1)
hgb.fit(X_train, Y_train)

In [None]:
hgb_pred = hgb.predict(X_test)
# RMSE taken explicitly as sqrt(MSE): the `squared` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(Y_test, hgb_pred))

After hyperparameter tuning my RMSE increased, so I will not use the tuned hyperparameters.

# After Feature Selection

In [None]:
# The feature-selection step of this notebook removes the 'sex' column
# (see new_df). Fitting on the full X_train would just reproduce the plain
# gradient boosting model, so the column drop is bundled with the regressor:
# fgb can then be fitted and scored on the unmodified X_train / X_test frames.
# NOTE(review): confirm 'sex' is still the feature to drop after re-running
# the importance analysis.
fgb = Pipeline([
    ('drop_sex', ColumnTransformer([('drop', 'drop', ['sex'])], remainder='passthrough')),
    ('gb', GradientBoostingRegressor(random_state=1)),
])
fgb.fit(X_train, Y_train)

In [None]:
fgb_pred = fgb.predict(X_test)
# RMSE taken explicitly as sqrt(MSE): the `squared` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(Y_test, fgb_pred))

# After log transformation

In [None]:
# Gradient boosting on the log-transformed target; seeded for repeatable RMSE.
lgb = GradientBoostingRegressor(random_state=1)
lgb.fit(Xl_train, Yl_train)

In [None]:
# lgb was trained on log(charges), so lgb_pred holds log-scale predictions.
lgb_pred = lgb.predict(Xl_test)
# Map both the truth and the predictions back with exp before scoring: an RMSE
# measured in log units cannot be compared with the raw-charges RMSE of the
# other models. sqrt(MSE) is used because the `squared` keyword was deprecated
# in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(np.exp(Yl_test), np.exp(lgb_pred)))

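After log transformation my RMSE decreased, so I will use the gradient boosting regressor trained on log-transformed charges. Note that an RMSE measured on log(charges) is only comparable with the other models' RMSE after the predictions are converted back to the original scale with `np.exp`.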

# 6. Stacking Regressor

In [None]:
# Stack three tree ensembles. The random forest and gradient boosting members
# are seeded so the stacked RMSE is the same on every Restart & Run All.
stack = StackingRegressor([
    ('rf', RandomForestRegressor(random_state=1)),
    ('xgb', XGBRegressor()),
    ('gb', GradientBoostingRegressor(random_state=1))
])
stack.fit(X_train, Y_train)

In [None]:
stack_pred = stack.predict(X_test)
# RMSE taken explicitly as sqrt(MSE): the `squared` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(Y_test, stack_pred))

# Hyper Parameter Tuning

I tune the hyperparameters of the gradient boosting regressor because it gave the best RMSE among the models above.

In [None]:
# Search space for the gradient boosting regressor.
# - criterion: 'mse' was renamed to 'squared_error' and 'mae' was removed from
#   this estimator, so only values current scikit-learn accepts are listed
#   ('squared_error' needs scikit-learn >= 1.0).
# - max_features: None (use every feature) replaces the removed 'auto', which
#   had the same meaning for this estimator.
grid = {
    'learning_rate' : [0.03, 0.04, 0.05],
    'n_estimators' : [300, 500, 700, 900, 1100],
    'criterion' : ['friedman_mse', 'squared_error'],
    'max_depth' : [2, 3, 4],
    'max_features' : [None, 'sqrt', 'log2']
}

In [None]:
# Randomised search over `grid`: 50 sampled settings, each scored by 5-fold
# cross-validated negative MSE, run on all available cores.
random_cv = RandomizedSearchCV(
    gb,
    grid,
    n_iter=50,
    cv=5,
    scoring='neg_mean_squared_error',
    random_state=30,
    n_jobs=-1,
    verbose=7,
)

random_cv.fit(X_train, Y_train)

In [None]:
# Show the estimator configured with the best parameter setting found by the search.
random_cv.best_estimator_

# Feature Selection

# Using Select From Model

In [None]:
# Use each feature importance of the fitted gb model, in ascending order, as a
# selection threshold, retrain on the surviving features and report the RMSE.
# NOTE(review): the thresholds are scored on X_test, so the chosen feature set
# is tuned on the hold-out data — consider cross-validating on the training
# split instead.
th = np.sort(gb.feature_importances_)
for g in th:
    select = SelectFromModel(gb, threshold = g, prefit = True)
    x_train = select.transform(X_train)
    # Seeded so differences between thresholds come from the feature set,
    # not from run-to-run randomness.
    model = GradientBoostingRegressor(random_state=1)
    model.fit(x_train, Y_train)
    x_test = select.transform(X_test)
    y_pred = model.predict(x_test)
    # sqrt(MSE) instead of squared=False, which was deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    score = np.sqrt(mean_squared_error(Y_test, y_pred))
    print('Threshold:', g, 'Model Score:', score)

In [None]:
# One-column table (column label 0) of gb's feature importances, indexed by
# feature name.
imp = pd.DataFrame(gb.feature_importances_, index=X_train.columns)
# Features whose importance falls below the cut-off.
# NOTE(review): the cut-off is a hard-coded literal, presumably copied from an
# earlier run's output — confirm it still matches the current importances.
imp.loc[imp[0] < 0.0031225473178464716]

In [None]:
# Creating new dataframe to train model after feature selection.
# drop() returns a new frame and leaves df untouched, so no copy is needed first.
new_df = df.drop(columns='sex')