In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from scipy.stats import norm, boxcox
from scipy import stats
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the insurance table; the first CSV row supplies the column names.
dataset = pd.read_csv(
    "/kaggle/input/insurance/insurance.csv",
    header=0,
)
# Summary statistics of the table are shown as the cell output.
dataset.describe()

In [None]:
# Every column except the last is a feature; the last column is the target.
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

In [None]:
# Visual check: do age and charges show a roughly linear relation?
import matplotlib.pyplot as plt

age_column = X[:, 0:1]  # first feature column, kept as a 2-D column slice
plt.scatter(age_column, y, color='red')
plt.ylabel('Charges')
plt.xlabel('age')
plt.show()

In [None]:
# Mean charges per sex, highest first, with a green background gradient.
(
    dataset[["sex", "charges"]]
    .groupby(["sex"], as_index=False)
    .mean()
    .sort_values(by="charges", ascending=False)
    .style.background_gradient(cmap="Greens")
)

On average, men are charged more than women.

In [None]:
# Mean charges per number of children, highest first, with a green gradient.
(
    dataset[["children", "charges"]]
    .groupby(["children"], as_index=False)
    .mean()
    .sort_values(by="charges", ascending=False)
    .style.background_gradient(cmap="Greens")
)

In [None]:
# Mean charges for smokers vs. non-smokers, highest first, with a green gradient.
(
    dataset[["smoker", "charges"]]
    .groupby(["smoker"], as_index=False)
    .mean()
    .sort_values(by="charges", ascending=False)
    .style.background_gradient(cmap="Greens")
)

In [None]:
# Mean charges per region, highest first, with a green background gradient.
(
    dataset[["region", "charges"]]
    .groupby(["region"], as_index=False)
    .mean()
    .sort_values(by="charges", ascending=False)
    .style.background_gradient(cmap="Greens")
)

In [None]:
# Per-region means of age, bmi, children and charges, ordered by mean age
# (descending) and shown with a blue background gradient.
# The columns are selected with a list: indexing a GroupBy with a bare tuple of
# names ("age", "bmi", ...) is deprecated and rejected by pandas >= 2.0.
region = (
    dataset.groupby("region", as_index=False)[["age", "bmi", "children", "charges"]]
    .mean()
    .sort_values("age", ascending=False)
    .style.background_gradient("Blues")
)
print("Average value of other properties by region \n")
region

In [None]:
# Distribution of age with a fitted normal curve overlaid.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# histplot/displot once the environment's seaborn version is confirmed.
age_ax = sns.distplot(dataset["age"], fit=norm)
age_ax.set_title("Age Distplot", color="darkred")

In [None]:
# Distribution of charges with a fitted normal curve overlaid.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# histplot/displot once the environment's seaborn version is confirmed.
charges_ax = sns.distplot(dataset["charges"], fit=norm)
charges_ax.set_title("charges Distplot", color="darkred")

The distribution of charges is visibly skewed, so we apply a log transform to it below to bring it closer to a normal distribution.

In [None]:
# Count the missing entries in every column.
dataset.isna().sum()

In [None]:
# Encoding categorical data
# One-hot encode the categorical feature columns, addressed by position in the
# raw feature matrix: 1 = sex, 3 = children, 4 = smoker, 5 = region.
# All remaining columns are passed through unchanged.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Always start from the raw frame rather than the current value of X, so that
# re-running this cell does not try to encode an already-encoded matrix.
raw_features = dataset.iloc[:, :-1].values

ct = ColumnTransformer(
    transformers=[
        ('encodersex', OneHotEncoder(), [1]),
        ('encoderchildren', OneHotEncoder(), [3]),
        ('encodersmoker', OneHotEncoder(), [4]),
        ('encoderregion', OneHotEncoder(), [5]),
    ],
    remainder='passthrough',
    # Force a dense result: np.array() applied to a SciPy sparse matrix would
    # produce a 0-d object array instead of the feature matrix.
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(raw_features))

In [None]:
# Handle the skewness of charges by modelling log(1 + charges).
# The transform is computed from the raw target column instead of the current
# `y`, so re-running this cell cannot apply the logarithm twice.
# NOTE: every score/error computed from this `y` is on the log scale; np.expm1
# maps predictions back to the original units.
y = np.log1p(dataset.iloc[:, -1].values)

In [None]:
# Peek at the first five rows of the feature matrix and of the target.
for preview in (X, y):
    print(preview[0:5])

In [None]:
# Hold out 20% of the rows as a test set; the seed keeps the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=20,
)

In [None]:
# Standardise the features: the scaling statistics are learned from the
# training split only and then reused for the test split.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

***Multiple Linear Regression model***

In [None]:
# Train a multiple linear regression on the training split and print its R^2
# on the held-out test split.
from sklearn.linear_model import LinearRegression
from sklearn import metrics

regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# Named score_lr (not score_rf) so this value cannot be mistaken for, or
# printed as, the random-forest score that is computed in a later cell.
score_lr = metrics.r2_score(y_test, y_pred)
print(score_lr)

***Random Forest Regression model***

In [None]:
# Tune a random forest with a randomized search over the grid below
# (100 sampled candidates, 3-fold cross-validation), then predict the test
# split with the search object, which delegates to the refitted best model.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# 1.0 means "consider every feature at each split", which is what 'auto' meant
# for regressors; the 'auto' string itself is rejected by scikit-learn >= 1.3.
max_features = [1.0, 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)  # None = no explicit depth limit
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

rf_param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
# Seed the forest as well as the search; seeding only the search fixes which
# candidates are sampled but leaves every fitted forest non-deterministic.
rf = RandomForestRegressor(random_state=42)
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = rf_param_grid, n_iter = 100, cv = 3, 
                               verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)
y_pred = rf_random.predict(X_test)

In [None]:
# R^2 of the tuned forest on the test split, kept for the report cell below,
# followed by the hyper-parameters the search selected.
score_rf = metrics.r2_score(y_test, y_pred)
print(rf_random.best_params_)

In [None]:
# Report the test-set metrics for the current predictions.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("r_square score --> ", score_rf)
print('Mean Absolute Error -->', mae)
print('Mean Squared Error -->', mse)
print('Root Mean Squared Error -->', rmse)

In [None]:
# Prediction-error plot (predicted vs. actual target) for the tuned forest.
from yellowbrick.regressor import PredictionError

# Wrap the already-refitted best model rather than the whole search object and
# mark it as fitted, so visualizer.fit() cannot launch the randomized search a
# second time; the plot then shows the same model that was scored above.
visualizer = PredictionError(rf_random.best_estimator_, is_fitted=True)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show();

In [None]:
# Predicting the Test set results
#np.set_printoptions(precision=2)
#print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

***SVM Regressor***

In [None]:
# Support-vector regression with an RBF kernel, fitted on the training split.
from sklearn.svm import SVR

regressor = SVR(kernel='rbf').fit(X_train, y_train)

# Predict the held-out split; its R^2 is the cell's output.
y_pred = regressor.predict(X_test)
metrics.r2_score(y_test, y_pred)

In [None]:
# Prediction-error plot for the SVR model: fit on train, score on test, render.
from yellowbrick.regressor import PredictionError

visualizer = PredictionError(regressor)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show();

***KNeighborsRegressor***

In [None]:
# k-nearest-neighbours regression (k = 5), fitted on the training split.
from sklearn.neighbors import KNeighborsRegressor

neigh = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)

# Predict the held-out split; its R^2 is the cell's output.
y_pred = neigh.predict(X_test)
metrics.r2_score(y_test, y_pred)

In [None]:
# Prediction-error plot for the KNN model: fit on train, score on test, render.
from yellowbrick.regressor import PredictionError

visualizer = PredictionError(neigh)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show();

***GradientBoostingRegressor***

In [None]:
# Gradient-boosted trees (depth 3, fixed seed), fitted on the training split.
from sklearn.ensemble import GradientBoostingRegressor

reg = GradientBoostingRegressor(max_depth=3, random_state=20).fit(X_train, y_train)

# Predict the held-out split; its R^2 is the cell's output.
y_pred = reg.predict(X_test)
metrics.r2_score(y_test, y_pred)

In [None]:
# Prediction-error plot for the gradient-boosting model: fit on train,
# score on test, render.
from yellowbrick.regressor import PredictionError

visualizer = PredictionError(reg)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show();

In [None]:
# XGBoost regressor: fit on the training split, then print the model's own
# score on both splits and the R^2 of its test predictions.
import xgboost as xgb

model = xgb.XGBRegressor(importance_type='gain', learning_rate=0.1)
model.fit(X_train, y_train, verbose=True)

for label, features, target in (
    ("Training score: ", X_train, y_train),
    ("test score: ", X_test, y_test),
):
    score = model.score(features, target)
    print(label, score)

y_pred = model.predict(X_test)
score = metrics.r2_score(y_test, y_pred)
print("r2_score : ", score)