In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, file_name) for file_name in file_names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the insurance CSV from the Kaggle input directory into a DataFrame.
insurance_csv_path = '../input/insurance/insurance.csv'
dataset = pd.read_csv(insurance_csv_path)

In [None]:
# Preview the first rows of the dataset (DataFrame.head shows 5 rows by default).
dataset.head()

In [None]:
# Summarize the frame: column names, dtypes and non-null counts.
dataset.info()

In [None]:
# Count the missing (null) values in each column.
dataset.isnull().sum()

In [None]:
# Split the frame into features (every column except the last) and
# target (the last column), then convert both to NumPy arrays.
feature_frame = dataset.iloc[:, :-1]
target_series = dataset.iloc[:, -1]
X = feature_frame.values
y = target_series.values

In [None]:
# Inspect the raw feature matrix.
X


In [None]:
# Inspect the target vector.
y

# **DATA PREPROCESSING**

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

**Sex**

In [None]:
# One-hot encode the column at position 1 of X (labelled as the sex column by
# the transformer name); every other column is passed through unchanged.
sex_step = ('sex_encoder', OneHotEncoder(), [1])
columnTransformer = ColumnTransformer(transformers=[sex_step], remainder='passthrough')
X = columnTransformer.fit_transform(X)

In [None]:
# Inspect X after one-hot encoding the sex column.
X


In [None]:
# Avoid the dummy-variable trap: drop column 0 so that only one of the
# one-hot sex columns remains (ColumnTransformer emits the encoded columns
# ahead of the passthrough columns).
X = X[:,1:] #Keep only 1 sex column

In [None]:
# Inspect X after dropping the redundant sex dummy column.
X


**Smoker**

In [None]:
# One-hot encode the column at position 4 of the current X (labelled as the
# smoker column by the transformer name); all other columns pass through.
smoker_step = ('smoker_encoder', OneHotEncoder(), [4])
columnTransformer = ColumnTransformer(transformers=[smoker_step], remainder='passthrough')
X = columnTransformer.fit_transform(X)

In [None]:
# Inspect X after one-hot encoding the smoker column.
X

In [None]:
# Avoid the dummy-variable trap: drop column 0 so that only one of the
# one-hot smoker columns remains (the encoded columns come first in the
# ColumnTransformer output).
X = X[:,1:] #Keep only 1 smoker column

In [None]:
# Sanity check: number of columns in the first row after the smoker encoding.
X[0].size

In [None]:
# Inspect the first row to confirm the column layout before encoding region.
X[0]

**Region**

In [None]:
# One-hot encode the column at position 5 of the current X (labelled as the
# region column by the transformer name); all other columns pass through.
region_step = ('region_encoder', OneHotEncoder(), [5])
columnTransformer = ColumnTransformer(transformers=[region_step], remainder='passthrough')
X = columnTransformer.fit_transform(X)

In [None]:
# Avoid the dummy-variable trap: drop column 0, i.e. the first of the one-hot
# region columns, so that the remaining region dummies are not collinear
# with the intercept added later.
X = X[:,1:] 


In [None]:
# Sanity check: number of columns in the first row after the region encoding.
X[0].size

# **BACKWARD ELIMINATION**

We will select the independent features that are essential for predicting the dependent variable y.
This involves looking at the p-values and the adjusted R-squared values.
Significance level: SL = 0.05

In [None]:
import statsmodels.api as sm

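**We need to account for the constant (intercept) term of the equation**
y = mx + c

statsmodels' OLS does not add an intercept automatically, so here we add a column of ones to X; the coefficient fitted for that column is the constant c, which is then included in every predicted value.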

In [None]:
# Prepend a column of ones so that sm.OLS, which does not add an intercept on
# its own, can estimate the constant term. The row count is taken from X
# rather than hard-coded, so the cell keeps working if the dataset size changes.
X = np.append(arr = np.ones((X.shape[0],1)).astype(int),values = X, axis = 1)

In [None]:
# Inspect the first row; the leading entry should be the constant 1 just added.
X[0]

**Initial OLS values**

In [None]:
# Cast the design matrix to float, fit ordinary least squares on all
# candidate features, and display the summary table with the p-values.
X = np.asarray(X, dtype=float)
ols_model = sm.OLS(endog=y, exog=X)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()

# **Elimination 1**
**We see that x3 has the highest p-value and that it is greater than SL**
Hence, we remove x3 and repeat the process.

In [None]:
# Backward elimination step 1: drop the column at index 3 of the design matrix.
# NOTE(review): the index is hard-coded from a manual reading of the OLS
# summary; confirm it still matches the feature with the largest p-value.
X = np.delete(X, obj=3, axis=1)

**New OLS Values**

In [None]:
# Refit OLS on the reduced design matrix and display the updated summary.
X = np.asarray(X, dtype=float)
ols_model = sm.OLS(endog=y, exog=X)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()

# **Elimination 2**
**We see that x5 has the highest p-value and that it is greater than SL. Hence, we remove x5 and repeat the process.
However, the p-value of x5 is not much greater than SL.
Hence, we also need to check the impact of removing it on the adjusted R-squared value.**

In [None]:
# Backward elimination step 2: drop the column at index 4 of the design matrix.
# NOTE(review): the markdown for this step names x5, but this removes the
# column at index 4 - confirm which feature is actually meant to be dropped.
X = np.delete(X, obj=4, axis=1)

In [None]:
# Inspect the design matrix after the second elimination.
X


**New OLS Values**

In [None]:
# Refit OLS after the second elimination and display the updated summary.
X = np.asarray(X, dtype=float)
ols_model = sm.OLS(endog=y, exog=X)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()

# **Elimination 3**
**We see that x3 has the highest p-value and that it is greater than SL**
Hence, we remove x3 and repeat the process.

In [None]:
# Backward elimination step 3: drop the column at index 3 of the design matrix.
# NOTE(review): column positions shift after every np.delete. Given the column
# order that the encoding cells appear to produce (region dummies, then smoker,
# then sex, then the numeric columns - verify), index 3 at this point may be
# the smoker dummy rather than a region dummy. Confirm against the OLS summary.
X = np.delete(X, obj=3, axis=1)

In [None]:
# Inspect the design matrix after the third elimination.
X

**New OLS Values**

In [None]:
# Refit OLS on the final set of features and display the summary.
X = np.asarray(X, dtype=float)
ols_model = sm.OLS(endog=y, exog=X)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()

# **TEST TRAIN SPLIT**

In [None]:
from sklearn.model_selection import train_test_split 

In [None]:
# Hold out 25% of the rows for testing; the fixed seed makes the split reproducible.
split_parts = train_test_split(X, y, test_size=0.25, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Inspect the training features.
X_train

In [None]:
# Inspect the test features.
X_test

In [None]:
# Inspect the training targets.
y_train

In [None]:
# Inspect the test targets.
y_test

# **FEATURE SCALING**

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Scaler for the feature matrix (standardizes each column to zero mean, unit variance).
sc_X = StandardScaler()

In [None]:
# Learn the scaling statistics from the training features only, then apply them.
X_train = sc_X.fit_transform(X_train)

In [None]:
# Inspect the scaled training features.
X_train

In [None]:
# Apply the statistics fitted on the training set; the scaler is not refitted
# here, so no information from the test rows influences the scaling.
X_test = sc_X.transform(X_test)

In [None]:
# Inspect the scaled test features.
X_test

In [None]:
# Separate scaler for the target so that its statistics are kept apart from the features'.
sc_y = StandardScaler()

In [None]:
# StandardScaler expects 2-D input, so turn the target into a single column
# before fitting the scaler on it and standardizing it.
y_train = sc_y.fit_transform(y_train.reshape(-1, 1))

In [None]:
# Inspect the standardized training targets (now a single-column 2-D array).
y_train

In [None]:
# Reshape the test target to a single column and standardize it with the
# statistics learned from the training target (the scaler is not refitted).
y_test = sc_y.transform(y_test.reshape(-1, 1))

In [None]:
# Inspect the standardized test targets.
y_test

# **LINEAR REGRESSION**

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Ordinary least-squares linear regression with scikit-learn's default settings.
linear_regressor = LinearRegression()

In [None]:
# Fit on the standardized training data (y_train is a single-column 2-D array here).
linear_regressor.fit(X_train,y_train)

In [None]:
# Predict the standardized target for the held-out test features.
y_linear_pred = linear_regressor.predict(X_test)

In [None]:
# Inspect the linear-regression predictions (in standardized target units).
y_linear_pred

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [None]:
# Coefficient of determination of the linear model on the test set.
r2_score(y_test, y_linear_pred)

In [None]:
# Test-set mean squared error. y_test was standardized with sc_y, so the value
# is in standardized units, not in the original units of the target.
mean_squared_error(y_test,y_linear_pred)

# **SVR**

In [None]:
from sklearn.svm import SVR

In [None]:
# Support vector regressor with a linear kernel; other hyperparameters are left at their defaults.
svr_regressor = SVR(kernel='linear')

In [None]:
# y_train is a single-column 2-D array after scaling; ravel() flattens it to
# the 1-D shape passed to the estimator.
svr_regressor.fit(X_train,y_train.ravel())

In [None]:
# Predict the standardized target for the held-out test features.
y_svr_pred = svr_regressor.predict(X_test)

In [None]:
# Inspect the SVR predictions (in standardized target units).
y_svr_pred

In [None]:
# Coefficient of determination of the SVR model on the test set.
r2_score(y_test, y_svr_pred)

In [None]:
# Test-set mean squared error. y_test was standardized with sc_y, so the value
# is in standardized units, not in the original units of the target.
mean_squared_error(y_test,y_svr_pred)

# **RANDOM FOREST**

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest with 300 trees; the fixed random_state makes the fit reproducible.
rf_regressor = RandomForestRegressor(n_estimators = 300, random_state = 0) 

In [None]:
# y_train is a single-column 2-D array after scaling. Flatten it to 1-D, as in
# the SVR cell, so the forest receives the (n_samples,) target shape it expects
# and scikit-learn does not emit a DataConversionWarning.
rf_regressor.fit(X_train, y_train.ravel())

In [None]:
# Predict the standardized target for the held-out test features.
y_rf_pred = rf_regressor.predict(X_test)

In [None]:
# Inspect the random-forest predictions (in standardized target units).
y_rf_pred

In [None]:
# Coefficient of determination of the random-forest model on the test set.
r2_score(y_test, y_rf_pred)

In [None]:
# Test-set mean squared error. y_test was standardized with sc_y, so the value
# is in standardized units, not in the original units of the target.
mean_squared_error(y_test,y_rf_pred)

In [None]:
# Perform one-hot encoding on the column at position 4 of X.
# NOTE(review): this repeats the smoker-encoding cell from the preprocessing
# section. By this point X has been cast to float and had columns removed in
# the backward-elimination cells, so position 4 no longer holds the raw smoker
# values, and running this overwrites X. It looks like a leftover cell -
# confirm and remove it.
columnTransformer = ColumnTransformer([('smoker_encoder',OneHotEncoder(),[4])], remainder='passthrough')
X = columnTransformer.fit_transform(X)

**Sex**

In [None]:
#One Hot Encode the col