In [1]:
# Importing the libraries 

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

# Ignore harmless warnings 

import warnings 
warnings.filterwarnings("ignore")

# Set to display all the columns in dataset

pd.set_option("display.max_columns", None)

# Import psql to run queries 

import pandasql as psql

In [2]:
# load the Health Insurance dataset 

HealthIns = pd.read_csv(r"C:\Users\Admin\Downloads\Raju Sir DLS\Regression Analysis Datasets\03-Health_Insurance.csv", header=0) 
HealthIns_BK = HealthIns.copy()
HealthIns.head()

Unnamed: 0,Age,Gender,BMI,Children,Smoker,Region,Expenses
0,19,2,27.9,0,1,4,16884.92
1,18,1,33.8,1,0,3,1725.55
2,28,1,33.0,3,0,3,4449.46
3,33,1,22.7,0,0,2,21984.47
4,32,1,28.9,0,0,2,3866.86


In [3]:
# Display the dataset information

HealthIns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       1338 non-null   int64  
 1   Gender    1338 non-null   int64  
 2   BMI       1338 non-null   float64
 3   Children  1338 non-null   int64  
 4   Smoker    1338 non-null   int64  
 5   Region    1338 non-null   int64  
 6   Expenses  1338 non-null   float64
dtypes: float64(2), int64(5)
memory usage: 73.3 KB


In [4]:
# Display the correlation

HealthIns.corr()

Unnamed: 0,Age,Gender,BMI,Children,Smoker,Region,Expenses
Age,1.0,0.020856,0.109341,0.042469,-0.025019,0.002127,0.299008
Gender,0.020856,1.0,-0.04638,-0.017163,-0.076185,-0.004588,-0.057292
BMI,0.109341,-0.04638,1.0,0.012645,0.003968,0.157439,0.198576
Children,0.042469,-0.017163,0.012645,1.0,0.007673,0.016569,0.067998
Smoker,-0.025019,-0.076185,0.003968,0.007673,1.0,-0.002181,0.787251
Region,0.002127,-0.004588,0.157439,0.016569,-0.002181,1.0,-0.006208
Expenses,0.299008,-0.057292,0.198576,0.067998,0.787251,-0.006208,1.0


In [5]:
# Identify the independent and Target (dependent) variables

IndepVar = []
for col in HealthIns.columns:
    if col != 'Expenses':
        IndepVar.append(col)

TargetVar = 'Expenses'

x = HealthIns[IndepVar]
y = HealthIns[TargetVar]

In [6]:
# Split the data into train and test (random sampling)

from sklearn.model_selection import train_test_split 

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [7]:
# Scaling the features by using MinMaxScaler

from sklearn.preprocessing import MinMaxScaler

mmscaler = MinMaxScaler(feature_range=(0, 1))

x_train = mmscaler.fit_transform(x_train)
x_train = pd.DataFrame(x_train)

x_test = mmscaler.fit_transform(x_test)
x_test = pd.DataFrame(x_test)

In [8]:
# Train the algorithm and build the model with train dataset

from sklearn.linear_model import LinearRegression

MulRGR = LinearRegression()

# Train the model with training dataset

MulRGR.fit(x_train, y_train)

# Predict the model with test dataset

y_pred = MulRGR.predict(x_test)

In [11]:
# Evaluation metrics for Regression analysis

from sklearn import metrics

print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))  
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))

# Define the function to calculate the MAPE - Mean Absolute Percentage Error

def MAPE (y_test, y_pred): 
    y_test, y_pred = np.array(y_test), np.array(y_pred)
    return np.mean(np.abs((y_test - y_pred) / y_test)) * 100

# Evaluation of MAPE 

result = MAPE(y_test, y_pred)
print('Mean Absolute Percentage Error (MAPE):', round(result, 3), '%')

# Calculate Adjusted R squared values 

r_squared = round(metrics.r2_score(y_test, y_pred),6)
adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
print('Adj R Square: ', adjusted_r_squared)

Mean Absolute Error (MAE): 4154.671
Mean Squared Error (MSE): 33829389.389
Root Mean Squared Error (RMSE): 5816.304
R2_score: 0.769278
Root Mean Squared Log Error (RMSLE): 8.668
Mean Absolute Percentage Error (MAPE): 44.199 %
Adj R Square:  0.768238


In [10]:
Results = pd.DataFrame({'Expenses_A':y_test, 'Expenses_P':y_pred})

# Merge two Dataframes on index of both the dataframes

ResultsFinal = HealthIns_BK.merge(Results, left_index=True, right_index=True)
ResultsFinal.sample(10)

Unnamed: 0,Age,Gender,BMI,Children,Smoker,Region,Expenses,Expenses_A,Expenses_P
427,18,2,29.2,0,0,1,7323.73,7323.73,2283.678093
755,31,1,27.6,2,0,1,5031.27,5031.27,6072.426496
777,45,1,39.8,0,0,1,7448.4,7448.4,13242.399301
486,54,2,21.5,3,0,2,12475.35,12475.35,9899.862554
678,56,1,36.1,3,0,4,12363.55,12363.55,15094.52061
309,41,2,33.1,2,0,2,7749.16,7749.16,10216.703424
693,24,1,23.7,0,0,2,2352.97,2352.97,1673.000748
1032,30,2,27.9,0,0,1,4137.52,4137.52,4958.782428
741,27,1,29.2,0,1,3,18246.5,18246.5,27716.151134
198,51,2,18.1,0,0,2,9644.25,9644.25,6626.876356


# Run and compare all regression algorithms

In [12]:
# Build the Regression / Regressor models

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neural_network import MLPRegressor

# Create objects of Regression / Regressor models with default hyper-parameters

modelmlg = LinearRegression()
modeldcr = DecisionTreeRegressor()
modelrfr = RandomForestRegressor()
modelABR = AdaBoostRegressor(n_estimators=50, base_estimator=None ,learning_rate=1)
modelSVR = SVR()
modelGBR = GradientBoostingRegressor()
modelXGR = xgb.XGBRegressor()
modelKNN = KNeighborsRegressor(n_neighbors=5)
modelETR = ExtraTreesRegressor()
modelMLPR = MLPRegressor()

# Evalution matrix for all the algorithms

MM = [modelmlg, modeldcr, modelrfr, modelABR, modelSVR, modelGBR, modelXGR, modelKNN, modelETR, modelMLPR]

for models in MM:
    
    # Fit the model with train data
    
    models.fit(x_train, y_train)
    
    # Predict the model with test data

    y_pred = models.predict(x_test)
    
    # Print the model name
    
    print('Model Name: ', models)
    
    # Evaluation metrics for Regression analysis

    from sklearn import metrics

    print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))  
    print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
    print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
    print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
    print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))
    
    # Define the function to calculate the MAPE - Mean Absolute Percentage Error

    def MAPE (y_test, y_pred):
        y_test, y_pred = np.array(y_test), np.array(y_pred)
        return np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    
    # Evaluation of MAPE 

    result = MAPE(y_test, y_pred)
    print('Mean Absolute Percentage Error (MAPE):', round(result, 2), '%')
    
    # Calculate Adjusted R squared values 

    r_squared = round(metrics.r2_score(y_test, y_pred),6)
    adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
    print('Adj R Square: ', adjusted_r_squared)
    print('------------------------------------------------------------------------------------------------------------')

Model Name:  LinearRegression()
Mean Absolute Error (MAE): 4154.671
Mean Squared Error (MSE): 33829389.389
Root Mean Squared Error (RMSE): 5816.304
R2_score: 0.769278
Root Mean Squared Log Error (RMSLE): 8.668
Mean Absolute Percentage Error (MAPE): 44.2 %
Adj R Square:  0.768238
------------------------------------------------------------------------------------------------------------
Model Name:  DecisionTreeRegressor()
Mean Absolute Error (MAE): 3023.045
Mean Squared Error (MSE): 40210410.475
Root Mean Squared Error (RMSE): 6341.168
R2_score: 0.725759
Root Mean Squared Log Error (RMSLE): 8.755
Mean Absolute Percentage Error (MAPE): 36.26 %
Adj R Square:  0.724523
------------------------------------------------------------------------------------------------------------
Model Name:  RandomForestRegressor()
Mean Absolute Error (MAE): 2564.416
Mean Squared Error (MSE): 21298749.685
Root Mean Squared Error (RMSE): 4615.057
R2_score: 0.854739
Root Mean Squared Log Error (RMSLE): 8.437
M