In [1]:
# Importing the libraries 

import pandas as pd   #for data analysis
import numpy as np #for mathematical analysis
import matplotlib.pyplot as plt ##for data visualization
import seaborn as sns      ##for data visualization
%matplotlib inline

# Ignore harmless warnings 

import warnings 
warnings.filterwarnings("ignore")

# Set to display all the columns in dataset

pd.set_option("display.max_columns", None)

# Import psql to run queries 

import pandasql as psql

In [2]:
# Load the bank data

insurence_data = pd.read_csv(r"C:\Users\srila\Downloads\ml-intern\test1\Health_Ins_Test01_V1.0 (1).csv", header=0)

# Copy to back-up files

insurencedata_bk = insurence_data.copy()

# Display first 5 records

insurence_data.head(10)

Unnamed: 0,Record_ID,Age,Gender,BMI,Children,Smoker,Region,Expenses
0,QK-13627690613,43,male,36.25,1,yes,southeast,33681.04
1,NR-12612055325,40,male,34.56,2,no,southeast,20945.63
2,HY-18293606714,48,male,26.04,5,no,southwest,7568.35
3,HF-14244542287,50,male,31.09,3,yes,northwest,32904.57
4,NM-18369314880,42,male,33.04,1,yes,northeast,31770.85
5,KP-15561423566,47,male,24.53,2,no,northeast,4336.27
6,MN-17465674285,63,female,31.8,0,no,southwest,9089.95
7,XU-19691439726,42,female,32.18,1,no,northeast,3997.21
8,UI-12873027740,33,female,29.57,1,no,southeast,2185.26
9,TY-16973081880,64,male,32.18,3,yes,northwest,40240.69


In [3]:
insurence_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5619 entries, 0 to 5618
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Record_ID  5619 non-null   object 
 1   Age        5619 non-null   int64  
 2   Gender     5616 non-null   object 
 3   BMI        5619 non-null   float64
 4   Children   5616 non-null   object 
 5   Smoker     5619 non-null   object 
 6   Region     5619 non-null   object 
 7   Expenses   5619 non-null   float64
dtypes: float64(2), int64(1), object(5)
memory usage: 351.3+ KB


# data preprocessing 

In [4]:
#cheching null values

insurence_data.isnull().sum()

Record_ID    0
Age          0
Gender       3
BMI          0
Children     3
Smoker       0
Region       0
Expenses     0
dtype: int64

In [6]:
#checking for duplicate values
insurence_data.duplicated().any()

True

In [7]:
#if is there any duplicate values then drop them 
insurence_data.drop_duplicates()

Unnamed: 0,Record_ID,Age,Gender,BMI,Children,Smoker,Region,Expenses
0,QK-13627690613,43,male,36.25,1,yes,southeast,33681.04
1,NR-12612055325,40,male,34.56,2,no,southeast,20945.63
2,HY-18293606714,48,male,26.04,5,no,southwest,7568.35
3,HF-14244542287,50,male,31.09,3,yes,northwest,32904.57
4,NM-18369314880,42,male,33.04,1,yes,northeast,31770.85
...,...,...,...,...,...,...,...,...
5614,BA-19857717445,48,male,31.10,0,yes,southeast,66993.73
5615,TA-14363035652,54,female,47.40,0,yes,southeast,77348.43
5616,ZA-18021224281,48,male,30.96,0,yes,southeast,86263.34
5617,JA-12676703168,49,male,30.64,0,yes,southeast,86420.79


In [8]:
insurence_data.shape

(5619, 8)

In [10]:
#filling misiimg values 

from sklearn.impute import SimpleImputer

# SimpleImputer(missing_values=nan, strategy='mean', fill_value=None, verbose='deprecated', copy=True, 
# add_indicator=False, keep_empty_features=False)

imputer_si = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

insurence_data['Children'] = imputer_si.fit_transform(insurence_data[['Children']])
insurence_data['Gender'] = imputer_si.fit_transform(insurence_data[['Gender']])


In [11]:
insurence_data.isnull().sum()

Record_ID    0
Age          0
Gender       0
BMI          0
Children     0
Smoker       0
Region       0
Expenses     0
dtype: int64

In [12]:
# Character variables Gender to encode by using LabelEncoder

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

insurence_data['Gender'] = le.fit_transform(insurence_data['Gender'])
insurence_data['Children'] = le.fit_transform(insurence_data['Children'])
insurence_data['Smoker'] = le.fit_transform(insurence_data['Smoker'])
insurence_data['Region'] = le.fit_transform(insurence_data['Region'])

In [13]:
#deleting the un influencing columns on target variable
del insurence_data['Record_ID']

In [14]:
insurence_data.head()

Unnamed: 0,Age,Gender,BMI,Children,Smoker,Region,Expenses
0,43,1,36.25,1,2,2,33681.04
1,40,1,34.56,2,1,2,20945.63
2,48,1,26.04,5,1,3,7568.35
3,50,1,31.09,3,2,1,32904.57
4,42,1,33.04,1,2,0,31770.85


In [15]:
insurence_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5619 entries, 0 to 5618
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       5619 non-null   int64  
 1   Gender    5619 non-null   int32  
 2   BMI       5619 non-null   float64
 3   Children  5619 non-null   int32  
 4   Smoker    5619 non-null   int32  
 5   Region    5619 non-null   int32  
 6   Expenses  5619 non-null   float64
dtypes: float64(2), int32(4), int64(1)
memory usage: 219.6 KB


In [17]:
# Identify the Independent and Target variables

IndepVar = []
for col in insurence_data.columns:
    if col != 'Expenses':
        IndepVar.append(col)

TargetVar = 'Expenses'

x =insurence_data[IndepVar]
y =insurence_data[TargetVar]

In [18]:
# Split the data into train and test

from sklearn.model_selection import train_test_split 

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=143)

# Display the shape of the train_data and test_data

x_train.shape, x_test.shape, y_train.shape, y_test.shape

((3933, 6), (1686, 6), (3933,), (1686,))

In [19]:
# Scaling the features by using MinMaxScaler

from sklearn.preprocessing import MinMaxScaler

mmscaler = MinMaxScaler(feature_range=(0, 1))

x_train = mmscaler.fit_transform(x_train)
x_train = pd.DataFrame(x_train)

x_test = mmscaler.fit_transform(x_test)
x_test = pd.DataFrame(x_test)

In [21]:
# Load the result dataset

RGRResults = pd.read_csv(r"C:\Users\srila\Downloads\ml-intern\categorical\RGRResults.csv", header=0)

RGRResults.head()

Unnamed: 0,Model Name,Mean_Absolute_Error_MAE,Adj_R_Square,Root_Mean_Squared_Error_RMSE,Mean_Absolute_Percentage_Error_MAPE,Mean_Squared_Error_MSE,Root_Mean_Squared_Log_Error_RMSLE,R2_score


In [22]:
# Build the Regression / Regressor models

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import BayesianRidge

from sklearn.svm import SVR
#import xgboost as xgb
from sklearn.neighbors import KNeighborsRegressor
#from sklearn.linear_model import BayesianRidge
#from sklearn.ensemble import BaggingRegressor
#from sklearn.ensemble import GradientBoostingRegressor

# Create objects of Regression / Regressor models with default hyper-parameters

ModelMLR = LinearRegression()
ModelDCR = DecisionTreeRegressor()
ModelRFR = RandomForestRegressor()
ModelETR = ExtraTreesRegressor()
ModelBRR = BayesianRidge()

ModelSVR = SVR()
#modelXGR = xgb.XGBRegressor()
ModelKNN = KNeighborsRegressor(n_neighbors=5)
#modelBRR = BayesianRidge()
#modelBGR = BaggingRegressor()
#modelGBR = GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0,
#                                     criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1,
#                                     min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0,
#                                     min_impurity_split=None, init=None, random_state=None, max_features=None,
#                                     alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False,
#                                     validation_fraction=0.1, n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0)

# Evalution matrix for all the algorithms

#MM = [modelmlg, modeldcr, modelrfr, modelSVR, modelXGR, modelKNN, modelETR, modelBRR, modelBGR, modelGBR]
MM = [ModelMLR, ModelDCR, ModelRFR, ModelETR, ModelBRR, ModelSVR, ModelKNN]

for models in MM:
    
    # Fit the model with train data
    
    models.fit(x_train, y_train)
    
    # Predict the model with test data

    y_pred = models.predict(x_test)
    
    # Print the model name
    
    print('Model Name: ', models)
    
    # Evaluation metrics for Regression analysis

    from sklearn import metrics

    print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))  
    print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
    print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
    print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
    print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))
    
    # Define the function to calculate the MAPE - Mean Absolute Percentage Error

    def MAPE (y_test, y_pred):
        y_test, y_pred = np.array(y_test), np.array(y_pred)
        return np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    
    # Evaluation of MAPE 

    result = MAPE(y_test, y_pred)
    print('Mean Absolute Percentage Error (MAPE):', round(result, 2), '%')
    
    # Calculate Adjusted R squared values 

    r_squared = round(metrics.r2_score(y_test, y_pred),6)
    adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
    print('Adj R Square: ', adjusted_r_squared)
    print('------------------------------------------------------------------------------------------------------------')
    #-------------------------------------------------------------------------------------------
    new_row = {'Model Name' : models,
               'Mean_Absolute_Error_MAE' : metrics.mean_absolute_error(y_test, y_pred),
               'Adj_R_Square' : adjusted_r_squared,
               'Root_Mean_Squared_Error_RMSE' : np.sqrt(metrics.mean_squared_error(y_test, y_pred)),
               'Mean_Absolute_Percentage_Error_MAPE' : result,
               'Mean_Squared_Error_MSE' : metrics.mean_squared_error(y_test, y_pred),
               'Root_Mean_Squared_Log_Error_RMSLE': np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),
               'R2_score' : metrics.r2_score(y_test, y_pred)}
    RGRResults = RGRResults.append(new_row, ignore_index=True)

Model Name:  LinearRegression()
Mean Absolute Error (MAE): 4517.814
Mean Squared Error (MSE): 42371675.876
Root Mean Squared Error (RMSE): 6509.353
R2_score: 0.691468
Root Mean Squared Log Error (RMSLE): 8.781
Mean Absolute Percentage Error (MAPE): 54.87 %
Adj R Square:  0.691138
------------------------------------------------------------------------------------------------------------
Model Name:  DecisionTreeRegressor()
Mean Absolute Error (MAE): 2156.752
Mean Squared Error (MSE): 27292105.725
Root Mean Squared Error (RMSE): 5224.185
R2_score: 0.801271
Root Mean Squared Log Error (RMSLE): 8.561
Mean Absolute Percentage Error (MAPE): 31.46 %
Adj R Square:  0.801059
------------------------------------------------------------------------------------------------------------
Model Name:  RandomForestRegressor()
Mean Absolute Error (MAE): 1885.8
Mean Squared Error (MSE): 12615016.128
Root Mean Squared Error (RMSE): 3551.762
R2_score: 0.908143
Root Mean Squared Log Error (RMSLE): 8.175
Me

In [23]:
RGRResults.head(10)
##we get better results by using ExtraTreeRegressor,DecisionTreeRegressor based on R2-SCORE,MAE,MSE,MAPE

Unnamed: 0,Model Name,Mean_Absolute_Error_MAE,Adj_R_Square,Root_Mean_Squared_Error_RMSE,Mean_Absolute_Percentage_Error_MAPE,Mean_Squared_Error_MSE,Root_Mean_Squared_Log_Error_RMSLE,R2_score
0,LinearRegression(),4517.81437,0.691138,6509.352954,54.87431,42371675.876282,8.780995,0.691468
1,DecisionTreeRegressor(),2156.751986,0.801059,5224.184695,31.460692,27292105.725249,8.561054,0.801271
2,"(DecisionTreeRegressor(max_features='auto', ra...",1885.800172,0.908045,3551.762397,26.985581,12615016.127714,8.175199,0.908143
3,"(ExtraTreeRegressor(random_state=775594166), E...",1468.947451,0.933106,3029.353098,22.779824,9176980.191981,8.016104,0.933177
4,BayesianRidge(),4518.758025,0.691077,6509.989411,54.890171,42379962.136875,8.781093,0.691407
5,SVR(),7942.770822,-0.143359,12524.105409,77.734149,156853216.283727,9.43541,-0.142138
6,KNeighborsRegressor(),1979.678132,0.896693,3764.616251,25.345335,14172335.514459,8.233401,0.896803


In [24]:
# Predict the values with ET algorithm

y_predF = ModelETR.predict(x_test)

In [25]:
Results = pd.DataFrame({'Expenses_A':y_test, 'Expenses_p':y_predF})

# Merge two Dataframes on index of both the dataframes

ResultsFinal = insurence_data.merge(Results, left_index=True, right_index=True)
ResultsFinal.sample(5)

Unnamed: 0,Age,Gender,BMI,Children,Smoker,Region,Expenses,Expenses_A,Expenses_p
4594,42,1,31.42,3,2,0,33871.7,33871.7,32112.4185
2449,52,0,33.83,0,1,2,4010.72,4010.72,4343.0906
4831,46,1,28.57,3,2,1,29907.52,29907.52,24741.0722
613,22,0,28.88,5,1,3,2888.72,2888.72,2755.6003
4041,60,1,35.01,0,1,1,7781.44,7781.44,8491.3478


In [26]:
# Calculate the %of Error

ResultsFinal['%Error'] = round(((ResultsFinal['Expenses_A']-ResultsFinal['Expenses_p'])/ResultsFinal['Expenses_A'])*100,3)

In [27]:
# Display the results

ResultsFinal.sample(5)

Unnamed: 0,Age,Gender,BMI,Children,Smoker,Region,Expenses,Expenses_A,Expenses_p,%Error
4946,65,1,30.26,0,1,1,10541.6,10541.6,9607.1661,8.864
4032,22,0,22.73,0,2,1,10475.62,10475.62,10692.4037,-2.069
238,54,0,30.81,1,1,1,5843.36,5843.36,5974.367,-2.242
3476,63,0,26.55,0,1,1,26344.65,26344.65,24604.9461,6.604
226,51,1,30.3,3,2,3,20374.51,20374.51,20592.7759,-1.071
