In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

### STEP-1 Data Collection

In [2]:
path = os.path.join('../data/', 'insurance.csv')
df = pd.read_csv(path)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


### STEP-2 PREPROCESSING

In [3]:
# Checking missing values
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [4]:
# Seprating X and y
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [5]:
# Extracting numeric and categorical columns
num_col = X.select_dtypes(exclude='object').columns
cate_col = X.select_dtypes(include='object').columns

In [6]:
num_pipeline = Pipeline(
    [
        ('StandardScaler', StandardScaler())
    ]
)
cate_pipeline = Pipeline(
    [
        ('OneHotEncoder', OneHotEncoder())
    ]
)
preprocessor = ColumnTransformer(
    [
        ('num_pipline', num_pipeline, num_col),
        ('cate_pipline', cate_pipeline, cate_col)
    ]
)
X_df = preprocessor.fit_transform(X)
y = np.log(y)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_df,y, test_size=.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1070, 11), (268, 11), (1070,), (268,))

In [8]:
models = {
    "LinearRegression":LinearRegression(),
    "Lasso":Lasso(),
    "Ridge":Ridge(),
    "KNeighborsRegressor":KNeighborsRegressor(),
    "DecisionTreeRegressor":DecisionTreeRegressor(),
    "RandomForestRegressor":RandomForestRegressor(),
    "AdaBoostRegressor":AdaBoostRegressor(),
    "XGBRegressor":XGBRegressor(),
    "CatBoostRegressor":CatBoostRegressor(verbose=False)
}

In [9]:
# Model training and train data evaluation
def model_trian(X_train, y_train, models):
    model_name = []
    train_r2 = []
    train_mse = []
    train_rmse = []
    train_mape = []
    train_acc = []
    train_result_df = pd.DataFrame()
    trained_models = {}

    for each in list(models.keys()):
        model = models[each]
        model.fit(X_train, y_train)
        pred = model.predict(X_train)
        model_name.append(each)
        train_r2.append(r2_score(y_train, pred))
        train_mse.append(mean_squared_error(y_train, pred))
        train_rmse.append(mean_squared_error(y_train, pred, squared=False))
        train_mape.append(mean_absolute_percentage_error(y_train, pred))
        train_acc.append(100 - mean_absolute_percentage_error(y_train, pred)*100)
        trained_models[each] = model

    train_result_df["Model_name"] = model_name
    train_result_df["r2_score"] = train_r2
    train_result_df["mse"] = train_mse
    train_result_df["rmse"] = train_rmse
    train_result_df["mape"] = train_mape
    train_result_df["accuracy"] = train_acc
    
    return (train_result_df, trained_models)

In [10]:
train_result_df, trained_models = model_trian(X_train, y_train, models)
train_result_df.sort_values("accuracy", ascending=False)

Unnamed: 0,Model_name,r2_score,mse,rmse,mape,accuracy
4,DecisionTreeRegressor,0.997422,0.002178,0.046667,0.000242,99.975807
7,XGBRegressor,0.994476,0.004666,0.068309,0.003677,99.632277
5,RandomForestRegressor,0.973732,0.022187,0.148954,0.00795,99.20505
8,CatBoostRegressor,0.951044,0.04135,0.203347,0.011693,98.830719
3,KNeighborsRegressor,0.845985,0.130087,0.360676,0.023765,97.623547
0,LinearRegression,0.770381,0.193946,0.440393,0.029436,97.056447
2,Ridge,0.770376,0.19395,0.440397,0.029469,97.053146
6,AdaBoostRegressor,0.747126,0.213587,0.462155,0.040554,95.944598
1,Lasso,0.0,0.844641,0.919044,0.083769,91.623078


In [11]:
# Model testing and test data evaluation
def model_evaluation(X_test, y_test, trained_models):
    model_name = []
    test_r2 = []
    test_mse = []
    test_rmse = []
    test_mape = []
    test_acc = []
    test_result_df = pd.DataFrame()

    for each in list(trained_models.keys()):
        model = trained_models[each]
        pred = model.predict(X_test)
        model_name.append(each)
        test_r2.append(r2_score(y_test, pred))
        test_mse.append(mean_squared_error(y_test, pred))
        test_rmse.append(mean_squared_error(y_test, pred, squared=False))
        test_mape.append(mean_absolute_percentage_error(y_test, pred))
        test_acc.append(100 - mean_absolute_percentage_error(y_test, pred)*100)

    test_result_df["Model_name"] = model_name
    test_result_df["r2_score"] = test_r2
    test_result_df["mse"] = test_mse
    test_result_df["rmse"] = test_rmse
    test_result_df["mape"] = test_mape
    test_result_df["accuracy"] = test_acc
    
    return test_result_df 

In [12]:
test_result_df = model_evaluation(X_test, y_test, trained_models)
test_result_df.sort_values('accuracy', ascending=False)

Unnamed: 0,Model_name,r2_score,mse,rmse,mape,accuracy
5,RandomForestRegressor,0.785934,0.180215,0.424517,0.023501,97.649868
8,CatBoostRegressor,0.79663,0.17121,0.413775,0.023506,97.64942
4,DecisionTreeRegressor,0.692832,0.258594,0.508522,0.025502,97.449825
7,XGBRegressor,0.767232,0.19596,0.442673,0.026487,97.351268
0,LinearRegression,0.753518,0.207505,0.455527,0.03049,96.951043
2,Ridge,0.753403,0.207602,0.455633,0.030519,96.948122
3,KNeighborsRegressor,0.75056,0.209995,0.458252,0.030879,96.912054
6,AdaBoostRegressor,0.714447,0.240397,0.490303,0.041941,95.805883
1,Lasso,-0.00603,0.846942,0.920294,0.081882,91.81178


In [13]:
# best model name and result
print(test_result_df[test_result_df['accuracy'] == test_result_df['accuracy'].max()])

              Model_name  r2_score       mse      rmse      mape   accuracy
5  RandomForestRegressor  0.785934  0.180215  0.424517  0.023501  97.649868


- Conclusion all the models are perfoming good and the best preforming model is RandomforestRegressor with accuracy of 97.61%.