In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

### STEP-1 Data Collection

In [2]:
path = os.path.join('../data/', 'insurance.csv')
df = pd.read_csv(path)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


### STEP-2 PREPROCESSING

In [3]:
# Checking missing values
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [4]:
# Seprating X and y
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [5]:
# Extracting numeric and categorical columns
num_col = X.select_dtypes(exclude='object').columns
cate_col = X.select_dtypes(include='object').columns

In [6]:
num_pipeline = Pipeline(
    [
        ('StandardScaler', StandardScaler())
    ]
)
cate_pipeline = Pipeline(
    [
        ('OneHotEncoder', OneHotEncoder())
    ]
)
preprocessor = ColumnTransformer(
    [
        ('num_pipline', num_pipeline, num_col),
        ('cate_pipline', cate_pipeline, cate_col)
    ]
)
X_df = preprocessor.fit_transform(X)
y = np.log(y)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_df,y, test_size=.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1070, 11), (268, 11), (1070,), (268,))

In [8]:
models = {
    "LinearRegression":LinearRegression(),
    "Lasso":Lasso(),
    "Ridge":Ridge(),
    "KNeighborsRegressor":KNeighborsRegressor(),
    "DecisionTreeRegressor":DecisionTreeRegressor(),
    "RandomForestRegressor":RandomForestRegressor(),
    "AdaBoostRegressor":AdaBoostRegressor(),
    "XGBRegressor":XGBRegressor(),
    "CatBoostRegressor":CatBoostRegressor(verbose=False)
}

In [9]:
# Model training and train data evaluation
def model_trian(X_train, y_train, models):
    model_name = []
    train_r2 = []
    train_mse = []
    train_rmse = []
    train_mape = []
    train_acc = []
    train_result_df = pd.DataFrame()
    trained_models = {}

    for each in list(models.keys()):
        model = models[each]
        model.fit(X_train, y_train)
        pred = model.predict(X_train)
        model_name.append(each)
        train_r2.append(r2_score(y_train, pred))
        train_mse.append(mean_squared_error(y_train, pred))
        train_rmse.append(mean_squared_error(y_train, pred, squared=False))
        train_mape.append(mean_absolute_percentage_error(y_train, pred))
        train_acc.append(100 - mean_absolute_percentage_error(y_train, pred)*100)
        trained_models[each] = model

    train_result_df["Model_name"] = model_name
    train_result_df["r2_score"] = train_r2
    train_result_df["mse"] = train_mse
    train_result_df["rmse"] = train_rmse
    train_result_df["mape"] = train_mape
    train_result_df["accuracy"] = train_acc
    
    return (train_result_df, trained_models)

In [10]:
train_result_df, trained_models = model_trian(X_train, y_train, models)
train_result_df.sort_values("accuracy", ascending=False)

Unnamed: 0,Model_name,r2_score,mse,rmse,mape,accuracy
4,DecisionTreeRegressor,0.994506,0.004587,0.06773,0.000486,99.951377
7,XGBRegressor,0.991195,0.007351,0.085739,0.003629,99.63715
5,RandomForestRegressor,0.97445,0.021332,0.146056,0.007602,99.239779
8,CatBoostRegressor,0.955982,0.036752,0.191709,0.010761,98.923879
3,KNeighborsRegressor,0.865668,0.112159,0.334902,0.022792,97.720783
0,LinearRegression,0.779867,0.183798,0.428716,0.029254,97.074617
2,Ridge,0.779863,0.183802,0.428721,0.029286,97.071431
6,AdaBoostRegressor,0.720047,0.233744,0.483471,0.045011,95.498918
1,Lasso,0.0,0.83494,0.913751,0.082719,91.728122


In [11]:
# Model testing and test data evaluation
def model_evaluation(X_test, y_test, trained_models):
    model_name = []
    test_r2 = []
    test_mse = []
    test_rmse = []
    test_mape = []
    test_acc = []
    test_result_df = pd.DataFrame()

    for each in list(trained_models.keys()):
        model = trained_models[each]
        pred = model.predict(X_test)
        model_name.append(each)
        test_r2.append(r2_score(y_test, pred))
        test_mse.append(mean_squared_error(y_test, pred))
        test_rmse.append(mean_squared_error(y_test, pred, squared=False))
        test_mape.append(mean_absolute_percentage_error(y_test, pred))
        test_acc.append(100 - mean_absolute_percentage_error(y_test, pred)*100)

    test_result_df["Model_name"] = model_name
    test_result_df["r2_score"] = test_r2
    test_result_df["mse"] = test_mse
    test_result_df["rmse"] = test_rmse
    test_result_df["mape"] = test_mape
    test_result_df["accuracy"] = test_acc
    
    return test_result_df 

In [18]:
test_result_df = model_evaluation(X_test, y_test, trained_models)
test_result_df.sort_values('accuracy', ascending=False)

Unnamed: 0,Model_name,r2_score,mse,rmse,mape,accuracy
5,RandomForestRegressor,0.744747,0.225796,0.47518,0.023805,97.619462
8,CatBoostRegressor,0.747507,0.223355,0.472605,0.023889,97.611089
4,DecisionTreeRegressor,0.632428,0.325153,0.570222,0.026487,97.351313
7,XGBRegressor,0.716295,0.250965,0.500964,0.027505,97.249547
0,LinearRegression,0.722066,0.24586,0.495843,0.03329,96.67098
2,Ridge,0.722241,0.245705,0.495687,0.033306,96.669399
3,KNeighborsRegressor,0.672177,0.289991,0.538508,0.034397,96.560268
6,AdaBoostRegressor,0.651972,0.307865,0.554856,0.049799,95.020126
1,Lasso,-7.9e-05,0.884669,0.940568,0.086416,91.358357


In [21]:
# best model name and result
print(test_result_df[test_result_df['accuracy'] == test_result_df['accuracy'].max()])

              Model_name  r2_score       mse     rmse      mape   accuracy
5  RandomForestRegressor  0.744747  0.225796  0.47518  0.023805  97.619462


- Conclusion all the models are perfoming good and the best preforming model is RandomforestRegressor with accuracy of 97.61%.