Model Training


In [4]:
import pandas as pd
df=pd.read_csv('insurance.csv')

In [5]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [6]:
X=df.drop(columns=['expenses'])

In [7]:
X.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.8,1,no,southeast
2,28,male,33.0,3,no,southeast
3,33,male,22.7,0,no,northwest
4,32,male,28.9,0,no,northwest


In [8]:
y=df[['expenses']]

In [9]:
y.head()

Unnamed: 0,expenses
0,16884.92
1,1725.55
2,4449.46
3,21984.47
4,3866.86


In [10]:
#define which columns should be encoded and which should be scaled
categorical_col=X.select_dtypes(include='object').columns
numerical_cols=X.select_dtypes(exclude='object').columns

In [11]:
categorical_col

Index(['sex', 'smoker', 'region'], dtype='object')

In [12]:
from sklearn.impute import SimpleImputer ##handling missing values
from sklearn.preprocessing import StandardScaler ## handling feature scaling
from sklearn.preprocessing import OneHotEncoder ## encoding categorical features

##pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [13]:
## numerical pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())
    ]

)
#catagorical pipeline
categorical_Pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('OneHotEncoder',OneHotEncoder(sparse=False)),
    ('scaler',StandardScaler())
    ]
)

preprocessor=ColumnTransformer([
    ('num_pipeline',num_pipeline,numerical_cols),
    ('cat_pipeline',categorical_Pipeline,categorical_col)
])

In [14]:
## train test split

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=42)

In [15]:
x_train=pd.DataFrame(preprocessor.fit_transform(x_train),columns=preprocessor.get_feature_names_out())
x_test=pd.DataFrame(preprocessor.transform(x_test),columns=preprocessor.get_feature_names_out())



In [16]:
x_train.head()

Unnamed: 0,num_pipeline__age,num_pipeline__bmi,num_pipeline__children,cat_pipeline__sex_female,cat_pipeline__sex_male,cat_pipeline__smoker_no,cat_pipeline__smoker_yes,cat_pipeline__region_northeast,cat_pipeline__region_northwest,cat_pipeline__region_southeast,cat_pipeline__region_southwest
0,0.472227,-1.748572,0.734336,1.024602,-1.024602,0.508747,-0.508747,-0.576631,1.783168,-0.599661,-0.572314
1,0.543313,-1.036704,-0.911192,1.024602,-1.024602,0.508747,-0.508747,1.734212,-0.5608,-0.599661,-0.572314
2,0.898745,-0.937373,-0.911192,1.024602,-1.024602,0.508747,-0.508747,-0.576631,-0.5608,1.667609,-0.572314
3,-0.025379,0.618804,3.202629,1.024602,-1.024602,0.508747,-0.508747,-0.576631,-0.5608,1.667609,-0.572314
4,1.040918,-1.500246,1.5571,1.024602,-1.024602,0.508747,-0.508747,-0.576631,1.783168,-0.599661,-0.572314


In [21]:
## model training

from sklearn.linear_model import LinearRegression,ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

In [19]:
import numpy as np
def evaluate_model(true,predicted):
    mae=mean_absolute_error(true,predicted)
    return mae

In [22]:
#train multiple models
#model evaluation
models={
    'LinearRegression':LinearRegression(),
    'ElasticNet':ElasticNet(),
    'RandomForestRegressor':RandomForestRegressor(),
    'DecisionTreeRegressor':DecisionTreeRegressor()
}
trained_model_list=[]
model_list=[]
MAE_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(x_train,y_train)

    #make predictions
    y_pred=model.predict(x_test)
    mae=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    print(mae)

    MAE_list.append(mae)

    print('='*35)
    print('\n')

LinearRegression
4210.797380074843


ElasticNet
4821.766820054969




  return fit_method(estimator, *args, **kwargs)


RandomForestRegressor
2576.4921389179112


DecisionTreeRegressor
2926.9019029850747


