In [16]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import VotingRegressor

from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector, make_column_transformer

import warnings
warnings.filterwarnings("ignore")


In [17]:
# NOTE: absolute, machine-specific location of the insurance dataset;
# adjust this path when running the notebook on another machine.
insurance_csv = r'C:\Users\DAI.STUDENTSDC\Desktop\Machine Learning\Data Sets\Cases\Medical Cost Personal\insurance.csv'
df = pd.read_csv(insurance_csv)

# Preview the first rows to check that the file parsed as expected.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [18]:
# Target: the 'charges' column. Features: every remaining column.
y = df['charges']
X = df.drop(columns=['charges'])

In [19]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=24,
    test_size=0.3,
)

In [21]:
# Summarise the training features: column names, non-null counts and dtypes.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 936 entries, 476 to 418
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       936 non-null    int64  
 1   sex       936 non-null    object 
 2   bmi       936 non-null    float64
 3   children  936 non-null    int64  
 4   smoker    936 non-null    object 
 5   region    936 non-null    object 
dtypes: float64(1), int64(2), object(3)
memory usage: 51.2+ KB


GridSearch: creating pipelines so that each estimator can use the ColumnTransformer for preprocessing

In [22]:
# Dtypes that are treated as categorical and therefore one-hot encoded.
categorical_dtypes = ['object', 'category']

# Dense one-hot encoder that drops the first level of every feature and
# returns a pandas DataFrame (set_output configures the encoder in place).
ohe = OneHotEncoder(drop='first', sparse_output=False)
ohe.set_output(transform='pandas')

# Column selectors: everything that is NOT categorical passes through
# untouched, everything that IS categorical goes to the encoder.
non_categorical_cols = make_column_selector(dtype_exclude=categorical_dtypes)
categorical_cols = make_column_selector(dtype_include=categorical_dtypes)

ct = make_column_transformer(
    ('passthrough', non_categorical_cols),
    (ohe, categorical_cols),
    verbose_feature_names_out=False,
)
ct.set_output(transform='pandas')


In [None]:
# Base regressors; the seeded ones use random_state=24.
lr = LinearRegression()
el = ElasticNet(random_state=24)
dtr = DecisionTreeRegressor(random_state=24)

# One pipeline per regressor: column transformer first, then the model.
pipe_lr = Pipeline(steps=[('CT', ct), ('LR', lr)])
pipe_el = Pipeline(steps=[('CT', ct), ('EL', el)])
pipe_dtr = Pipeline(steps=[('CT', ct), ('DT', dtr)])


In [26]:
# Ensemble whose members are the full preprocessing + model pipelines.
ensemble_members = [
    ('DTR', pipe_dtr),
    ('EN', pipe_el),
    ('LR', pipe_lr),
]
voting = VotingRegressor(estimators=ensemble_members)
voting.fit(X_train, y_train)

# Score the ensemble on the held-out test set.
y_pred = voting.predict(X_test)
print(r2_score(y_test, y_pred))

0.7495467414729552


In [27]:
# Same three pipelines, but with per-estimator weights:
# 0.7 for the decision tree, 0.15 each for ElasticNet and linear regression.
ensemble_members = [
    ('DTR', pipe_dtr),
    ('EN', pipe_el),
    ('LR', pipe_lr),
]
member_weights = [0.7, 0.15, 0.15]
voting = VotingRegressor(estimators=ensemble_members, weights=member_weights)
voting.fit(X_train, y_train)

# Score the weighted ensemble on the held-out test set.
y_pred = voting.predict(X_test)
print(r2_score(y_test, y_pred))

0.7605378297033076


In [None]:
# Unweighted ensemble again (same estimators as the first VotingRegressor
# cell). Running this rebinds `voting` and `y_pred`, replacing the weighted
# ensemble's results.
ensemble_members = [
    ('DTR', pipe_dtr),
    ('EN', pipe_el),
    ('LR', pipe_lr),
]
voting = VotingRegressor(estimators=ensemble_members)
voting.fit(X_train, y_train)

y_pred = voting.predict(X_test)
print(r2_score(y_test, y_pred))

Tuning each algorithm's hyperparameters with GridSearchCV (5-fold cross-validation)

In [37]:
from sklearn.model_selection import KFold

# Shuffled 5-fold splitter with a fixed seed, so every search that receives
# it evaluates candidates on the same folds.
kfold = KFold(
    n_splits=5,
    random_state=24,
    shuffle=True,
)

In [44]:
from sklearn.model_selection import GridSearchCV

# Plain linear regression behind the shared column transformer.
lr = LinearRegression()
pipe_lr = Pipeline(steps=[('CT', ct), ('LR', lr)])

# The empty grid means a single candidate (the pipeline as configured) is
# cross-validated; this gives a baseline comparable with the tuned models.
gcv_lr = GridSearchCV(pipe_lr, {}, scoring='r2', cv=kfold)
gcv_lr.fit(X_train, y_train)

# Evaluate the refitted pipeline on the held-out test set.
best_lr = gcv_lr.best_estimator_
y_pred = best_lr.predict(X_test)
print("r2_score: ", r2_score(y_test, y_pred))


r2_score:  0.7665391799816874


In [49]:
# ElasticNet behind the shared column transformer.
el = ElasticNet(random_state=24)
pipe_el = Pipeline(steps=[('ct', ct), ('EL', el)])

# Search space: 5 L1/L2 mixing ratios x 20 evenly spaced penalty strengths.
alpha_grid = np.linspace(0.001, 5, 20)
params = {
    'EL__l1_ratio': [0.2, 0.4, 0.6, 0.8, 1.0],
    'EL__alpha': alpha_grid,
}

gcv_el = GridSearchCV(pipe_el, params, scoring='r2', cv=kfold)
gcv_el.fit(X_train, y_train)

# Evaluate the best refitted pipeline on the held-out test set.
best_el = gcv_el.best_estimator_
y_pred = best_el.predict(X_test)
print("r2_score: ", r2_score(y_test, y_pred))

r2_score:  0.7660471539571693


In [48]:
# Decision-tree regressor behind the shared column transformer.
dtr = DecisionTreeRegressor(random_state=24)

pipe_dtr = Pipeline([
    ('ct', ct),
    ('DT', dtr),
])

# Hyperparameter grid for the tree (3 x 3 x 3 = 27 candidates).
# min_samples_split must be an int >= 2 (or a float in (0.0, 1.0]); the
# previous value of 1 is rejected by scikit-learn, so every candidate using
# it failed to fit and was scored NaN. Because warnings are silenced at the
# top of the notebook, a third of the grid was being discarded without any
# visible message. 2 is the smallest valid setting.
params = {
    'DT__max_depth': [None, 2, 3],
    'DT__min_samples_leaf':  [2, 10, 20],
    'DT__min_samples_split':  [2, 10, 20],
}

gcv_dtr = GridSearchCV(
    estimator=pipe_dtr,
    param_grid=params,
    cv=kfold,
    scoring='r2'
)

gcv_dtr.fit(X_train, y_train)

# Evaluate the best refitted pipeline on the held-out test set.
y_pred = gcv_dtr.best_estimator_.predict(X_test)
print("r2_score: ", r2_score(y_test, y_pred))

r2_score:  0.8709966420482848
