In [6]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split


In [2]:
import seaborn as sns


In [3]:
# Load seaborn's "tips" example dataset.
df = sns.load_dataset("tips")


In [4]:
# Preview the first rows of the dataset to check the available columns.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [7]:
# Features are every column except the regression target. Selecting by label
# (rather than by column position) means a change in column order can never
# leak the target into X.
X = df.drop(columns=["total_bill"])
# Target: the value the model is trained to predict.
y = df["total_bill"]

In [8]:
# Hold out 20% of the rows for evaluation. The fixed seed makes the split
# itself repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [9]:
# Pipeline for the numerical columns of X:
#   1. replace NaN entries with the column mean,
#   2. standardise the result with StandardScaler.
numeric_preprocessor = Pipeline(
    [
        (
            "imputation_mean",
            SimpleImputer(strategy="mean", missing_values=np.nan),
        ),
        ("scaler", StandardScaler()),
    ]
)

In [10]:
from sklearn import set_config


In [11]:
# Render estimators as diagrams when they are displayed in a cell.
set_config(display="diagram")


In [12]:
# Display the numeric preprocessing pipeline to inspect its steps.
numeric_preprocessor


In [13]:
# Pipeline for the categorical columns of X:
#   1. fill missing entries with the constant label "missing",
#   2. one-hot encode; handle_unknown="ignore" keeps transform from failing
#      on categories that were not present when the encoder was fitted.
categorical_preprocessor = Pipeline(
    [
        ("imputation_constant", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [14]:
# Display the categorical preprocessing pipeline to inspect its steps.
categorical_preprocessor

In [20]:
# Route each column group to its own preprocessing branch: the four
# categorical columns go through categorical_preprocessor and the two
# numeric columns through numeric_preprocessor.
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_preprocessor, ["sex", "smoker", "day", "time"]),
        ("numerical", numeric_preprocessor, ["tip", "size"]),
    ]
)

In [21]:
# Display the combined column transformer to inspect both branches.
preprocessor

In [22]:
# End-to-end model: preprocessing followed by a random forest regressor.
# The forest is seeded so that repeated fits on the same data produce the
# same trees and therefore the same predictions.
pipe = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("regressor", RandomForestRegressor(random_state=42)),
    ]
)

In [23]:
# Display the full pipeline (preprocessor + regressor).
pipe

In [24]:
# Fit the preprocessing steps and the regressor on the training split.
pipe.fit(X_train, y_train)


In [25]:
# Predict total_bill for the held-out rows.
pipe.predict(X_test)

array([21.61511   , 20.73435   , 14.54339333, 17.1546    , 25.2435    ,
       22.0813    , 15.3086    , 21.82701667, 24.37096667, 19.29635   ,
       13.87183333, 30.5057    , 11.38978167, 30.6734    , 15.7734    ,
       25.83386667, 12.12560833, 30.9281    , 20.7885    , 14.15039   ,
       22.3494    , 19.2466    , 13.8575    , 29.1572    , 19.101075  ,
       14.67403333, 19.744     , 12.16348   , 13.73894889, 27.1794    ,
       15.3086    , 15.31915   , 20.1731    , 27.97966667, 11.38978167,
       18.03446   , 17.41834167, 18.5511    , 11.74392   , 30.6734    ,
       16.6642    , 11.38978167, 22.3494    , 18.93554048, 26.4772    ,
       24.7206    , 31.9281    , 17.63545   , 12.16348   ])

In [30]:
# Hyper-parameter grid for the "regressor" step of the pipeline; the
# "regressor__" prefix routes each entry to that step.
# max_features uses 1.0 (consider all features) instead of the string "auto":
# "auto" was deprecated in scikit-learn 1.1 and removed in 1.3, where it
# raises an error. For a random forest *regressor* "auto" meant exactly
# "all features", so 1.0 preserves the original search space.
param_grid = {
    "regressor__n_estimators": [100, 500],
    "regressor__max_features": [1.0, "sqrt", "log2"],
    "regressor__max_depth": [3, 4, 5, 6, 7, 8],
}

In [31]:
# Exhaustive search over param_grid using the whole pipeline as the estimator;
# n_jobs=1 runs the candidate fits one at a time.
gridsearch = GridSearchCV(estimator=pipe, param_grid=param_grid, n_jobs=1)

In [32]:
# Run the search on the training split only, so the test split stays unseen.
gridsearch.fit(X_train, y_train)

In [33]:
# Hyper-parameter combination selected by the grid search.
gridsearch.best_params_

{'regressor__max_depth': 6,
 'regressor__max_features': 'log2',
 'regressor__n_estimators': 500}

In [36]:
# Rebuild the pipeline with the hyper-parameters chosen by the grid search.
# They are read from gridsearch.best_params_ instead of being copied by hand
# from the printed output, so this cell stays correct when the search result
# changes on a re-run. The keys already use the "regressor__<param>" form that
# Pipeline.set_params expects. The forest is seeded for repeatable fits.
pipe = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("regressor", RandomForestRegressor(random_state=42)),
    ]
).set_params(**gridsearch.best_params_)

In [37]:
# Display the rebuilt pipeline to confirm the regressor's settings.
pipe

In [38]:
# Fit the rebuilt pipeline on the training split.
pipe.fit(X_train, y_train)

In [39]:
# Predict total_bill for the held-out rows with the rebuilt pipeline.
pipe.predict(X_test)

array([22.07710487, 20.71011156, 18.07706855, 19.02147249, 24.44723596,
       21.07170395, 17.01285916, 21.91163583, 24.07594461, 17.66956308,
       17.67552894, 30.88138293, 12.28020437, 28.87966575, 15.70130053,
       24.94578639, 12.95086972, 29.64112668, 20.44743973, 16.63508518,
       24.25194588, 19.24026546, 15.41103323, 28.9608936 , 20.36615913,
       14.4348877 , 18.71136816, 13.67098321, 15.21135048, 24.71538517,
       17.01285916, 17.03218826, 20.00268722, 24.81499199, 12.28020437,
       19.6647839 , 17.14137094, 18.461655  , 15.68346879, 28.9147418 ,
       18.94465828, 12.28020437, 24.25194588, 16.29452935, 25.00503892,
       24.49924398, 31.19081217, 16.17587421, 13.67098321])