In [32]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [33]:
import seaborn as sns

In [34]:
df=sns.load_dataset('tips')

In [35]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [36]:
X = df.iloc[:,1:]
y = df['total_bill']

In [37]:
from sklearn.model_selection import train_test_split

In [38]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [39]:
## Pipeline
numeric_preprocessor = Pipeline(
    steps=[
        ("imputation_mean", SimpleImputer(missing_values = np.nan,strategy = "mean")),
        ("scaler",StandardScaler())
    ])

In [40]:
categorical_preprocessor = Pipeline(
    steps = [
        (
            "imputation_constant",
            SimpleImputer(fill_value = "missing",strategy = "constant"),
        ),
        ("onehot",OneHotEncoder(handle_unknown = "ignore")),
        
    ])

In [41]:
preprocessor = ColumnTransformer(
    [
        ("categorical",categorical_preprocessor,["sex","smoker","day","time"]),
        ("numerical", numeric_preprocessor,["tip","size"]),
    ])

In [42]:
pipe = Pipeline(
    steps = [("preprocessor",preprocessor),("regressor",RandomForestRegressor())])

In [43]:
from sklearn import set_config

In [44]:
set_config(display="diagram")

In [45]:
pipe

In [46]:
pipe.fit(X_train,y_train)

In [47]:
pipe.predict(X_test)

array([18.421     , 10.859245  , 25.4024    , 20.96855   , 32.0237    ,
       21.9265    , 21.03999333, 26.0033    , 43.2957    , 13.31833056,
       28.6803    , 20.9284    ,  9.2725    , 30.5668    , 19.57493333,
       34.4798    , 25.2558    , 26.6841    , 20.48674167, 14.22548   ,
       25.231     , 23.149575  , 17.9059    , 15.02761667, 18.9671    ,
       16.95148333, 29.8858    , 19.3762    , 19.82848333,  7.2588    ,
       20.0961    , 12.49004702, 10.859245  , 15.6291    , 12.49004702,
       17.3395    , 13.60926571, 31.0274    , 12.1367    , 18.095     ,
       35.9026    , 17.9579    , 20.9768    , 15.6328    , 12.418     ,
       20.136     , 19.577     , 17.5957    , 20.8064    ])

In [48]:
import warnings
warnings.filterwarnings('ignore')

In [49]:
## Hyperparameter Tuning
param_grid = {
    "regressor__n_estimators":[200,500],
    "regressor__max_features":["auto","sqrt","log2"],
    "regressor__max_depth":[4,5,6,7,8]
}

In [50]:
grid_search = GridSearchCV(pipe,param_grid = param_grid,n_jobs=-1)

In [51]:
grid_search.fit(X_train,y_train)

In [52]:
grid_search.best_params_

{'regressor__max_depth': 5,
 'regressor__max_features': 'auto',
 'regressor__n_estimators': 200}

In [53]:
pipe=Pipeline(
    [("preprocessor",preprocessor),("regressor",RandomForestRegressor(max_depth =6,
                                                                     max_features='sqrt',
                                                                     n_estimators=200))]

)

In [54]:
pipe.fit(X_train,y_train)

In [55]:
pipe.predict(X_test)

array([20.02049505, 14.67985887, 20.55151639, 17.21720332, 29.80153177,
       21.45589821, 16.40006669, 25.55698842, 30.09882207, 17.9159065 ,
       27.79943249, 20.38327841, 12.99175599, 28.27894831, 17.59940538,
       27.67652486, 25.78152953, 23.69626615, 21.64003535, 14.45711053,
       25.79757953, 22.829544  , 17.35644649, 16.79802897, 20.2213212 ,
       17.4430176 , 28.17071159, 21.81240056, 17.39034709, 13.65982371,
       22.26334894, 12.74121009, 14.67985887, 14.49605051, 12.74121009,
       19.53946784, 15.91835491, 29.61341251, 13.37428314, 18.26605386,
       30.74650224, 18.85792747, 20.30622183, 17.44881513, 17.86070291,
       19.71116434, 20.01002731, 16.93573217, 20.4010688 ])