## Random Forest Regressor With Pipeline And Hyperparameter Tuning

In [1]:
import seaborn as sns
df=sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [2]:
## independent and dependent feature
x = df.drop(labels=['total_bill'],axis=1)
y = df.total_bill

In [4]:
x.head()

Unnamed: 0,tip,sex,smoker,day,time,size
0,1.01,Female,No,Sun,Dinner,2
1,1.66,Male,No,Sun,Dinner,3
2,3.5,Male,No,Sun,Dinner,3
3,3.31,Male,No,Sun,Dinner,2
4,3.61,Female,No,Sun,Dinner,4


In [5]:
y.head()

0    16.99
1    10.34
2    21.01
3    23.68
4    24.59
Name: total_bill, dtype: float64

In [6]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.20,
                                                 random_state=42)

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer  ## Handel Missing Values
from sklearn.preprocessing import StandardScaler  ## Feature scaling
from sklearn.preprocessing import OneHotEncoder   ## Categorical to Numerical
from sklearn.compose import ColumnTransformer

In [8]:
categorical_cols = ['sex','smoker','day','time']
numerical_cols = ['tip','size']

In [13]:
## feature Engineering Automation (Create Pipeline)
## Numerical Pipelines
num_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')),   ##Missing Values
        ('scaler',StandardScaler())              ## feature Scaling
    ]
)

#categorical Pipeline

cat_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),   ##Hnadel Missing Values
        ('onehotenceder',OneHotEncoder())  ## Categorical features to numerical
    ]
)

In [14]:
preprocessor = ColumnTransformer([
    ('Numerical_pipe',num_pipeline,numerical_cols),
    ('Categorical_pipe',cat_pipeline,categorical_cols)
]
)
preprocessor

In [15]:
x_train = preprocessor.fit_transform(x_train)
x_test = preprocessor.transform(x_test)

In [16]:
x_train

array([[-0.2580329 , -0.61214068,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [-0.74211442, -0.61214068,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.6399734 , -0.61214068,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-1.46472887, -0.61214068,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.32426806, -0.61214068,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [-0.41237773,  0.45363997,  1.        , ...,  0.        ,
         1.        ,  0.        ]])

In [24]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [37]:
models = {
    'Random Forest Regressor' : RandomForestRegressor(),
    #'Logistic Regression' : LogisticRegression(),
    #'Decision Tree' : DecisionTreeClassifier()
}

In [27]:
from sklearn.metrics import r2_score

In [35]:
def evaluate_model(x_train,y_train,x_test,y_test,models):

    report = {}
    for i in range(len(models)):
        model = list(models.values())[i]
        # Train model
        model.fit(x_train,y_train)

        # Predict Testing data
        y_test_pred = model.predict(x_test)

        # Get accuracy for test data prediction

        test_model_score = r2_score(y_test,y_test_pred)

        report[list(models.keys())[i]] = test_model_score

    return report

In [38]:
evaluate_model(x_train,y_train,x_test,y_test,models)

{'Random Forest Regressor': 0.4857024716792352}

In [39]:
regrassor = RandomForestRegressor()

In [40]:
## Hypeparameter Tuning
params={'max_depth':[3,5,10,None],
              'n_estimators':[100,200,300],
               'criterion':['squared_error', 'absolute_error', 'friedman_mse', 'poisson']
              }

In [41]:
from sklearn.model_selection import RandomizedSearchCV

In [42]:
cv = RandomizedSearchCV(regrassor,param_distributions=params,
                        scoring='explained_variance',cv=5,verbose=3)
cv.fit(x_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END criterion=friedman_mse, max_depth=None, n_estimators=200;, score=0.328 total time=   0.3s
[CV 2/5] END criterion=friedman_mse, max_depth=None, n_estimators=200;, score=0.691 total time=   0.2s
[CV 3/5] END criterion=friedman_mse, max_depth=None, n_estimators=200;, score=0.582 total time=   0.2s
[CV 4/5] END criterion=friedman_mse, max_depth=None, n_estimators=200;, score=0.328 total time=   0.2s
[CV 5/5] END criterion=friedman_mse, max_depth=None, n_estimators=200;, score=0.075 total time=   0.2s
[CV 1/5] END criterion=squared_error, max_depth=10, n_estimators=200;, score=0.348 total time=   0.2s
[CV 2/5] END criterion=squared_error, max_depth=10, n_estimators=200;, score=0.677 total time=   0.2s
[CV 3/5] END criterion=squared_error, max_depth=10, n_estimators=200;, score=0.588 total time=   0.3s
[CV 4/5] END criterion=squared_error, max_depth=10, n_estimators=200;, score=0.341 total time=   0.2s
[CV 5/5] END cri

In [43]:
cv.best_params_

{'n_estimators': 200, 'max_depth': 5, 'criterion': 'poisson'}

In [44]:
regrassor = RandomForestRegressor(n_estimators=200,max_depth=5,
                                  criterion='poisson')

In [45]:
regrassor.fit(x_train,y_train)

In [46]:
y_pred = regrassor.predict(x_test)

In [47]:
r2_score(y_test,y_pred)

0.5674438943726507