# Random Forest

The main purpose of this file serves to run hyperparamter tuning to find the best settings for the chosen model, which will then be transferred onto the main `models.ipynb` file.

In [24]:
# IMPORTS
from utils import *
from sklearn.model_selection import cross_validate, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier



### Data

In [25]:
# READ AND SPLIT DATA
df = pd.read_pickle("../../datasets/pickle/processed_action_movie_data.pkl")

X, y = df.drop("rating", axis=1), df['rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = catboost_encoding(X_train, X_test, y_train, y_test)

In [26]:
param_grid = {
    'n_estimators': np.arange(100, 1000, 100),
    'max_depth': [3, 5, 7, 9, 11, None],
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': np.arange(2, 11),
    'min_samples_leaf': np.arange(1, 11)
}

In [None]:
# Create a kNN classifier object
cla = RandomForestClassifier()

# Create a GridSearchCV object
grid_search = RandomizedSearchCV(estimator = cla, param_distributions = param_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1, scoring="f1_macro")

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

In [28]:
# Print the best parameters and score
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

Best parameters:  {'n_estimators': 600, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': None}
Best score:  0.6030858550359021
