In [1]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
num_features = ["age", "income"]
cat_features = ["gender", "city"]

In [3]:
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(handle_unknown='ignore')

In [5]:
preprocessor = ColumnTransformer(([
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features)
]))

In [6]:
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [7]:
param_grid = {
 'regressor__n_estimators': [50, 100, 200],
 'regressor__max_depth': [None, 10, 20],
 'regressor__min_samples_split': [2, 5]
}

In [8]:
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1000)

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [10]:
import pandas as pd

# Convert NumPy arrays to DataFrames with generic column names
X_train_df = pd.DataFrame(X_train, columns=[f'feature_{i}' for i in range(X_train.shape[1])])
X_test_df = pd.DataFrame(X_test, columns=[f'feature_{i}' for i in range(X_test.shape[1])])

# Update the feature names to match the DataFrame columns
num_features = [f'feature_{i}' for i in range(len(num_features))]
cat_features = [f'feature_{i}' for i in range(len(num_features), X_train.shape[1])]

# Recreate the preprocessor with updated feature names
preprocessor = ColumnTransformer(([
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features)
]))

# Recreate the pipeline with the updated preprocessor
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

grid_search = GridSearchCV(pipe, param_grid, cv=5,
scoring='r2', n_jobs=-1)
grid_search.fit(X_train_df, y_train)

In [11]:
print("Best Parameters:", grid_search.best_params_)
print("Best R2 Score:", grid_search.best_score_)

Best Parameters: {'regressor__max_depth': 10, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 200}
Best R2 Score: 0.2302293307078509
