In [7]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Load the training features and target from the processed-data folder.
X_train = pd.read_csv('../data/processed/X_train_scaled.csv')
y_train = pd.read_csv('../data/processed/y_train.csv')

# Make sure the output directory exists *before* the expensive search starts,
# so a missing ../models folder cannot raise FileNotFoundError only after
# every fit has already finished.
models_dir = Path('../models')
models_dir.mkdir(parents=True, exist_ok=True)

# Hyperparameter grid: 3 * 4 * 3 * 3 = 108 combinations. With cv=5 below that
# is 540 fits, plus GridSearchCV's default refit of the best configuration.
# NOTE(review): with min_samples_leaf=4 every splittable node already holds at
# least 8 samples, so min_samples_split values of 2 and 5 should produce
# identical forests (the exactly tied scores in the recorded results are
# consistent with this). Consider pruning redundant combinations.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Base estimator; the fixed seed keeps every candidate forest reproducible.
rf = RandomForestRegressor(random_state=42)

# Exhaustive search over param_grid, scored by R^2 with 5-fold
# cross-validation, using all available CPU cores (n_jobs=-1).
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    scoring='r2'
)

# read_csv returns the target as a DataFrame; flatten it to the 1-D array
# that fit() expects (assumes y_train.csv has a single target column).
grid_search.fit(X_train, y_train.values.ravel())

# Persist the winning hyperparameters so they can be reloaded for the final
# model without re-running the search.
with open(models_dir / 'best_params.pkl', 'wb') as f:
    pickle.dump(grid_search.best_params_, f)

In [8]:
# Headline result: the winning hyperparameters and their mean
# cross-validated score.
print("Best parameters:", grid_search.best_params_)
print(f"Best cross-validation score: {grid_search.best_score_:.3f}")

# Tabulate every evaluated combination, keeping only the mean/std of the
# test-fold score and the parameter dict, ranked best-first.
results = pd.DataFrame(grid_search.cv_results_)
important_columns = ['mean_test_score', 'std_test_score', 'params']
results_summary = results.loc[:, important_columns].sort_values(
    by='mean_test_score',
    ascending=False,
)

# Show the five highest-scoring combinations.
print("\nTop 5 parameter combinations:")
print(results_summary.head(5))

Best parameters: {'max_depth': 30, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 300}
Best cross-validation score: 0.194

Top 5 parameter combinations:
     mean_test_score  std_test_score  \
77          0.194121        0.053552   
104         0.194121        0.053552   
101         0.194121        0.053552   
74          0.194121        0.053552   
47          0.194114        0.053532   

                                                params  
77   {'max_depth': 30, 'min_samples_leaf': 4, 'min_...  
104  {'max_depth': None, 'min_samples_leaf': 4, 'mi...  
101  {'max_depth': None, 'min_samples_leaf': 4, 'mi...  
74   {'max_depth': 30, 'min_samples_leaf': 4, 'min_...  
47   {'max_depth': 20, 'min_samples_leaf': 4, 'min_...  


In [5]:
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestRegressor

# Training features and target from the processed-data folder.
X_train = pd.read_csv('../data/processed/X_train_scaled.csv')
y_train = pd.read_csv('../data/processed/y_train.csv')

# Hyperparameters previously pickled to ../models/best_params.pkl.
with open('../models/best_params.pkl', 'rb') as params_file:
    best_params = pickle.load(params_file)

# Fit one forest on the full training set with those hyperparameters.
# The target DataFrame is flattened to 1-D for fit() (assumes a single
# target column in y_train.csv).
rf_model = RandomForestRegressor(random_state=42, **best_params)
rf_model.fit(X_train, y_train.to_numpy().ravel())

# Persist the fitted model.
# NOTE(review): the file is called "gbr_model.pkl" although it stores a
# RandomForestRegressor; the name is kept as-is because other code may load
# this exact path. Confirm and rename consistently if nothing else does.
with open('../models/gbr_model.pkl', 'wb') as model_file:
    pickle.dump(rf_model, model_file)