# Random Forest with Hyperparameter Tuning

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

# Load the training and test sets from the local data directory.
train_data = pd.read_csv('geckoq_data/train.csv')
test_data = pd.read_csv('geckoq_data/test.csv')

# Training features: everything except the identifier and the target column,
# with non-numeric columns expanded into indicator columns by get_dummies.
X_train = train_data.drop(columns=['ID','log_pSat_Pa'])
X_train = pd.get_dummies(X_train)
y_train = train_data['log_pSat_Pa']

# Test features: same treatment, but there is no target column to drop.
X_test = test_data.drop(columns=['ID'])
X_test = pd.get_dummies(X_test)

# Because get_dummies runs separately on each frame, the two sets of indicator
# columns can differ. Align the test frame to the training columns in a single
# step: columns missing from the test set are added and filled with 0, columns
# unknown to the training set are dropped, and the training column order is
# used. Doing this with one reindex avoids inserting columns one at a time.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Base estimator; random_state is fixed so repeated runs use the same seed
model = RandomForestRegressor(random_state=15)

# Candidate values for every hyperparameter explored by the grid search
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

# Exhaustive search over the grid: 3-fold cross-validation, one job,
# verbose progress output, candidates ranked by negative mean squared error
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=1,
    verbose=2,
)

# Run the search on the training data
grid_search.fit(X_train, y_train)

# Report the hyperparameter combination selected by the search
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# Take the estimator the search exposes for that combination
# (this line only reads the attribute; it does not start a new fit)
best_model = grid_search.best_estimator_

# Generate predictions for the aligned test features
predict = best_model.predict(X_test)

# Build the submission table: test IDs next to the predicted target
results = test_data[['ID']].assign(TARGET=predict)

# Best settings so far: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
# Write the submission file without the index column
results.to_csv('rf_submission.csv', index=False)