# Random forest

In [2]:
from itertools import product

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the pre-split batch-1 arrays from disk, one (features, target) pair
# per split. Paths are relative to the notebook's working directory.
X_train, y_train = (
    np.load("data/regression_data/X_train_batch1.npy"),
    np.load("data/regression_data/y1_train_batch1.npy"),
)
X_val, y_val = (
    np.load("data/regression_data/X_val_batch1.npy"),
    np.load("data/regression_data/y1_val_batch1.npy"),
)
X_test, y_test = (
    np.load("data/regression_data/X_test_batch1.npy"),
    np.load("data/regression_data/y1_test_batch1.npy"),
)

In [4]:
# Remove the third-from-last feature column (index -3 along axis 1) from
# every split so train, validation and test keep the same feature layout.
# NOTE(review): the notebook does not record why this column is dropped.
# NOTE: this rebinds X_train / X_val / X_test to their own reduced copies, so
# running the cell a second time without re-running the load cell removes a
# further column.
X_train, X_val, X_test = (
    np.delete(split, -3, axis=1) for split in (X_train, X_val, X_test)
)

### Ridge regression

In [None]:
# Ridge baseline: choose the regularization strength on the validation set,
# then report test-set error for the selected value.
# NOTE(review): the features are passed to Ridge as loaded (StandardScaler is
# imported but not applied here) -- confirm the arrays are already scaled,
# since the L2 penalty depends on feature scale.
alphas = [0.01, 0.1, 1, 10, 100]  # candidate regularization strengths
best_alpha = None
best_mse = float("inf")

for alpha in alphas:
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train, y_train)
    val_mse = mean_squared_error(y_val, ridge.predict(X_val))

    # Strict comparison: on a tie the earlier (smaller) alpha is kept
    if val_mse < best_mse:
        best_alpha, best_mse = alpha, val_mse

# Refit on the training split with the selected alpha
final_model = Ridge(alpha=best_alpha)
final_model.fit(X_train, y_train)

# Held-out test metrics for the selected model
y_pred = final_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test,y_pred)

print(f"Best Alpha: {best_alpha}")
print(f"Test MSE: {test_mse:.4f}")
print(f"Test r^2: {r2}")

### Random forest

1. Simple random forest using only the training and test sets

In [None]:
# Untuned random-forest baseline: fit on the training split and score directly
# on the test split (the validation split is not used in this cell).
# RandomForestRegressor, mean_squared_error and r2_score come from the import
# cell at the top of the notebook, so they are not re-imported here.

# 150 trees; the fixed seed makes the fitted forest reproducible
rf_model = RandomForestRegressor(n_estimators=150, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_model.predict(X_test)

# Calculate and print MSE and R²
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Test MSE: {mse}")
print(f"Test R²: {r2}")

2. Random forest with hyperparameter tuning via 5-fold cross-validated grid search

In [None]:
# Cross-validated grid search over random-forest hyperparameters.
# GridSearchCV and RandomForestRegressor come from the import cell at the top
# of the notebook, so they are not re-imported here.

# Base estimator; the fixed seed keeps every candidate forest reproducible
rf_model = RandomForestRegressor(random_state=42)

# 3 * 4 * 3 * 3 = 108 candidate combinations
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Model selection here uses 5-fold cross-validation on the training split
# only; the separate validation split takes no part in the search.
# Scoring is set explicitly to (negated) MSE so candidates are ranked by the
# same metric as the other searches in this notebook, instead of the
# regressor's default R² score.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
    verbose=1,  # summary line only; verbose=2 logs every one of the 540 fits
)
grid_search.fit(X_train, y_train)

print(f"Best parameters found: {grid_search.best_params_}")
# best_score_ is the negated MSE, so flip the sign for reporting
print(f"Best CV MSE: {-grid_search.best_score_}")

# With the default refit=True, best_estimator_ has already been refit on the
# whole training split using the best hyperparameters.
best_rf_model = grid_search.best_estimator_

# Score the selected model on the validation split, which the search never saw
y_val_pred = best_rf_model.predict(X_val)
print(f"Validation MSE: {mean_squared_error(y_val, y_val_pred)}")
print(f"Validation R²: {r2_score(y_val, y_val_pred)}")

3. Random forest with manual hyperparameter tuning using the validation set

In [5]:
print("\n### Running Manual Hyperparameter Tuning Using Validation Set for Random Forest ###")

# Exhaustive search: every combination is fit on the training split and
# scored by MSE on the validation split; the test split is touched only once,
# at the end, for the selected model.

# Candidate values for each hyperparameter (3 * 4 * 3 * 3 = 108 combinations)
n_estimators = [50, 100, 200]
max_depth = [5, 10, 20, None]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]

best_n_estimators = None
best_max_depth = None
best_min_samples_split = None
best_min_samples_leaf = None
best_mse = float("inf")
final_model_rf = None

# product() walks the grid in the same order as four nested loops would
# (rightmost list varies fastest), so tie-breaking is unaffected.
for n, depth, split, leaf in product(
    n_estimators, max_depth, min_samples_split, min_samples_leaf
):
    model = RandomForestRegressor(
        n_estimators=n,
        max_depth=depth,
        min_samples_split=split,
        min_samples_leaf=leaf,
        random_state=42,
    )
    model.fit(X_train, y_train)
    val_pred = model.predict(X_val)
    mse = mean_squared_error(y_val, val_pred)

    # Strict comparison: on a tie the earlier combination is kept
    if mse < best_mse:
        best_mse = mse
        best_n_estimators = n
        best_max_depth = depth
        best_min_samples_split = split
        best_min_samples_leaf = leaf
        # Keep the fitted winner. Refitting with the same hyperparameters,
        # seed and training data would rebuild the same forest, so a
        # separate final training pass is unnecessary.
        final_model_rf = model

if final_model_rf is None:
    raise RuntimeError("No hyperparameter combination improved on the initial validation MSE")

print(f"Best n_estimators (Validation Set): {best_n_estimators}")
print(f"Best max_depth (Validation Set): {best_max_depth}")
print(f"Best min_samples_split (Validation Set): {best_min_samples_split}")
print(f"Best min_samples_leaf (Validation Set): {best_min_samples_leaf}")
print(f"Best Validation MSE (Manual Search): {best_mse}")

# Evaluate the selected model on the test set
test_pred_rf = final_model_rf.predict(X_test)
test_mse_rf = mean_squared_error(y_test, test_pred_rf)
test_r2_rf = r2_score(y_test, test_pred_rf)

print("Test MSE (Manual Search with Random Forest):", test_mse_rf)
print("Test R2 Score (Manual Search with Random Forest):", test_r2_rf)


### Running Manual Hyperparameter Tuning Using Validation Set for Random Forest ###
Best n_estimators (Validation Set): 200
Best max_depth (Validation Set): 20
Best min_samples_split (Validation Set): 10
Best min_samples_leaf (Validation Set): 4
Best Validation MSE (Manual Search): 0.20897156577162904
Test MSE (Manual Search with Random Forest): 0.18908576852427333
Test R2 Score (Manual Search with Random Forest): 0.18679060702897998


In [13]:
# Training-split metrics for the selected forest, to compare against the
# validation and test figures.
train_pred_rf = final_model_rf.predict(X_train)
train_r2_rf = r2_score(y_train, train_pred_rf)
train_mse_rf = mean_squared_error(y_train, train_pred_rf)

# Printed unlabeled, one value per line: R² first, then MSE
print(train_r2_rf, train_mse_rf, sep="\n")

0.20028978762412475
0.21072677951292806


In [15]:
# Validation-split metrics for the selected forest.
val_pred_rf = final_model_rf.predict(X_val)
val_r2_rf = r2_score(y_val, val_pred_rf)
val_mse_rf = mean_squared_error(y_val, val_pred_rf)

# Printed unlabeled, one value per line: MSE first, then R²
print(val_mse_rf, val_r2_rf, sep="\n")


0.20897156577162904
0.17804271253845716
