<a href="https://colab.research.google.com/github/santiago2588/distillation_column_training/blob/main/Soluciones_colab/04_random_forest_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Data wrangling
import pandas as pd
import numpy as np

In [2]:
# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from tqdm import tqdm

In [3]:
# Models
from sklearn.ensemble import RandomForestRegressor

In [6]:
# Install the optuna library
!pip install optuna -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/395.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━[0m [32m368.6/395.9 kB[0m [31m11.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/242.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.7/242.7 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [7]:
#Hyperparameter optimization
import optuna

In [8]:
# Models Pipelines
from sklearn.pipeline import Pipeline

In [9]:
# Model evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [12]:
# Save model
import joblib

In [13]:
!wget https://raw.githubusercontent.com/santiago2588/distillation_column_training/main/data/transformed_data.csv -O transformed_data.csv

--2025-07-07 13:53:20--  https://raw.githubusercontent.com/santiago2588/distillation_column_training/main/data/transformed_data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12344 (12K) [text/plain]
Saving to: ‘transformed_data.csv’


2025-07-07 13:53:20 (16.2 MB/s) - ‘transformed_data.csv’ saved [12344/12344]



In [None]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv("transformed_data.csv")

# Display the first few rows to confirm it loaded correctly
df.head()

Unnamed: 0,PressureC1_diff,FlowC1,Temp1,Yield
0,0.0,432.0636,139.9857,69.400623
1,-9.9628,487.4029,131.047,66.532666
2,-0.0695,437.3516,118.2666,71.102193
3,0.2257,481.8314,118.1769,69.793481
4,-0.1678,412.6471,120.7891,71.489516


In [15]:
# Define features (X) and target variable (y)
X = df.drop('Yield', axis=1)  # Features (all columns except 'Job Offer')
y = df['Yield']  # Target variable

In [16]:
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Check the shape of the data
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

Training data shape: (202, 3)
Testing data shape: (51, 3)


In [17]:
def get_metrics(y_true, y_pred):

    dict_metrics = {
        'MSE': mean_squared_error(y_true, y_pred),
        'MAE': mean_absolute_error(y_true, y_pred),
        'R2 Score': r2_score(y_true, y_pred),

    }
    return dict_metrics

In [18]:
# Creating pipeline with Random Forest
pip_model_rf = Pipeline(steps=[
    ('model', RandomForestRegressor(random_state=2023))
])

# Fit pipeline with PCA
pip_model_rf.fit(X_train, y_train)

# Generate Predictions using the correctly fitted pipeline
y_pred = pip_model_rf.predict(X_test)

# Evaluate Metrics
metrics = get_metrics(y_test, y_pred)

# View Results
metrics

{'MSE': 0.5588840302677203,
 'MAE': 0.5822903208509888,
 'R2 Score': 0.7260009601685249}

In [19]:
#Hyperparameters optimization with Optuna

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 5, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)

    # Create a pipeline with the suggested hyperparameters
    pip_model_rf = Pipeline(steps=[
        ('model', RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=2023
        ))
    ])

    # Fit the pipeline
    pip_model_rf.fit(X_train, y_train)

    # Predict on the validation set
    y_pred = pip_model_rf.predict(X_test)

    # Calculate the mean squared error
    mse = mean_squared_error(y_test, y_pred)
    return mse

# Create an Optuna study
study = optuna.create_study(direction='minimize')  # Minimize MSE
study.optimize(objective, n_trials=50)  # Run 50 trials

# Get the best hyperparameters
best_params = study.best_params
print("Best Parameters:", best_params)


[I 2025-07-07 13:54:16,812] A new study created in memory with name: no-name-87f62cb2-3c4a-45d8-8904-f639321da9b3
[I 2025-07-07 13:54:17,440] Trial 0 finished with value: 0.8520682015286412 and parameters: {'n_estimators': 159, 'max_depth': 18, 'min_samples_split': 19, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.8520682015286412.
[I 2025-07-07 13:54:17,564] Trial 1 finished with value: 1.0190996928209672 and parameters: {'n_estimators': 80, 'max_depth': 37, 'min_samples_split': 7, 'min_samples_leaf': 10}. Best is trial 0 with value: 0.8520682015286412.
[I 2025-07-07 13:54:17,659] Trial 2 finished with value: 0.645820019699594 and parameters: {'n_estimators': 53, 'max_depth': 11, 'min_samples_split': 8, 'min_samples_leaf': 2}. Best is trial 2 with value: 0.645820019699594.
[I 2025-07-07 13:54:17,790] Trial 3 finished with value: 0.737174861150199 and parameters: {'n_estimators': 76, 'max_depth': 33, 'min_samples_split': 8, 'min_samples_leaf': 4}. Best is trial 2 with value: 0.

Best Parameters: {'n_estimators': 298, 'max_depth': 41, 'min_samples_split': 3, 'min_samples_leaf': 1}


In [21]:
# Train the final model
final_model = RandomForestRegressor(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    random_state=2023
)
final_model.fit(X_train, y_train)

# Log metrics
y_pred = final_model.predict(X_test)
metrics = get_metrics(y_test, y_pred)

# Print metrics
print("Model Metrics:")
for metric, value in metrics.items():
    print(f"{metric}: {value}")

# Print best parameters
print("\nBest Parameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")


Model Metrics:
MSE: 0.5378536099916119
MAE: 0.5706347047702236
R2 Score: 0.7363113549030922

Best Parameters:
n_estimators: 298
max_depth: 41
min_samples_split: 3
min_samples_leaf: 1


In [22]:
# Visualize the optimization history
optuna.visualization.plot_optimization_history(study).show()

In [23]:
# Visualize the parameter importance
optuna.visualization.plot_param_importances(study).show()