<a href="https://colab.research.google.com/github/santiago2588/distillation_column_training/blob/main/Soluciones/05_gradient_boosting_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Data wrangling
import pandas as pd
import numpy as np

In [2]:
# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from tqdm import tqdm

In [3]:
# Models
from sklearn.ensemble import GradientBoostingRegressor

In [4]:
# Models Pipelines
from sklearn.pipeline import Pipeline

In [5]:
# Install the optuna library
!pip install optuna -q

In [6]:
#Hyperparameter optimization
import optuna

In [7]:
# Model evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [8]:
# Save model
import joblib

In [9]:
!wget https://raw.githubusercontent.com/santiago2588/distillation_column_training/main/data/transformed_data.csv -O transformed_data.csv

--2025-07-08 10:00:19--  https://raw.githubusercontent.com/santiago2588/distillation_column_training/main/data/transformed_data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12344 (12K) [text/plain]
Saving to: ‘transformed_data.csv’


2025-07-08 10:00:19 (18.0 MB/s) - ‘transformed_data.csv’ saved [12344/12344]



In [10]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv("transformed_data.csv")

# Display the first few rows to confirm it loaded correctly
df.head()

Unnamed: 0,PressureC1_diff,FlowC1,Temp1,Yield
0,0.0,432.0636,139.9857,69.400623
1,-9.9628,487.4029,131.047,66.532666
2,-0.0695,437.3516,118.2666,71.102193
3,0.2257,481.8314,118.1769,69.793481
4,-0.1678,412.6471,120.7891,71.489516


In [11]:
# Define features (X) and target variable (y)
X = df.drop('Yield', axis=1)  # Features (all columns except 'Job Offer')
y = df['Yield']  # Target variable

In [12]:
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Check the shape of the data
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

Training data shape: (202, 3)
Testing data shape: (51, 3)


In [13]:
def get_metrics(y_true, y_pred):

    dict_metrics = {
        'MSE': mean_squared_error(y_true, y_pred),
        'MAE': mean_absolute_error(y_true, y_pred),
        'R2 Score': r2_score(y_true, y_pred),

    }
    return dict_metrics

In [14]:
# Creating pipeline with Boosted Trees
pip_model_bt = Pipeline(steps=[
    ('model', GradientBoostingRegressor(random_state=2023))
])

# Fit pipeline
pip_model_bt.fit(X_train, y_train)

# Generate Predictions using the correctly fitted pipeline
y_pred = pip_model_bt.predict(X_test)

# Evaluate Metrics
metrics = get_metrics(y_test, y_pred)

# View Results
metrics

{'MSE': 0.5607937224219373,
 'MAE': 0.554537030979557,
 'R2 Score': 0.7250647125960571}

In [15]:
#Optimize hyperparameters with Optuna

# Define the objective function
def objective(trial):
    # Suggest hyperparameters to tune
    n_estimators = trial.suggest_int("n_estimators", 50, 500)
    max_depth = trial.suggest_int("max_depth", 2, 10)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
    subsample = trial.suggest_float("subsample", 0.5, 1.0)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 20)

    # Create the pipeline with the suggested hyperparameters
    pip_model_bt = Pipeline(steps=[
        ('model', GradientBoostingRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate,
            subsample=subsample,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=2023
        ))
    ])

    # Fit the pipeline
    pip_model_bt.fit(X_train, y_train)

    # Predict on the test set
    y_pred = pip_model_bt.predict(X_test)

    # Calculate the evaluation metric (MSE in this case)
    mse = mean_squared_error(y_test, y_pred)
    return mse  # Optuna minimizes the objective function by default

# Create a study object
study = optuna.create_study(direction="minimize")  # We want to minimize MSE

# Optimize the study
study.optimize(objective, n_trials=50)  # Run 50 trials (you can increase this for better results)

# Print the best hyperparameters
best_params = study.best_params
print("Best hyperparameters:", study.best_params)

[I 2025-07-08 10:00:20,053] A new study created in memory with name: no-name-178e0a0a-b293-42d5-ba68-6a0d087020e1
[I 2025-07-08 10:00:20,723] Trial 0 finished with value: 0.6542110087952264 and parameters: {'n_estimators': 500, 'max_depth': 6, 'learning_rate': 0.1115991825252721, 'subsample': 0.9345981942740367, 'min_samples_split': 13, 'min_samples_leaf': 18}. Best is trial 0 with value: 0.6542110087952264.
[I 2025-07-08 10:00:21,273] Trial 1 finished with value: 0.7003030386729351 and parameters: {'n_estimators': 333, 'max_depth': 3, 'learning_rate': 0.05778728715461497, 'subsample': 0.9297723063138956, 'min_samples_split': 5, 'min_samples_leaf': 15}. Best is trial 0 with value: 0.6542110087952264.
[I 2025-07-08 10:00:21,962] Trial 2 finished with value: 0.4572427090202615 and parameters: {'n_estimators': 454, 'max_depth': 7, 'learning_rate': 0.15306725274419578, 'subsample': 0.5053794003384944, 'min_samples_split': 7, 'min_samples_leaf': 4}. Best is trial 2 with value: 0.45724270902

Best hyperparameters: {'n_estimators': 236, 'max_depth': 7, 'learning_rate': 0.17033817772431828, 'subsample': 0.8526859258805682, 'min_samples_split': 14, 'min_samples_leaf': 3}


In [16]:
# Train the final model
final_model = GradientBoostingRegressor(
    n_estimators=best_params["n_estimators"],
    max_depth=best_params["max_depth"],
    learning_rate=best_params["learning_rate"],
    subsample=best_params["subsample"],
    min_samples_split=best_params["min_samples_split"],
    min_samples_leaf=best_params["min_samples_leaf"],
    random_state=2023
)

# Fit the final model
final_model.fit(X_train, y_train)

# Log metrics
y_pred = final_model.predict(X_test)
metrics = get_metrics(y_test, y_pred)

# Print metrics
print("Model Metrics:")
for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {metric_value}")

# Print best parameters
print("\nBest Hyperparameters:")
for param_name, param_value in best_params.items():
    print(f"{param_name}: {param_value}")

Model Metrics:
MSE: 0.3922943377911464
MAE: 0.4934977796945559
R2 Score: 0.8076733882794813

Best Hyperparameters:
n_estimators: 236
max_depth: 7
learning_rate: 0.17033817772431828
subsample: 0.8526859258805682
min_samples_split: 14
min_samples_leaf: 3


In [17]:
# Visualize the optimization history
optuna.visualization.plot_optimization_history(study).show()

In [18]:
# Visualize the parameter importance
optuna.visualization.plot_param_importances(study).show()

In [19]:
import ipywidgets as widgets
from IPython.display import display, Image, HTML

In [20]:
# Create Interactive Widgets
# ------------------------------------------
flowrate = widgets.IntSlider(
    description='Flowrate [m3/s]:',
    min=100, max=500, value=300, step=1, style={'description_width': 'initial'}
)
temperature = widgets.IntSlider(
    description='Temperature [C]:',
    min=100, max=200, value=130, step=1, style={'description_width': 'initial'}
)
pressure = widgets.IntSlider(
    description='Pressure diff [psi]:',
    min=-50, max=50, value=0, step=1, style={'description_width': 'initial'}
)
predict_button = widgets.Button(
    description='Predict Yield',
    button_style='success', # 'success', 'info', 'warning', 'danger' or ''
    icon='calculator'
)
# An output widget to display the results
output = widgets.Output()

In [21]:
# 3. Define the Prediction Logic
# ------------------------------------------
def run_prediction(pressure_val, flowrate_val, temperature_val):
    """Creates a DataFrame and runs the model prediction."""
    df_input = pd.DataFrame({
        'PressureC1_diff': [pressure_val],
        'FlowC1': [flowrate_val],
        'Temp1': [temperature_val]
    })
    prediction_result = final_model.predict(df_input)
    return prediction_result[0]


In [22]:
# Define what happens when the button is clicked
def on_predict_clicked(b):
    with output:
        output.clear_output() # Clear previous results
        result = run_prediction(pressure.value, flowrate.value, temperature.value)
        print(f"Predicted Yield: {result:.2f}")

In [23]:
# Link the button click event to the function
predict_button.on_click(on_predict_clicked)

In [24]:
# 4. Display the Widgets
# ------------------------------------------
# Arrange all widgets in a vertical box and display them
# Page title and image

display(HTML('<h1>Distillation column yield prediction</h1>'))

display(HTML("""
<p>This app aims to assist in predicting the yield in distillation columns by adjusting the sliders and clicking predict.</p>
"""))

ui = widgets.VBox([flowrate, temperature, pressure, predict_button, output])
display(ui)

VBox(children=(IntSlider(value=300, description='Flowrate [m3/s]:', max=500, min=100, style=SliderStyle(descri…