<a href="https://colab.research.google.com/github/santiago2588/distillation_column_training/blob/main/Soluciones_colab/06_lgbm_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Data wrangling
import pandas as pd
import numpy as np

In [2]:
# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from tqdm import tqdm

In [3]:
# Models
from lightgbm import LGBMRegressor

In [4]:
# Models Pipelines
from sklearn.pipeline import Pipeline

In [5]:
# Install the optuna library
!pip install optuna -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/395.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/242.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.7/242.7 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [6]:
#Hyperparameter optimization
import optuna

In [7]:
# Model evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_error

In [8]:
# Save model
import joblib

In [9]:
!wget https://raw.githubusercontent.com/santiago2588/distillation_column_training/main/data/transformed_data.csv -O transformed_data.csv

--2025-07-07 14:02:57--  https://raw.githubusercontent.com/santiago2588/distillation_column_training/main/data/transformed_data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12344 (12K) [text/plain]
Saving to: ‘transformed_data.csv’


2025-07-07 14:02:57 (82.6 MB/s) - ‘transformed_data.csv’ saved [12344/12344]



In [10]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv("transformed_data.csv")

# Display the first few rows to confirm it loaded correctly
df.head()

Unnamed: 0,PressureC1_diff,FlowC1,Temp1,Yield
0,0.0,432.0636,139.9857,69.400623
1,-9.9628,487.4029,131.047,66.532666
2,-0.0695,437.3516,118.2666,71.102193
3,0.2257,481.8314,118.1769,69.793481
4,-0.1678,412.6471,120.7891,71.489516


In [11]:
# Define features (X) and target variable (y)
X = df.drop('Yield', axis=1)  # Features (all columns except 'Job Offer')
y = df['Yield']  # Target variable

In [12]:
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Check the shape of the data
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

Training data shape: (202, 3)
Testing data shape: (51, 3)


In [13]:
def get_metrics(y_true, y_pred):

    dict_metrics = {
        'MSE': mean_squared_error(y_true, y_pred),
        'MAE': mean_absolute_error(y_true, y_pred),
        'R2 Score': r2_score(y_true, y_pred),

    }
    return dict_metrics

In [14]:
# Creating pipeline with LGBM Regressor
pip_model_lgbm = Pipeline(steps=[
    ('model', LGBMRegressor(random_state=2023,force_col_wise=True))
])

# Fit pipeline
pip_model_lgbm.fit(X_train, y_train)

# Generate Predictions using the correctly fitted pipeline
y_pred = pip_model_lgbm.predict(X_test)

# Evaluate Metrics
metrics = get_metrics(y_test, y_pred)

# View Results
metrics

[LightGBM] [Info] Total Bins 202
[LightGBM] [Info] Number of data points in the train set: 202, number of used features: 3
[LightGBM] [Info] Start training from score 69.521861


{'MSE': 0.8304275732374204,
 'MAE': 0.6985192486419486,
 'R2 Score': 0.5928737530617233}

In [15]:
#Hyperparameter optimization with Optuna

# Define the objective function
def objective(trial):
    # Define the hyperparameter search space
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 50),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'random_state': 42,
        'force_col_wise': True
    }

    # Create and train model
    model = LGBMRegressor(**params)
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate error (you can change this to other metrics)
    mse = mean_squared_error(y_test, y_pred)
    return mse

# Create and run study
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42)
)

# Add this to suppress Optuna logging
optuna.logging.set_verbosity(optuna.logging.WARNING)

# When optimizing, disable progress bar if desired
study.optimize(objective, n_trials=50, show_progress_bar=False)

# Train final model with best parameters
best_params = study.best_trial.params
print("Best hyperparameters:", study.best_params)

[I 2025-07-07 14:03:23,014] A new study created in memory with name: no-name-0c8e4e43-0620-4fde-9d51-22213e7e10d0


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Total Bins 202
[LightGBM] [Info] Number of data points in the train set: 202, number of used features: 3
[LightGBM] [Info] Start training from score 69.521861
[LightGBM] [Info] Total Bins 202
[LightGBM] [Info] Number of data points in the train set: 202, number of used features: 3
[LightGBM] [Info] Start training from score 69.521861
[LightGBM] [Info] Total Bins 202
[LightGBM] [Info] Number of data points in the train set: 202, number of used features: 3
[LightGBM] [Info] Start training from score 69.521861
[LightGBM] [Info] Total Bins 202
[LightGBM] [Info] Number of data points in the train set: 202, number of used features: 3
[LightGBM] [Info] Start training from score 69.521861
[LightGBM] [Info] Total Bins 202
[LightGBM] [Info] Number of data points in the train set: 202, number of used features: 3
[LightGBM] [Info] Start training from score 69.521861
[LightGBM] [Info] Total Bins 202
[LightGBM] [Info]

In [None]:
# Train final model with best parameters
final_model = LGBMRegressor(**best_params)

final_model.fit(X_train, y_train)

# Log metrics
y_pred = final_model.predict(X_test)
metrics = get_metrics(y_test, y_pred)

# View Results
"Metrics:", metrics
"Best hyperparameters:", best_params

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000046 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 202
[LightGBM] [Info] Number of data points in the train set: 202, number of used features: 3
[LightGBM] [Info] Start training from score 69.521861
Metrics: {'MSE': 0.4362450569448629, 'MAE': 0.5171736592632078, 'R2 Score': 0.7861260650499152}
Best hyperparameters: {'n_estimators': 649, 'max_depth': 6, 'learning_rate': 0.15824919610227742, 'num_leaves': 56, 'min_child_samples': 3, 'subsample': 0.6672067440050428, 'colsample_bytree': 0.9081366331108418}


In [17]:
# Visualize the optimization history
optuna.visualization.plot_optimization_history(study).show()

In [18]:
# Visualize parameter importance
optuna.visualization.plot_param_importances(study).show()