In [41]:
import yaml
import os

In [42]:
# Data wrangling
import pandas as pd
import numpy as np

In [43]:
# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from tqdm import tqdm

In [44]:
# Models
from sklearn.ensemble import GradientBoostingRegressor

In [45]:
# Models Pipelines
from sklearn.pipeline import Pipeline

In [46]:
#Hyperparameter optimization
import optuna

In [47]:
# Model evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [48]:
import mlflow
import mlflow.sklearn

In [49]:
mlflow.set_tracking_uri("file:./mlruns")
mlflow.set_experiment("distillation_tower")

<Experiment: artifact_location=('file:///c:/Users/mjkipsz2/OneDrive - The University of '
 'Manchester/Desktop/distillation_colum_yield/notebooks/mlruns/550582570200691247'), creation_time=1736352237514, experiment_id='550582570200691247', last_update_time=1736352237514, lifecycle_stage='active', name='distillation_tower', tags={}>

In [50]:
# Save model
import joblib

In [51]:
# Load the configuration file
with open('../config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# Access the settings
project_folder = config['projectFolder']
df_path = os.path.join(project_folder, config['transformedDataFile'])
df = pd.read_csv(df_path)

# Display the first few rows
df.head()

Unnamed: 0,PressureC1_diff,FlowC1,Temp1,Yield
0,0.0,432.0636,139.9857,69.400623
1,-9.9628,487.4029,131.047,66.532666
2,-0.0695,437.3516,118.2666,71.102193
3,0.2257,481.8314,118.1769,69.793481
4,-0.1678,412.6471,120.7891,71.489516


In [52]:
# Define features (X) and target variable (y)
X = df.drop('Yield', axis=1)  # Features (all columns except 'Job Offer')
y = df['Yield']  # Target variable

In [53]:
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Check the shape of the data
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

Training data shape: (202, 3)
Testing data shape: (51, 3)


In [54]:
def get_metrics(y_true, y_pred):

    dict_metrics = {
        'MSE': mean_squared_error(y_true, y_pred),
        'MAE': mean_absolute_error(y_true, y_pred),
        'R2 Score': r2_score(y_true, y_pred),
        
    }
    return dict_metrics

In [55]:
# Creating pipeline with Boosted Trees
pip_model_bt = Pipeline(steps=[
    ('model', GradientBoostingRegressor(random_state=2023))
])

# Fit pipeline
pip_model_bt.fit(X_train, y_train)

# Generate Predictions using the correctly fitted pipeline
y_pred = pip_model_bt.predict(X_test)

# Evaluate Metrics
metrics = get_metrics(y_test, y_pred)

# View Results
metrics

{'MSE': 0.5607937224219373,
 'MAE': 0.554537030979557,
 'R2 Score': 0.7250647125960571}

In [56]:
#Optimize hyperparameters with Optuna

# Define the objective function
def objective(trial):
    # Suggest hyperparameters to tune
    n_estimators = trial.suggest_int("n_estimators", 50, 500)
    max_depth = trial.suggest_int("max_depth", 2, 10)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
    subsample = trial.suggest_float("subsample", 0.5, 1.0)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 20)

    # Create the pipeline with the suggested hyperparameters
    pip_model_bt = Pipeline(steps=[
        ('model', GradientBoostingRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate,
            subsample=subsample,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=2023
        ))
    ])

    # Fit the pipeline
    pip_model_bt.fit(X_train, y_train)

    # Predict on the test set
    y_pred = pip_model_bt.predict(X_test)

    # Calculate the evaluation metric (MSE in this case)
    mse = mean_squared_error(y_test, y_pred)
    return mse  # Optuna minimizes the objective function by default

# Create a study object
study = optuna.create_study(direction="minimize")  # We want to minimize MSE

# Optimize the study
study.optimize(objective, n_trials=50)  # Run 50 trials (you can increase this for better results)

# Print the best hyperparameters
best_params = study.best_params
print("Best hyperparameters:", study.best_params)

[I 2025-01-09 16:07:19,798] A new study created in memory with name: no-name-62019d2f-ce3d-496a-88ee-2554675d2d85
[I 2025-01-09 16:07:19,971] Trial 0 finished with value: 0.663999864126606 and parameters: {'n_estimators': 253, 'max_depth': 3, 'learning_rate': 0.08048749212128051, 'subsample': 0.8656064018113124, 'min_samples_split': 13, 'min_samples_leaf': 14}. Best is trial 0 with value: 0.663999864126606.
[I 2025-01-09 16:07:20,114] Trial 1 finished with value: 0.7275764291272345 and parameters: {'n_estimators': 228, 'max_depth': 3, 'learning_rate': 0.0150007964781999, 'subsample': 0.9075488927550652, 'min_samples_split': 9, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.663999864126606.
[I 2025-01-09 16:07:20,220] Trial 2 finished with value: 0.9294199078669626 and parameters: {'n_estimators': 176, 'max_depth': 3, 'learning_rate': 0.013955900394825966, 'subsample': 0.7793362841737754, 'min_samples_split': 3, 'min_samples_leaf': 9}. Best is trial 0 with value: 0.66399986412660

Best hyperparameters: {'n_estimators': 478, 'max_depth': 5, 'learning_rate': 0.08774254593744789, 'subsample': 0.813261893309714, 'min_samples_split': 13, 'min_samples_leaf': 2}


In [57]:
# Start an MLflow run with a specific run name
with mlflow.start_run(run_name="gb_regression_run"):
    # Train the final model
    final_model = GradientBoostingRegressor(
        n_estimators=best_params["n_estimators"],
        max_depth=best_params["max_depth"],
        learning_rate=best_params["learning_rate"],
        subsample=best_params["subsample"],
        min_samples_split=best_params["min_samples_split"],
        min_samples_leaf=best_params["min_samples_leaf"],
        random_state=2023
    )

    # Fit the final model
    final_model.fit(X_train, y_train)

    # Add model signature and input example
    signature = mlflow.models.infer_signature(X_train, y_train)
    input_example = X_train.iloc[:5]

    # Log metrics
    y_pred = final_model.predict(X_test)
    metrics = get_metrics(y_test, y_pred)
    mlflow.log_metrics(metrics)

    # Log parameters
    mlflow.log_params(best_params)

    # Log the model with signature and input example
    mlflow.sklearn.log_model(
        final_model, 
        "gradient_boosting_model",
        signature=signature,
        input_example=input_example
    )

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

In [58]:
# Visualize the optimization history
optuna.visualization.plot_optimization_history(study).show()

In [59]:
# Visualize the parameter importance
optuna.visualization.plot_param_importances(study).show()

In [60]:
joblib.dump(final_model, 'C:/Users/mjkipsz2/OneDrive - The University of Manchester/Desktop/distillation_colum_yield/model/final_model.joblib')

['C:/Users/mjkipsz2/OneDrive - The University of Manchester/Desktop/distillation_colum_yield/model/final_model.joblib']