In [1]:
import yaml
import os

In [2]:
# Data wrangling
import pandas as pd
import numpy as np

In [3]:
# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from tqdm import tqdm

In [4]:
# Models
from xgboost import XGBRegressor

In [5]:
# Models Pipelines
from sklearn.pipeline import Pipeline

In [6]:
#Hyperparameter optimization
import optuna

In [7]:
# Model evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [8]:
import mlflow
import mlflow.sklearn

In [9]:
mlflow.set_tracking_uri("file:./mlruns")
mlflow.set_experiment("distillation_tower")

<Experiment: artifact_location=('file:///c:/Users/mjkipsz2/OneDrive - The University of '
 'Manchester/Desktop/distillation_colum_yield/notebooks/mlruns/550582570200691247'), creation_time=1736352237514, experiment_id='550582570200691247', last_update_time=1736352237514, lifecycle_stage='active', name='distillation_tower', tags={}>

In [10]:
# Save model
import joblib

In [11]:
# Load the configuration file
with open('../config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# Access the settings
project_folder = config['projectFolder']
df_path = os.path.join(project_folder, config['transformedDataFile'])
df = pd.read_csv(df_path)

# Display the first few rows
df.head()

Unnamed: 0,PressureC1_diff,FlowC1,Temp1,Yield
0,0.0,432.0636,139.9857,69.400623
1,-9.9628,487.4029,131.047,66.532666
2,-0.0695,437.3516,118.2666,71.102193
3,0.2257,481.8314,118.1769,69.793481
4,-0.1678,412.6471,120.7891,71.489516


In [12]:
# Define features (X) and target variable (y)
X = df.drop('Yield', axis=1)  # Features (all columns except 'Job Offer')
y = df['Yield']  # Target variable

In [13]:
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Check the shape of the data
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

Training data shape: (202, 3)
Testing data shape: (51, 3)


In [14]:
def get_metrics(y_true, y_pred):

    dict_metrics = {
        'MSE': mean_squared_error(y_true, y_pred),
        'MAE': mean_absolute_error(y_true, y_pred),
        'R2 Score': r2_score(y_true, y_pred),
        
    }
    return dict_metrics

In [15]:
# Create model with XGBRegressor
pip_model_xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5)

# Fit pipeline
pip_model_xgb.fit(X_train, y_train)

# Generate predictions
y_pred = pip_model_xgb.predict(X_test)

# Evaluate metrics
metrics = get_metrics(y_test, y_pred)

# View results
print(metrics)

{'MSE': 0.5043979371243295, 'MAE': 0.5494068029021147, 'R2 Score': 0.7527133663152991}


In [16]:
#Hyperparameter optimization with Optuna

# Define the objective function
def objective(trial):
    # Define the hyperparameter search space
    param = {
        "verbosity": 0,
        "objective": "reg:squarederror",  # For regression tasks
        "booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
        "lambda": trial.suggest_float("lambda", 1e-3, 10.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-3, 10.0, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "eta": trial.suggest_float("eta", 1e-3, 0.3, log=True),
        "gamma": trial.suggest_float("gamma", 1e-3, 10.0, log=True),
        "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
    }
    
    # Create and train model
    model = XGBRegressor(**param)
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Calculate error (you can change this to other metrics)
    mse = mean_squared_error(y_test, y_pred)
    return mse

# Create and run study
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42)
)

# Add this to suppress Optuna logging
optuna.logging.set_verbosity(optuna.logging.WARNING)

# When optimizing, disable progress bar if desired
study.optimize(objective, n_trials=50, show_progress_bar=False)

# Train final model with best parameters
best_params = study.best_trial.params
print("Best hyperparameters:", study.best_params)

[I 2025-01-09 15:42:59,751] A new study created in memory with name: no-name-a4ca711a-ba74-4a6a-a906-b99c42e0c220


Best hyperparameters: {'booster': 'dart', 'lambda': 0.3406402858996535, 'alpha': 0.027385313541678342, 'max_depth': 8, 'eta': 0.10512407216536185, 'gamma': 0.28213315233843356, 'grow_policy': 'depthwise', 'subsample': 0.5307212143780551, 'colsample_bytree': 0.9683698056343071, 'min_child_weight': 4}


In [17]:
# Start an MLflow run with a specific run name
with mlflow.start_run(run_name="xgb_regression_run"):
    # Train the final model
    final_model = XGBRegressor(**best_params)
    final_model.fit(X_train, y_train)

    # Add model signature and input example
    signature = mlflow.models.infer_signature(X_train, y_train)
    input_example = X_train.iloc[:5]

    # Log metrics
    y_pred = final_model.predict(X_test)
    metrics = get_metrics(y_test, y_pred)
    mlflow.log_metrics(metrics)

    # Log parameters
    mlflow.log_params(best_params)

    # Log the model with signature and input example
    mlflow.sklearn.log_model(
        final_model, 
        "xg_boost_model",
        signature=signature,
        input_example=input_example
    )

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

In [18]:
# Visualize the optimization history
optuna.visualization.plot_optimization_history(study).show()

In [19]:
# Visualize parameter importance
optuna.visualization.plot_param_importances(study).show()

In [20]:
#joblib.dump(tuned_model, 'CIC_hybrid_no_outliers_Norsok_scikit.joblib')