In [1]:
import yaml
import os

In [2]:
# Data wrangling
import pandas as pd
import numpy as np

In [3]:
# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from tqdm import tqdm

In [4]:
# Models
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [5]:
# Models Pipelines
from sklearn.pipeline import Pipeline

In [6]:
# Model evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [7]:
import mlflow
import mlflow.sklearn

In [8]:
mlflow.set_tracking_uri("file:./mlruns")
mlflow.set_experiment("mining")

<Experiment: artifact_location=('file:c:/Users/mjkipsz2/OneDrive - The University of '
 'Manchester/Desktop/mining_dataset_training/notebooks/mlruns/180358635567956149'), creation_time=1755171885716, experiment_id='180358635567956149', last_update_time=1755171885716, lifecycle_stage='active', name='mining', tags={}>

In [9]:
# Save model
import joblib

In [10]:
# Load the configuration file
with open('../config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# Access the settings
project_folder = config['projectFolder']
df_path = os.path.join(project_folder, config['transformedDataFile'])
df = pd.read_csv(df_path)

# Display the first few rows
df.head()

Unnamed: 0,% Iron Feed,Starch Flow,Amina Flow,Ore Pulp Flow,Ore Pulp pH,Ore Pulp Density,Flotation Column 01 Air Flow,Flotation Column 02 Air Flow,Flotation Column 04 Air Flow,Flotation Column 05 Air Flow,...,Flotation Column 07 Air Flow,Flotation Column 01 Level,Flotation Column 02 Level,Flotation Column 03 Level,Flotation Column 04 Level,Flotation Column 05 Level,Flotation Column 06 Level,Flotation Column 07 Level,% Iron Concentrate,% Silica Concentrate
0,55.2,3170.41,539.673,399.697,10.1589,1.6691,249.291,248.269,295.096,306.4,...,249.774,462.601,488.724,441.674,433.629,448.477,480.866,489.382,67.06,1.11
1,55.2,3365.65,573.517,399.023,10.086,1.70565,249.379,253.312,295.096,306.4,...,249.06,456.445,440.432,456.625,432.736,464.334,445.95,432.906,66.97,1.27
2,55.2,2693.75,592.133,409.204,9.9488,1.72472,248.302,251.906,295.096,306.4,...,249.236,459.248,480.114,453.814,433.885,438.642,421.974,408.193,66.75,1.36
3,55.2,2352.216,601.807,398.145,9.84375,1.75928,248.95,246.313,295.096,306.4,...,252.686,558.545,548.28,551.96,561.72,543.07,529.26,514.24,66.63,1.34
4,55.2,3313.96,626.099,399.785,9.7471,1.77,248.379,250.532,295.096,306.4,...,251.323,559.346,534.12,539.332,538.596,545.27,575.404,595.68,66.85,1.15


In [11]:
# Define features (X) and target variable (y)
X = df.drop('% Silica Concentrate', axis=1)  # Features (all columns except 'Job Offer')
y = df['% Silica Concentrate']  # Target variable

In [12]:
# Split data into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(f"Training set size: {X_train.shape}")
print(f"Validation set size: {X_val.shape}")
print(f"Test set size: {X_test.shape}")
print("\n")

Training set size: (2747, 20)
Validation set size: (589, 20)
Test set size: (589, 20)




In [13]:
# Initialize models
models = {
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'LGBM Regressor': LGBMRegressor(random_state=42),
    'XGBoost Regressor': XGBRegressor(random_state=42)}

In [14]:
# Train and evaluate each model
results = {}

for name, model in models.items():
    with mlflow.start_run(run_name=name):
        print(f"Training {name}...")
        model.fit(X_train, y_train)

        # Predict on validation set
        y_val_pred = model.predict(X_val)

        # Calculate metrics
        val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
        val_mae = mean_absolute_error(y_val, y_val_pred)
        val_r2 = r2_score(y_val, y_val_pred)

        # Log metrics to MLflow
        mlflow.log_metric("val_rmse", val_rmse)
        mlflow.log_metric("val_mae", val_mae)
        mlflow.log_metric("val_r2", val_r2)

        # Log the trained model
        mlflow.sklearn.log_model(model, artifact_path="model")

        results[name] = {
            'model': model,
            'val_rmse': val_rmse,
            'val_mae': val_mae,
            'val_r2': val_r2
        }

        print(f"{name} Validation Results:")
        print(f"  RMSE: {val_rmse:.4f}")
        print(f"  MAE: {val_mae:.4f}")
        print(f"  R²: {val_r2:.4f}")
        print()

Training Random Forest...




Random Forest Validation Results:
  RMSE: 0.5305
  MAE: 0.3971
  R²: 0.7818

Training Gradient Boosting...




Gradient Boosting Validation Results:
  RMSE: 0.5360
  MAE: 0.4085
  R²: 0.7773

Training LGBM Regressor...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005296 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5073
[LightGBM] [Info] Number of data points in the train set: 2747, number of used features: 20
[LightGBM] [Info] Start training from score 2.327059




LGBM Regressor Validation Results:
  RMSE: 0.5121
  MAE: 0.3788
  R²: 0.7967

Training XGBoost Regressor...




XGBoost Regressor Validation Results:
  RMSE: 0.5480
  MAE: 0.4072
  R²: 0.7672



In [15]:
# Find the best model based on RMSE
best_model_name = min(results, key=lambda k: results[k]['val_rmse'])
print(f"Best model based on validation RMSE: {best_model_name}")
print("\n")

Best model based on validation RMSE: LGBM Regressor




In [16]:
best_model= results[best_model_name]['model']

In [17]:
# Save the model
model_filename = f'../model/mining_process_{best_model_name.lower().replace(" ", "_")}_model.pkl'
joblib.dump(best_model, model_filename)
print(f"Model saved as {model_filename}")

Model saved as ../model/mining_process_lgbm_regressor_model.pkl
