In [1]:
import yaml
import os

In [2]:
# Data wrangling
import pandas as pd
import numpy as np

In [3]:
# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from tqdm import tqdm

In [4]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [5]:
import mlflow
import mlflow.sklearn

In [6]:
# Keep experiment tracking local: runs are written to an `mlruns` directory
# given relative to the notebook's working directory.
tracking_uri = "file:./mlruns"
experiment_name = "mining"

mlflow.set_tracking_uri(tracking_uri)
# Left as the cell's last expression so the selected Experiment is displayed.
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location=('file:c:/Users/mjkipsz2/OneDrive - The University of '
 'Manchester/Desktop/mining_dataset_training/notebooks/mlruns/180358635567956149'), creation_time=1755171885716, experiment_id='180358635567956149', last_update_time=1755171885716, lifecycle_stage='active', name='mining', tags={}>

In [7]:
# Save model
import joblib

In [8]:
# Read the project settings from the YAML file one level above the notebook
with open('../config.yaml') as config_stream:
    config = yaml.safe_load(config_stream)

# Build the dataset path from the configured project folder and file name
project_folder = config['projectFolder']
normalized_file = config['normalizedDataFile']
df_path = os.path.join(project_folder, normalized_file)

# Load the normalized dataset and preview its first five rows
df = pd.read_csv(df_path)
df.head()

Unnamed: 0,% Iron Feed,Starch Flow,Amina Flow,Ore Pulp Flow,Ore Pulp pH,Ore Pulp Density,Flotation Column 01 Air Flow,Flotation Column 02 Air Flow,Flotation Column 04 Air Flow,Flotation Column 05 Air Flow,...,Flotation Column 07 Air Flow,Flotation Column 01 Level,Flotation Column 02 Level,Flotation Column 03 Level,Flotation Column 04 Level,Flotation Column 05 Level,Flotation Column 06 Level,Flotation Column 07 Level,% Iron Concentrate,% Silica Concentrate
0,-0.213027,0.275996,0.568826,0.227759,1.019824,-0.170499,-1.039839,-0.965738,-1.694665,1.808142,...,-1.422218,-0.44131,-0.268142,-0.598737,0.133238,0.265119,0.553233,0.801282,1.800787,1.11
1,-0.213027,0.435872,0.942004,0.157869,0.830227,0.359848,-1.036866,-0.796598,-1.694665,1.808142,...,-1.44699,-0.48827,-0.642799,-0.499185,0.123367,0.454182,0.166073,0.133461,1.720431,1.27
2,-0.213027,-0.114326,1.147271,1.213589,0.473401,0.636556,-1.073257,-0.843754,-1.694665,1.808142,...,-1.440884,-0.466888,-0.33494,-0.517902,0.136068,0.147856,-0.09978,-0.158767,1.524008,1.36
3,-0.213027,-0.393998,1.253941,0.066825,0.20019,1.138027,-1.051361,-1.031341,-1.694665,1.808142,...,-1.321186,0.290578,0.193901,0.135607,1.549147,1.392954,1.089841,1.095224,1.416868,1.34
4,-0.213027,0.393545,1.521794,0.236885,-0.051175,1.293576,-1.070655,-0.889838,-1.694665,1.808142,...,-1.368475,0.296688,0.084046,0.051523,1.293536,1.419185,1.6015,2.058241,1.613291,1.15


In [9]:
# Separate the prediction target (y) from the input features (X).
# NOTE(review): '% Iron Concentrate' stays among the features; presumably it is
# measured together with the target -- confirm it is available at prediction time.
target_column = '% Silica Concentrate'
y = df[target_column]                 # Target variable
X = df.drop(columns=[target_column])  # Features: every column except the target

In [10]:
# Keep only the numeric feature columns, then fill any remaining NaN values
# with the median of their column (no columns are dropped for containing NaN).
# NOTE(review): the medians are computed on the full dataset before the
# train/validation/test split, so held-out rows influence the imputation --
# consider computing them from the training split only.
numeric_features = X.select_dtypes(include='number')
X = numeric_features.fillna(numeric_features.median())

In [11]:
# Split data into train, validation, and test sets
# First carve off 30% of the rows as a temporary hold-out, then cut that
# hold-out in half to obtain the validation and test sets (70/15/15 overall).
# Both splits use the same fixed seed so the partition is reproducible.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Report the (rows, columns) shape of each partition
print(f"Training set size: {X_train.shape}")
print(f"Validation set size: {X_val.shape}")
print(f"Test set size: {X_test.shape}")
print("\n")

Training set size: (2747, 20)
Validation set size: (589, 20)
Test set size: (589, 20)




In [12]:
# Initialize models
# Candidate linear regressors, keyed by the display name that the training
# loop uses for MLflow run names and printed reports. LinearRegression and
# Ridge use library defaults; Lasso and ElasticNet use alpha=0.1, and
# ElasticNet mixes the L1 and L2 penalties evenly (l1_ratio=0.5).
models = {}
models['Linear Regression'] = LinearRegression()
models['Ridge Regression'] = Ridge()
models['Lasso Regression'] = Lasso(alpha=0.1)
models['ElasticNet Regression'] = ElasticNet(alpha=0.1, l1_ratio=0.5)

In [13]:
# Train and evaluate each model
# Train every candidate model inside its own MLflow run and score it on the
# validation split. `results` maps model name -> fitted estimator plus its
# validation RMSE, MAE and R².
results = {}

for name, model in models.items():
    with mlflow.start_run(run_name=name):
        print(f"Training {name}...")

        # Record the estimator's hyperparameters so runs can be compared and
        # reproduced from the tracking store (previously only metrics and the
        # model artifact were logged).
        mlflow.log_params(model.get_params())

        model.fit(X_train, y_train)

        # Predict on validation set
        y_val_pred = model.predict(X_val)

        # Calculate metrics (RMSE is the square root of the MSE)
        val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
        val_mae = mean_absolute_error(y_val, y_val_pred)
        val_r2 = r2_score(y_val, y_val_pred)
        val_metrics = {
            'val_rmse': val_rmse,
            'val_mae': val_mae,
            'val_r2': val_r2,
        }

        # Log all validation metrics to MLflow in a single call
        mlflow.log_metrics(val_metrics)

        # Log the trained model as a run artifact
        mlflow.sklearn.log_model(model, artifact_path="model")

        # Keep the fitted estimator next to its scores for model selection
        results[name] = {'model': model, **val_metrics}

        print(f"{name} Validation Results:")
        print(f"  RMSE: {val_rmse:.4f}")
        print(f"  MAE: {val_mae:.4f}")
        print(f"  R²: {val_r2:.4f}")
        print()



Training Linear Regression...




Linear Regression Validation Results:
  RMSE: 0.6051
  MAE: 0.4814
  R²: 0.7161

Training Ridge Regression...




Ridge Regression Validation Results:
  RMSE: 0.6051
  MAE: 0.4814
  R²: 0.7161

Training Lasso Regression...




Lasso Regression Validation Results:
  RMSE: 0.6329
  MAE: 0.4872
  R²: 0.6894

Training ElasticNet Regression...




ElasticNet Regression Validation Results:
  RMSE: 0.6211
  MAE: 0.4830
  R²: 0.7009



In [14]:
# Select the model with the lowest validation RMSE; on an exact tie the model
# that was inserted into `results` first wins.
validation_rmse = {
    model_name: scores['val_rmse'] for model_name, scores in results.items()
}
best_model_name = min(validation_rmse, key=validation_rmse.get)
print(f"Best model based on validation RMSE: {best_model_name}")

Best model based on validation RMSE: Ridge Regression
