In [53]:
import yaml
import os

In [54]:
# Data wrangling
import pandas as pd
import numpy as np

In [55]:
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.utils import class_weight
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_sample_weight

In [56]:
%run "C:/Users/mjkipsz2/OneDrive - The University of Manchester/Desktop/Pump failure/utils.py"

In [57]:
from sklearn.model_selection import train_test_split
from imblearn.pipeline import Pipeline

In [58]:
import optuna
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import f1_score

In [59]:
import mlflow
import mlflow.sklearn
mlflow.set_tracking_uri("file:./mlruns")
mlflow.set_experiment("pump_failure_prediction")

<Experiment: artifact_location=('file:///c:/Users/mjkipsz2/OneDrive - The University of '
 'Manchester/Desktop/Pump failure/notebook/mlruns/307286651761719792'), creation_time=1734704895170, experiment_id='307286651761719792', last_update_time=1734704895170, lifecycle_stage='active', name='pump_failure_prediction', tags={}>

In [60]:
# Load the configuration file
with open('../config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# Access the settings
project_folder = config['projectFolder']
df_path = os.path.join(project_folder, config['transformedDataFile'])
df = pd.read_csv(df_path)

# Display the first few rows
df.head()

Unnamed: 0,Air_temperature,Process_temperature,Rotational_speed,Torque,Tool_wear,Type_High,Type_Low,Type_Medium,Failure_type
0,-0.951417,-0.946356,0.067484,0.283054,-1.695647,0.0,0.0,1.0,No Failure
1,-0.901428,-0.878954,-0.729604,0.634238,-1.648511,0.0,1.0,0.0,No Failure
2,-0.951417,-1.013759,-0.22794,0.945286,-1.617087,0.0,1.0,0.0,No Failure
3,-0.901428,-0.946356,-0.590253,-0.048061,-1.585664,0.0,1.0,0.0,No Failure
4,-0.901428,-0.878954,-0.729604,0.002108,-1.55424,0.0,1.0,0.0,No Failure


In [61]:
# Define features (X) and target variable (y)
X = df.drop('Failure_type', axis=1)  # Features (all columns except 'Failure_type')
y = df['Failure_type']  # Target variable

In [62]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Check the shape of the data
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

Training data shape: (7978, 8)
Testing data shape: (1995, 8)


In [63]:
# Encode target labels as XGB requires numerical labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [64]:
# Creating pipeline with xgboost classifier and smote
xg_boost_smote = Pipeline(steps=[
    ('smote', SMOTE(random_state=2023)),
    ('model', XGBClassifier(random_state=2023))
])

# Fit pipeline 
xg_boost_smote.fit(X_train, y_train_encoded)

# Generate Predictions using the correctly fitted pipeline
y_pred = xg_boost_smote.predict(X_test)

# Decode predictions back to original labels (optional)
y_pred_decoded = label_encoder.inverse_transform(y_pred)

# Evaluate Metrics
metrics = get_metrics(y_test_encoded, y_pred)

# View Results
metrics



{'Accuracy': 0.9754385964912281,
 'Balanced Accuracy': np.float64(0.8494708947179304),
 'Macro Recall': 0.9754385964912281,
 'Macro Precision': 0.989242283791671,
 'Macro F1': 0.9815870281839686,
 'F1 Scores per Class': array([0.89795918, 0.98748044, 0.69565217, 0.85714286, 0.11428571])}

In [65]:
# Creating model with xgboost classifier and balanced class weights

# Compute sample weights for class imbalance
weights = compute_sample_weight(class_weight='balanced', y=y_train_encoded)

# Initialize XGBoost classifier
xgb_model = XGBClassifier(random_state=2023)

# Fit the model with sample weights
xgb_model.fit(X_train, y_train_encoded, sample_weight=weights)

# Generate predictions
y_pred = xgb_model.predict(X_test)

# Evaluate metrics
metrics = get_metrics(y_test_encoded, y_pred)

# View results
metrics

{'Accuracy': 0.9829573934837093,
 'Balanced Accuracy': np.float64(0.8574281351060007),
 'Macro Recall': 0.9829573934837093,
 'Macro Precision': 0.9890159415390738,
 'Macro F1': 0.9854568402093054,
 'F1 Scores per Class': array([0.88      , 0.9914308 , 0.66666667, 0.85106383, 0.28571429])}

In [66]:
# Define the objective function for Optuna
def objective(trial):
    # Define the hyperparameter search space
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),  # Number of trees
        'max_depth': trial.suggest_int('max_depth', 3, 15),  # Maximum depth of trees
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),  # Learning rate
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),  # Subsample ratio
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),  # Feature subsampling
        'gamma': trial.suggest_float('gamma', 0, 5),  # Minimum loss reduction
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),  # L1 regularization
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),  # L2 regularization
    }

    # Initialize the XGBoost classifier with the trial's parameters
    model = XGBClassifier(random_state=2023, **params)


    # Compute sample weights for class imbalance
    weights = compute_sample_weight(class_weight='balanced', y=y_train_encoded)
    
    # Fit the model with sample weights
    model.fit(X_train, y_train_encoded, sample_weight=weights)

    # Generate predictions
    y_pred = model.predict(X_test)

    # Evaluate the model using F1 score (or any other metric)
    f1 = f1_score(y_test_encoded, y_pred, average='weighted')  # Weighted F1 score for imbalanced data

    return f1  # Optuna will maximize this score

In [67]:
# Create a study object
study = optuna.create_study(direction='maximize')  # We want to maximize F1-score

# Optimize the study
study.optimize(objective, n_trials=20)  # Run 20 trials (you can increase this for better results)

# Print the best hyperparameters
print("Best hyperparameters:", study.best_params)
print("Best F1-score:", study.best_value)

[I 2024-12-20 15:15:17,845] A new study created in memory with name: no-name-6143c4bd-8429-4ad2-b3d1-2f418b624abb
[I 2024-12-20 15:15:19,039] Trial 0 finished with value: 0.9332163306517605 and parameters: {'n_estimators': 498, 'max_depth': 12, 'learning_rate': 0.017406459347193183, 'subsample': 0.9489754931155063, 'colsample_bytree': 0.7779877919310103, 'gamma': 1.091016386260999, 'reg_alpha': 7.299224651568444, 'reg_lambda': 6.720300571550903}. Best is trial 0 with value: 0.9332163306517605.
[I 2024-12-20 15:15:19,753] Trial 1 finished with value: 0.9579416756596505 and parameters: {'n_estimators': 407, 'max_depth': 5, 'learning_rate': 0.11401789946303593, 'subsample': 0.5473246083117472, 'colsample_bytree': 0.900536565899873, 'gamma': 1.8710561070448413, 'reg_alpha': 0.6304070300250664, 'reg_lambda': 5.008072504444762}. Best is trial 1 with value: 0.9579416756596505.
[I 2024-12-20 15:15:20,839] Trial 2 finished with value: 0.9147025708089696 and parameters: {'n_estimators': 390, 'ma

Best hyperparameters: {'n_estimators': 312, 'max_depth': 4, 'learning_rate': 0.1414622742715242, 'subsample': 0.5478805352631755, 'colsample_bytree': 0.9095844002818785, 'gamma': 1.6663756707018242, 'reg_alpha': 0.09488436911124098, 'reg_lambda': 3.431347316606015}
Best F1-score: 0.9611127694037038


In [68]:
# Extract the best hyperparameters
best_params = study.best_params

# Train the final model with the best hyperparameters
final_model = XGBClassifier(random_state=2023, **best_params)

# Compute sample weights again
weights = compute_sample_weight(class_weight='balanced', y=y_train_encoded)

# Start an MLflow run
with mlflow.start_run(run_name="xgboost_classification_run"):

    # Log the best hyperparameters
    mlflow.log_params(best_params)
    
    # Fit the final model
    final_model.fit(X_train, y_train_encoded, sample_weight=weights)
    
    # Generate predictions
    y_pred = final_model.predict(X_test)
    
    # Evaluate metrics
    metrics = get_metrics(y_test_encoded, y_pred)
    
    # Remove 'F1 Scores per Class' from metrics
    if 'F1 Scores per Class' in metrics:
        del metrics['F1 Scores per Class']
    
    # Log metrics
    mlflow.log_metrics(metrics)
    
    # Optionally, log the final model
    mlflow.sklearn.log_model(final_model, "xg_boost_classifier")

# View results
metrics



{'Accuracy': 0.9448621553884712,
 'Balanced Accuracy': np.float64(0.8916582824290333),
 'Macro Recall': 0.9448621553884712,
 'Macro Precision': 0.9836493899640406,
 'Macro F1': 0.9611127694037038}