In [23]:
import yaml
import os

In [24]:
# Data wrangling
import pandas as pd
import numpy as np

In [25]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import class_weight
from sklearn.utils.class_weight import compute_class_weight

In [26]:
%run "C:/Users/mjkipsz2/OneDrive - The University of Manchester/Desktop/Pump failure/utils.py"

In [33]:
from sklearn.model_selection import train_test_split
from imblearn.pipeline import Pipeline

In [35]:
import optuna
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import f1_score

In [41]:
import mlflow
import mlflow.sklearn
mlflow.set_tracking_uri("file:./mlruns")
mlflow.set_experiment("pump_failure_prediction")

2024/12/20 14:28:15 INFO mlflow.tracking.fluent: Experiment with name 'pump_failure_prediction' does not exist. Creating a new experiment.


<Experiment: artifact_location=('file:///c:/Users/mjkipsz2/OneDrive - The University of '
 'Manchester/Desktop/Pump failure/notebook/mlruns/307286651761719792'), creation_time=1734704895170, experiment_id='307286651761719792', last_update_time=1734704895170, lifecycle_stage='active', name='pump_failure_prediction', tags={}>

In [28]:
# Load the configuration file
with open('../config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# Access the settings
project_folder = config['projectFolder']
df_path = os.path.join(project_folder, config['transformedDataFile'])
df = pd.read_csv(df_path)

# Display the first few rows
df.head()

Unnamed: 0,Air_temperature,Process_temperature,Rotational_speed,Torque,Tool_wear,Type_High,Type_Low,Type_Medium,Failure_type
0,-0.951417,-0.946356,0.067484,0.283054,-1.695647,0.0,0.0,1.0,No Failure
1,-0.901428,-0.878954,-0.729604,0.634238,-1.648511,0.0,1.0,0.0,No Failure
2,-0.951417,-1.013759,-0.22794,0.945286,-1.617087,0.0,1.0,0.0,No Failure
3,-0.901428,-0.946356,-0.590253,-0.048061,-1.585664,0.0,1.0,0.0,No Failure
4,-0.901428,-0.878954,-0.729604,0.002108,-1.55424,0.0,1.0,0.0,No Failure


In [29]:
# Define features (X) and target variable (y)
X = df.drop('Failure_type', axis=1)  # Features (all columns except 'Failure_type')
y = df['Failure_type']  # Target variable

In [30]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Check the shape of the data
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

Training data shape: (7978, 8)
Testing data shape: (1995, 8)


In [31]:
# Creating pipeline with random forest classifier and smote
random_forest_smote = Pipeline(steps=[
    ('smote', SMOTE(random_state=2023)),
    ('model', RandomForestClassifier(random_state=2023))
])

# Fit pipeline 
random_forest_smote.fit(X_train, y_train)

# Generate Predictions using the correctly fitted pipeline
y_pred = random_forest_smote.predict(X_test)

# Evaluate Metrics
metrics = get_metrics(y_test, y_pred)

# View Results
metrics



{'Accuracy': 0.9679197994987468,
 'Balanced Accuracy': np.float64(0.7401661875673734),
 'Macro Recall': 0.7401661875673734,
 'Macro Precision': 0.5676455660952664,
 'Macro F1': 0.6203207367382119,
 'F1 Scores per Class': array([0.74509804, 0.98382046, 0.51851852, 0.79166667, 0.0625    ])}

In [None]:
# Creating model with random forest classifier and balanced class weights

# Compute sample weights for class imbalance
weights = compute_sample_weight(class_weight='balanced', y=y_train)

# Initialize RandomForestClassifier
rf_model = RandomForestClassifier(random_state=2023)

# Fit the model with sample weights
rf_model.fit(X_train, y_train, sample_weight=weights)

# Generate predictions
y_pred = rf_model.predict(X_test)

# Evaluate metrics
metrics = get_metrics(y_test, y_pred)

# View results
metrics

{'Accuracy': 0.9869674185463659, 'Balanced Accuracy': np.float64(0.6019673014732303), 'Macro Recall': 0.6019673014732303, 'Macro Precision': 0.7430276147249908, 'Macro F1': 0.6592561840031659, 'F1 Scores per Class': array([0.8       , 0.99357822, 0.8       , 0.7027027 , 0.        ])}


In [51]:
# Define the objective function
def objective(trial):
    # Suggest hyperparameters to optimize
    n_estimators = trial.suggest_int('n_estimators', 50, 300)  # Number of trees
    max_depth = trial.suggest_int('max_depth', 5, 50)  # Maximum depth of trees
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)  # Minimum samples to split
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)  # Minimum samples per leaf
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])  # Features to consider at each split

    # Create the pipeline with the suggested hyperparameters
    random_forest_class = Pipeline(steps=[
        ('model', RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_features=max_features,
            random_state=2023
        ))
    ])

    # Compute sample weights for imbalanced data
    weights = compute_sample_weight(class_weight='balanced', y=y_train)

    # Fit the model
    random_forest_class.fit(X_train, y_train, model__sample_weight=weights)

    # Generate predictions
    y_pred = random_forest_class.predict(X_test)

    # Evaluate using F1-score (or any other metric)
    f1 = f1_score(y_test, y_pred, average='macro')  # Macro F1-score for imbalanced data

    return f1  # Optuna will maximize this score

In [38]:
# Create a study object
study = optuna.create_study(direction='maximize')  # We want to maximize F1-score

# Optimize the study
study.optimize(objective, n_trials=20)  # Run 20 trials (you can increase this for better results)

# Print the best hyperparameters
print("Best hyperparameters:", study.best_params)
print("Best F1-score:", study.best_value)

[I 2024-12-20 14:21:13,573] A new study created in memory with name: no-name-e19c82a7-eceb-4f23-a018-0c0d76506dd0
[I 2024-12-20 14:21:14,965] Trial 0 finished with value: 0.5889721296356367 and parameters: {'n_estimators': 173, 'max_depth': 30, 'min_samples_split': 6, 'min_samples_leaf': 9, 'max_features': None}. Best is trial 0 with value: 0.5889721296356367.
[I 2024-12-20 14:21:15,593] Trial 1 finished with value: 0.5103395193144717 and parameters: {'n_estimators': 167, 'max_depth': 35, 'min_samples_split': 2, 'min_samples_leaf': 17, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.5889721296356367.
[I 2024-12-20 14:21:16,903] Trial 2 finished with value: 0.5889721296356367 and parameters: {'n_estimators': 168, 'max_depth': 45, 'min_samples_split': 17, 'min_samples_leaf': 9, 'max_features': None}. Best is trial 0 with value: 0.5889721296356367.
[I 2024-12-20 14:21:17,840] Trial 3 finished with value: 0.5278526205355474 and parameters: {'n_estimators': 199, 'max_depth': 30, 'min

Best hyperparameters: {'n_estimators': 119, 'max_depth': 44, 'min_samples_split': 11, 'min_samples_leaf': 6, 'max_features': 'log2'}
Best F1-score: 0.6322743650231339


In [49]:
# Extract the best hyperparameters
best_params = study.best_params

# Train the final model with the best hyperparameters
final_model = Pipeline(steps=[
    ('model', RandomForestClassifier(
        n_estimators=best_params['n_estimators'],
        max_depth=best_params['max_depth'],
        min_samples_split=best_params['min_samples_split'],
        min_samples_leaf=best_params['min_samples_leaf'],
        max_features=best_params['max_features'],
        random_state=2023
    ))
])

# Compute sample weights
weights = compute_sample_weight(class_weight='balanced', y=y_train)

# Start an MLflow run
with mlflow.start_run(run_name="rf_classification_run"):

    # Log the best hyperparameters
    mlflow.log_params(best_params)
    
    # Fit the final model
    final_model.fit(X_train, y_train, model__sample_weight=weights)
    
    # Generate predictions
    y_pred = final_model.predict(X_test)
    
    # Evaluate metrics
    metrics = get_metrics(y_test, y_pred)
    
    # Remove 'F1 Scores per Class' from metrics
    if 'F1 Scores per Class' in metrics:
        del metrics['F1 Scores per Class']
    
    # Log metrics
    mlflow.log_metrics(metrics)
    
    # Optionally, log the final model
    mlflow.sklearn.log_model(final_model, "random_forest_classifier")

# View results
metrics



{'Accuracy': 0.9689223057644111,
 'Balanced Accuracy': np.float64(0.8481279195113188),
 'Macro Recall': 0.8481279195113188,
 'Macro Precision': 0.5391545520593068,
 'Macro F1': 0.6322743650231339}