<a href="https://colab.research.google.com/github/santiago2588/Pump_failure_training/blob/main/soluciones/04_random_forest_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Data wrangling
import pandas as pd
import numpy as np

In [2]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import class_weight
from sklearn.utils.class_weight import compute_class_weight

In [3]:
# Download and run the utils.py file from your GitHub repository
!wget https://raw.githubusercontent.com/santiago2588/pump_failure_training/main/utils.py
%run utils.py

--2025-07-15 10:24:57--  https://raw.githubusercontent.com/santiago2588/pump_failure_training/main/utils.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2407 (2.4K) [text/plain]
Saving to: ‘utils.py.2’


2025-07-15 10:24:57 (20.9 MB/s) - ‘utils.py.2’ saved [2407/2407]



In [4]:
from sklearn.model_selection import train_test_split
from imblearn.pipeline import Pipeline

In [5]:
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import f1_score

In [6]:
%pip install optuna -q

In [7]:
import optuna

In [8]:
!wget https://raw.githubusercontent.com/santiago2588/pump_failure_training/main/data/transformed_data.csv -O transformed_data.csv

--2025-07-15 10:25:12--  https://raw.githubusercontent.com/santiago2588/pump_failure_training/main/data/transformed_data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1210184 (1.2M) [text/plain]
Saving to: ‘transformed_data.csv’


2025-07-15 10:25:13 (18.1 MB/s) - ‘transformed_data.csv’ saved [1210184/1210184]



In [9]:
# Read the CSV file into a DataFrame
df = pd.read_csv("transformed_data.csv")

# Display the first few rows to confirm it loaded correctly
df.head()

Unnamed: 0,Air_temperature,Process_temperature,Rotational_speed,Torque,Tool_wear,Type_High,Type_Low,Type_Medium,Failure_type
0,-0.951417,-0.946356,0.067484,0.283054,-1.695647,0.0,0.0,1.0,No Failure
1,-0.901428,-0.878954,-0.729604,0.634238,-1.648511,0.0,1.0,0.0,No Failure
2,-0.951417,-1.013759,-0.22794,0.945286,-1.617087,0.0,1.0,0.0,No Failure
3,-0.901428,-0.946356,-0.590253,-0.048061,-1.585664,0.0,1.0,0.0,No Failure
4,-0.901428,-0.878954,-0.729604,0.002108,-1.55424,0.0,1.0,0.0,No Failure


In [10]:
# Define features (X) and target variable (y)
X = df.drop('Failure_type', axis=1)  # Features (all columns except 'Failure_type')
y = df['Failure_type']  # Target variable

In [11]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Check the shape of the data
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

Training data shape: (7978, 8)
Testing data shape: (1995, 8)


In [12]:
# Creating pipeline with random forest classifier and smote
random_forest_smote = Pipeline(steps=[
    ('smote', SMOTE(random_state=2023)),
    ('model', RandomForestClassifier(random_state=2023))
])

# Fit pipeline
random_forest_smote.fit(X_train, y_train)

# Generate Predictions using the correctly fitted pipeline
y_pred = random_forest_smote.predict(X_test)

# Evaluate Metrics
metrics = get_metrics(y_test, y_pred)

# View Results
metrics

{'Accuracy': 0.9679197994987468,
 'Balanced Accuracy': np.float64(0.7401661875673734),
 'Macro Recall': 0.9679197994987468,
 'Macro Precision': 0.9829511924575601,
 'Macro F1': 0.9746780178291959,
 'F1 Scores per Class': array([0.74509804, 0.98382046, 0.51851852, 0.79166667, 0.0625    ])}

In [13]:
# Creating model with random forest classifier and balanced class weights

# Compute sample weights for class imbalance
weights = compute_sample_weight(class_weight='balanced', y=y_train)

# Initialize RandomForestClassifier
rf_model = RandomForestClassifier(random_state=2023)

# Fit the model with sample weights
rf_model.fit(X_train, y_train, sample_weight=weights)

# Generate predictions
y_pred = rf_model.predict(X_test)

# Evaluate metrics
metrics = get_metrics(y_test, y_pred)

# View results
metrics

{'Accuracy': 0.9844611528822055,
 'Balanced Accuracy': np.float64(0.5421846927775782),
 'Macro Recall': 0.9844611528822055,
 'Macro Precision': 0.9812659793000553,
 'Macro F1': 0.9816089386055862,
 'F1 Scores per Class': array([0.7027027 , 0.99230375, 0.71428571, 0.66666667, 0.        ])}

In [14]:
# Define the objective function
def objective(trial):
    # Suggest hyperparameters to optimize
    n_estimators = trial.suggest_int('n_estimators', 50, 300)  # Number of trees
    max_depth = trial.suggest_int('max_depth', 5, 50)  # Maximum depth of trees
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)  # Minimum samples to split
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)  # Minimum samples per leaf
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])  # Features to consider at each split

    # Create the pipeline with the suggested hyperparameters
    random_forest_class = Pipeline(steps=[
        ('model', RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_features=max_features,
            random_state=2023
        ))
    ])

    # Compute sample weights for imbalanced data
    weights = compute_sample_weight(class_weight='balanced', y=y_train)

    # Fit the model
    random_forest_class.fit(X_train, y_train, model__sample_weight=weights)

    # Generate predictions
    y_pred = random_forest_class.predict(X_test)

    # Evaluate using F1-score (or any other metric)
    f1 = f1_score(y_test, y_pred, average='macro')  # Macro F1-score for imbalanced data

    return f1  # Optuna will maximize this score

In [15]:
# Create a study object
study = optuna.create_study(direction='maximize')  # We want to maximize F1-score

# Optimize the study
study.optimize(objective, n_trials=20)  # Run 20 trials (you can increase this for better results)

# Print the best hyperparameters
print("Best hyperparameters:", study.best_params)
print("Best F1-score:", study.best_value)

[I 2025-07-15 10:25:31,499] A new study created in memory with name: no-name-55d50d7f-557e-4a4a-bf15-09ce4cd82786
[I 2025-07-15 10:25:34,780] Trial 0 finished with value: 0.6577082596358783 and parameters: {'n_estimators': 75, 'max_depth': 29, 'min_samples_split': 7, 'min_samples_leaf': 3, 'max_features': None}. Best is trial 0 with value: 0.6577082596358783.
[I 2025-07-15 10:25:35,642] Trial 1 finished with value: 0.5224731074465734 and parameters: {'n_estimators': 95, 'max_depth': 9, 'min_samples_split': 17, 'min_samples_leaf': 15, 'max_features': 'log2'}. Best is trial 0 with value: 0.6577082596358783.
[I 2025-07-15 10:25:36,663] Trial 2 finished with value: 0.558995503864985 and parameters: {'n_estimators': 128, 'max_depth': 8, 'min_samples_split': 20, 'min_samples_leaf': 6, 'max_features': 'log2'}. Best is trial 0 with value: 0.6577082596358783.
[I 2025-07-15 10:25:38,515] Trial 3 finished with value: 0.545539457834654 and parameters: {'n_estimators': 244, 'max_depth': 49, 'min_sa

Best hyperparameters: {'n_estimators': 157, 'max_depth': 44, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': None}
Best F1-score: 0.6613071750488667


In [16]:
# Extract the best hyperparameters
best_params = study.best_params

# Train the final model with the best hyperparameters
final_model = Pipeline(steps=[
    ('model', RandomForestClassifier(
        n_estimators=best_params['n_estimators'],
        max_depth=best_params['max_depth'],
        min_samples_split=best_params['min_samples_split'],
        min_samples_leaf=best_params['min_samples_leaf'],
        max_features=best_params['max_features'],
        random_state=2023
    ))
])

# Compute sample weights
weights = compute_sample_weight(class_weight='balanced', y=y_train)

# Fit the final model
final_model.fit(X_train, y_train, model__sample_weight=weights)

# Generate predictions
y_pred = final_model.predict(X_test)

# Evaluate metrics
metrics = get_metrics(y_test, y_pred)

# Remove 'F1 Scores per Class' from metrics
if 'F1 Scores per Class' in metrics:
    del metrics['F1 Scores per Class']

# View results
metrics

{'Accuracy': 0.9859649122807017,
 'Balanced Accuracy': np.float64(0.7125808480057492),
 'Macro Recall': 0.9859649122807017,
 'Macro Precision': 0.9855378660481174,
 'Macro F1': 0.9855413079473047}