<a href="https://colab.research.google.com/github/santiago2588/Pump_failure_training/blob/main/soluciones/03_baseline_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Data wrangling
import pandas as pd
import numpy as np

In [2]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE

In [3]:
# Download the utils.py file from your GitHub repository
!wget https://raw.githubusercontent.com/santiago2588/pump_failure_training/main/utils.py

--2025-07-15 10:05:49--  https://raw.githubusercontent.com/santiago2588/pump_failure_training/main/utils.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2407 (2.4K) [text/plain]
Saving to: ‘utils.py’


2025-07-15 10:05:49 (4.65 MB/s) - ‘utils.py’ saved [2407/2407]



In [4]:
# Now you can run the script using %run
%run utils.py

In [5]:
from sklearn.model_selection import train_test_split
from imblearn.pipeline import Pipeline

In [6]:
!wget https://raw.githubusercontent.com/santiago2588/pump_failure_training/main/data/transformed_data.csv -O transformed_data.csv

--2025-07-15 10:06:07--  https://raw.githubusercontent.com/santiago2588/pump_failure_training/main/data/transformed_data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1210184 (1.2M) [text/plain]
Saving to: ‘transformed_data.csv’


2025-07-15 10:06:07 (24.2 MB/s) - ‘transformed_data.csv’ saved [1210184/1210184]



In [7]:
# Read the CSV file into a DataFrame
df = pd.read_csv("transformed_data.csv")

# Display the first few rows to confirm it loaded correctly
df.head()

Unnamed: 0,Air_temperature,Process_temperature,Rotational_speed,Torque,Tool_wear,Type_High,Type_Low,Type_Medium,Failure_type
0,-0.951417,-0.946356,0.067484,0.283054,-1.695647,0.0,0.0,1.0,No Failure
1,-0.901428,-0.878954,-0.729604,0.634238,-1.648511,0.0,1.0,0.0,No Failure
2,-0.951417,-1.013759,-0.22794,0.945286,-1.617087,0.0,1.0,0.0,No Failure
3,-0.901428,-0.946356,-0.590253,-0.048061,-1.585664,0.0,1.0,0.0,No Failure
4,-0.901428,-0.878954,-0.729604,0.002108,-1.55424,0.0,1.0,0.0,No Failure


In [8]:
# Define features (X) and target variable (y)
X = df.drop('Failure_type', axis=1)  # Features (all columns except 'Failure_type')
y = df['Failure_type']  # Target variable

In [9]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Check the shape of the data
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

Training data shape: (7978, 8)
Testing data shape: (1995, 8)


In [10]:
# Define the pipeline for Logistic regression, PCA and class weight
log_reg_pca_class = Pipeline(steps=[
    ('pca', PCA(n_components=4)),  # PCA step
    ('model', LogisticRegression(random_state=2023))  # Logistic Regression step
])

# Fit the pipeline with balanced class weights
weights = class_weight.compute_sample_weight(class_weight='balanced', y=y_train)
log_reg_pca_class.fit(X_train, y_train, model__sample_weight=weights)

# Generate predictions
y_pred = log_reg_pca_class.predict(X_test)

# Evaluate metrics
metrics = get_metrics(y_test, y_pred)

# View results
metrics

{'Accuracy': 0.48521303258145365,
 'Balanced Accuracy': np.float64(0.6057105641394178),
 'Macro Recall': 0.48521303258145365,
 'Macro Precision': 0.957444554009486,
 'Macro F1': 0.6322638089216059,
 'F1 Scores per Class': array([0.08947368, 0.64911673, 0.26415094, 0.01219512, 0.04494382])}

In [11]:
# Define the pipeline for Logistic regression and class weight

log_reg_class = Pipeline(steps=[
    ('model', LogisticRegression(random_state=2023))  # Logistic Regression step
])

# Fit the pipeline with balanced class weights
weights = class_weight.compute_sample_weight(class_weight='balanced', y=y_train)
log_reg_class.fit(X_train, y_train, model__sample_weight=weights)

# Generate predictions
y_pred = log_reg_class.predict(X_test)

# Evaluate metrics
metrics = get_metrics(y_test, y_pred)

# View results
metrics

{'Accuracy': 0.8390977443609022,
 'Balanced Accuracy': np.float64(0.9097574559827525),
 'Macro Recall': 0.8390977443609022,
 'Macro Precision': 0.9788410370895997,
 'Macro F1': 0.8960718171509856,
 'F1 Scores per Class': array([0.36521739, 0.91052335, 0.27118644, 0.61333333, 0.04278075])}

In [14]:
# Define the pipeline for Logistic regression and SMOTE
log_reg_pca_smote = Pipeline(steps=[
    ('smote', SMOTE(random_state=2023)),  # SMOTE step to balance the dataset
    ('model', LogisticRegression(random_state=2023, max_iter=500))  # Logistic Regression step
])

# Fit the pipeline
log_reg_pca_smote.fit(X_train, y_train)

# Generate predictions
y_pred = log_reg_pca_smote.predict(X_test)

# Evaluate metrics
metrics = get_metrics(y_test, y_pred)

# View results
metrics

{'Accuracy': 0.8756892230576441,
 'Balanced Accuracy': np.float64(0.9172987782968021),
 'Macro Recall': 0.8756892230576441,
 'Macro Precision': 0.9809086543341904,
 'Macro F1': 0.9198262534962885,
 'F1 Scores per Class': array([0.46153846, 0.93219405, 0.33333333, 0.73015873, 0.05      ])}

In [15]:
# Define the pipeline for KNeighborsClassifier model

knn_model_smote = Pipeline(steps=[
    ('smote', SMOTE(random_state=2023)),
    ('model', KNeighborsClassifier(n_neighbors=3))  # KNeighborsClassifier step
])

# Fit the pipeline without sample weights
knn_model_smote.fit(X_train, y_train)

# Generate predictions
y_pred = knn_model_smote.predict(X_test)

# Evaluate metrics
metrics = get_metrics(y_test, y_pred)

# View results
metrics

{'Accuracy': 0.9413533834586466,
 'Balanced Accuracy': np.float64(0.7496909809558031),
 'Macro Recall': 0.9413533834586466,
 'Macro Precision': 0.9777431525061278,
 'Macro F1': 0.9569057325560987,
 'F1 Scores per Class': array([0.50666667, 0.9698253 , 0.36363636, 0.71698113, 0.07843137])}