In [39]:
# -*- coding: utf-8 -*-
"""
Notebook 3: Calidad de Datos en Mantenimiento Predictivo
Tema 3: Datos ausentes, redundancia y normalización.
Dataset: APS Failure at Scania Trucks (UCI/Kaggle)
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import kagglehub
from kagglehub import KaggleDatasetAdapter
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# 1. LOAD DATA VIA KAGGLEHUB
print("Descargando y cargando dataset desde Kaggle...")
# The third argument names the training CSV explicitly; it is the file inside
# the Kaggle dataset that gets loaded into the DataFrame.
# `dataset_load` is the current name of this entry point; the older
# `load_dataset` spelling is deprecated and takes the same arguments.
df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "uciml/aps-failure-at-scania-trucks-data-set",
    "aps_failure_training_set.csv",
)

Descargando y cargando dataset desde Kaggle...


  df = kagglehub.load_dataset(


Using Colab cache for faster access to the 'aps-failure-at-scania-trucks-data-set' dataset.


In [40]:
# 2. INITIAL PRE-PROCESSING (turn 'na' into NaN)
# Missing readings arrive as the literal string 'na'; swap them for np.nan so
# that pandas counts those cells as missing.
df = df.replace(to_replace='na', value=np.nan)

# First rows, then the summary statistics, one after the other.
print(df.head(), df.describe(), sep="\n")

n_rows, n_cols = df.shape
print(f"--- Dataset Cargado: {n_rows} filas y {n_cols} columnas ---")


  class  aa_000 ab_000      ac_000 ad_000 ae_000 af_000 ag_000 ag_001 ag_002  \
0   neg   76698    NaN  2130706438    280      0      0      0      0      0   
1   neg   33058    NaN           0    NaN      0      0      0      0      0   
2   neg   41040    NaN         228    100      0      0      0      0      0   
3   neg      12      0          70     66      0     10      0      0      0   
4   neg   60874    NaN        1368    458      0      0      0      0      0   

   ...   ee_002  ee_003  ee_004  ee_005  ee_006  ee_007  ee_008 ee_009 ef_000  \
0  ...  1240520  493384  721044  469792  339156  157956   73224      0      0   
1  ...   421400  178064  293306  245416  133654   81140   97576   1500      0   
2  ...   277378  159812  423992  409564  320746  158022   95128    514      0   
3  ...      240      46      58      44      10       0       0      0      4   
4  ...   622012  229790  405298  347188  286954  311560  433954   1218      0   

  eg_000  
0      0  
1      0  

In [41]:
# 3. IDENTIFYING MISSING DATA
# Every column except the 'class' label is cast to a numeric dtype; values
# that cannot be parsed become NaN (errors='coerce').
feature_names = [name for name in df.columns if name != 'class']
for name in feature_names:
    df[name] = pd.to_numeric(df[name], errors='coerce')

# Share of missing cells per column, expressed as a percentage.
missing_pct = df.isna().mean().mul(100)
print("\nColumnas con más del 50% de datos nulos:")
mostly_missing = missing_pct.loc[missing_pct.gt(50)]
print(mostly_missing.sort_values(ascending=False))


Columnas con más del 50% de datos nulos:
br_000    82.106667
bq_000    81.203333
bp_000    79.566667
bo_000    77.221667
ab_000    77.215000
cr_000    77.215000
bn_000    73.348333
bm_000    65.915000
dtype: float64


In [42]:
# 4. CLEANING: DROPPING LOW-VALUE ATTRIBUTES (Ch. 3.2)
# Keep only the columns whose share of missing values is below 60%.
# Note: this step filters on missingness alone; no variance check is applied here.
cols_to_keep = missing_pct.index[missing_pct < 60]
df_filtered = df.loc[:, cols_to_keep]

In [37]:
# 5. IMPUTATION (Ch. 3.5)
# Fill the remaining gaps with each column's median.
imputer = SimpleImputer(strategy='median')
# Split off the label so that it is not imputed.
X = df_filtered.drop(columns=['class'])
y = df_filtered['class']

# By default fit_transform hands back a plain array, so the frame is rebuilt
# here. Reusing X's own index (instead of letting pandas create a fresh
# RangeIndex) keeps the rows of X_imputed aligned with y even when the frame
# does not carry a default 0..n-1 index.
X_imputed = pd.DataFrame(
    imputer.fit_transform(X),
    columns=X.columns,
    index=X.index,
)

In [38]:
# 6. Z-SCORE NORMALIZATION (Ch. 3.7)
# Puts features measured on very different scales onto a comparable footing
# (each column is centred and divided by its standard deviation).
scaler = StandardScaler()
# By default fit_transform returns a plain array; rebuild the frame with the
# column names and the index of X_imputed so row labels are not replaced by a
# fresh RangeIndex.
X_scaled = pd.DataFrame(
    scaler.fit_transform(X_imputed),
    columns=X_imputed.columns,
    index=X_imputed.index,
)

print("\n--- Resultado de la Normalización (Z-Score) ---")
print(X_scaled.head())



--- Resultado de la Normalización (Z-Score) ---
     aa_000    ac_000    ad_000    ae_000    af_000    ag_000   ag_001  \
0  0.119381  2.310224 -0.004085 -0.041322 -0.051358 -0.010762 -0.02837   
1 -0.180697 -0.432859 -0.004089 -0.041322 -0.051358 -0.010762 -0.02837   
2 -0.125811 -0.432859 -0.004090 -0.041322 -0.051358 -0.010762 -0.02837   
3 -0.407928 -0.432859 -0.004091 -0.041322 -0.002669 -0.010762 -0.02837   
4  0.010572 -0.432857 -0.004080 -0.041322 -0.051358 -0.010762 -0.02837   

     ag_002    ag_003    ag_004  ...    ee_002    ee_003    ee_004    ee_005  \
0 -0.056929 -0.115643 -0.167274  ...  0.693832  0.524393  0.239087  0.070072   
1 -0.056929 -0.115643 -0.175319  ... -0.018901 -0.059135 -0.129021 -0.131171   
2 -0.056929 -0.115643 -0.182351  ... -0.144217 -0.092912 -0.016553  0.016053   
3 -0.056929 -0.115223 -0.182112  ... -0.385361 -0.388574 -0.381387 -0.351244   
4 -0.056929 -0.115643 -0.164521  ...  0.155656  0.036588 -0.032641 -0.039892   

     ee_006    ee_007    