Imports y configuración básica de la API

In [19]:
import requests
import pandas as pd
import numpy as np

from datetime import datetime

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [20]:
API_ROOT = "http://127.0.0.1:8000"
API_PREFIX = "/api"

# Autenticación: ajusta según uses bearer/basic/x-api-key
AUTH_MODE = "x-api-key"
X_API_KEY = "zxcvbnm"   # pon aquí tu api-key real
BEARER_TOKEN = ""
BASIC_USER = ""
BASIC_PASS = ""

session = requests.Session()
headers = {}
auth = None

if AUTH_MODE == "bearer" and BEARER_TOKEN:
    headers["Authorization"] = f"Bearer {BEARER_TOKEN}"
elif AUTH_MODE == "basic" and BASIC_USER:
    auth = (BASIC_USER, BASIC_PASS)
elif AUTH_MODE == "x-api-key" and X_API_KEY:
    headers["x-api-key"] = X_API_KEY

print("Headers usados:", headers)

# Señal principal, ventana, y filtros de rango
MAIN_VAR   = "Acceleration RMS (Radial)"
WINDOW_S   = 7200   # debe coincidir con el valor que usaste en features

ASSET_CODIGO = "37156"       # ajusta
MOTOR_CODIGO = "Motor_001"   # ajusta

TS_FROM = datetime(2025, 8, 11, 0, 0, 0)   # ajusta rango
TS_TO   = datetime(2025, 8, 30, 0, 0, 0)


Headers usados: {'x-api-key': 'zxcvbnm'}


Helper para paginar (mediciones y características)

In [21]:
def fetch_all_paginated(base_url, params=None, limit=1000):
    """
    Descarga todos los registros de un endpoint paginado tipo:
      GET ?limit=...&offset=...
    Devuelve una lista de dicts.
    """
    if params is None:
        params = {}

    all_items = []
    offset = 0

    while True:
        p = params.copy()
        p["limit"] = limit
        p["offset"] = offset

        print(f"GET {base_url}  offset={offset}")
        r = session.get(base_url, params=p, headers=headers, auth=auth, timeout=60)
        if r.status_code != 200:
            print("❌ Error:", r.status_code, r.text)
            r.raise_for_status()

        batch = r.json()
        if not isinstance(batch, list):
            print("⚠ Respuesta inesperada, esperaba lista.")
            break

        n = len(batch)
        if n == 0:
            break

        all_items.extend(batch)
        if n < limit:
            break

        offset += limit

    print(f"Total items descargados desde {base_url}: {len(all_items)}")
    return all_items


Descargar mediciones desde /api/mediciones

In [22]:
def load_mediciones(
    asset_codigo: str,
    motor_codigo: str,
    variable: str,
    ts_from: datetime,
    ts_to: datetime,
) -> pd.DataFrame:
    base_url = f"{API_ROOT}{API_PREFIX}/mediciones"

    params = {
        "asset_codigo": asset_codigo,
        "motor_codigo": motor_codigo,
        "variable": variable,
        "ts_from": ts_from.isoformat(),
        "ts_to": ts_to.isoformat(),
        "sort": "ts_utc",   # orden cronológico ascendente
    }

    items = fetch_all_paginated(base_url, params=params, limit=1000)
    df = pd.DataFrame(items)

    if df.empty:
        print("⚠ No se obtuvieron mediciones.")
        return df

    # Aseguramos tipos
    df["ts_utc"] = pd.to_datetime(df["ts_utc"])
    if "valor" in df.columns:
        df["valor"] = pd.to_numeric(df["valor"], errors="coerce")

    print("Columnas mediciones:", df.columns.tolist())
    print(df.head())
    return df


In [23]:
#Prueba de carga

df_med = load_mediciones(
    ASSET_CODIGO,
    MOTOR_CODIGO,
    MAIN_VAR,
    TS_FROM,
    TS_TO,
)

len(df_med)


GET http://127.0.0.1:8000/api/mediciones  offset=0
Total items descargados desde http://127.0.0.1:8000/api/mediciones: 985
Columnas mediciones: ['despliegue_id', 'ts_utc', 'variable', 'valor', 'indicador_calidad', 'medicion_id', 'ts_local_tz', 'importado_en']
   despliegue_id                    ts_utc                   variable   valor  \
0              1 2025-08-11 02:40:01-05:00  Acceleration RMS (Radial)  0.1873   
1              1 2025-08-11 02:55:00-05:00  Acceleration RMS (Radial)  0.1898   
2              1 2025-08-11 03:10:01-05:00  Acceleration RMS (Radial)  0.1941   
3              1 2025-08-11 03:25:00-05:00  Acceleration RMS (Radial)  0.1938   
4              1 2025-08-11 03:40:01-05:00  Acceleration RMS (Radial)  0.1830   

   indicador_calidad  medicion_id          ts_local_tz  \
0                  0         4751  2025-08-11T02:40:01   
1                  0         4763  2025-08-11T02:55:00   
2                  0         4795  2025-08-11T03:10:01   
3                  0 

985

Descargar características desde /api/caracteristicas

In [24]:
def load_caracteristicas(
    asset_codigo: str,
    motor_codigo: str,
    variable: str,
    ventana_s: int,
    ts_from: datetime,
    ts_to: datetime,
) -> pd.DataFrame:
    base_url = f"{API_ROOT}{API_PREFIX}/caracteristicas"

    params = {
        "asset_codigo": asset_codigo,
        "motor_codigo": motor_codigo,
        "variable": variable,
        "ventana_s": ventana_s,
        "ts_from": ts_from.isoformat(),
        "ts_to": ts_to.isoformat(),
        "sort": "ts_utc",
    }

    items = fetch_all_paginated(base_url, params=params, limit=1000)
    df = pd.DataFrame(items)

    if df.empty:
        print("⚠ No se obtuvieron características.")
        return df

    df["ts_utc"] = pd.to_datetime(df["ts_utc"])
    if "valor" in df.columns:
        df["valor"] = pd.to_numeric(df["valor"], errors="coerce")

    print("Columnas caracteristicas:", df.columns.tolist())
    print(df.head())
    return df


In [25]:
#prueba de caracteristicas

df_feats_long = load_caracteristicas(
    ASSET_CODIGO,
    MOTOR_CODIGO,
    MAIN_VAR,
    WINDOW_S,
    TS_FROM,
    TS_TO,
)

len(df_feats_long)


GET http://127.0.0.1:8000/api/caracteristicas  offset=0
GET http://127.0.0.1:8000/api/caracteristicas  offset=1000
Total items descargados desde http://127.0.0.1:8000/api/caracteristicas: 1089
Columnas caracteristicas: ['despliegue_id', 'ts_utc', 'variable', 'caracteristica', 'valor', 'ventana_s', 'indicador_calidad', 'caracteristica_id', 'ts_local_tz', 'generado_en']
   despliegue_id                    ts_utc                   variable  \
0              1 2025-08-11 04:25:00-05:00  Acceleration RMS (Radial)   
1              1 2025-08-11 04:25:00-05:00  Acceleration RMS (Radial)   
2              1 2025-08-11 04:25:00-05:00  Acceleration RMS (Radial)   
3              1 2025-08-11 04:25:00-05:00  Acceleration RMS (Radial)   
4              1 2025-08-11 04:25:00-05:00  Acceleration RMS (Radial)   

     caracteristica     valor  ventana_s  indicador_calidad  \
0     count_samples  8.000000       7200                  0   
1  fft_energy_total  0.000892       7200                  0   
2

1089

Construir el dataset X / y

In [26]:
# 1) Mediciones: ordenar y calcular y_next y quality_next

if df_med.empty or df_feats_long.empty:
    raise RuntimeError("Faltan datos en df_med o df_feats_long, revisa filtros de API.")

df_med = df_med.sort_values(["despliegue_id", "ts_utc"]).copy()

# Shift por despliegue: próximo valor y próxima calidad
df_med["y_next"] = df_med.groupby("despliegue_id")["valor"].shift(-1)
df_med["quality_next"] = df_med.groupby("despliegue_id")["indicador_calidad"].shift(-1)

# 2) Características: convertir de formato largo a ancho (una fila por ventana)

# df_feats_long debe tener: despliegue_id, ts_utc, caracteristica, valor
# pivot: columnas = nombre de caracteristica
df_feats_wide = (
    df_feats_long
    .pivot_table(
        index=["despliegue_id", "ts_utc"],
        columns="caracteristica",
        values="valor",
        aggfunc="mean",  # por si acaso
    )
    .reset_index()
)

print("df_feats_wide columnas:", df_feats_wide.columns.tolist())
df_feats_wide.head()


df_feats_wide columnas: ['despliegue_id', 'ts_utc', 'count_samples', 'fft_energy_total', 'fft_peak_amp', 'fft_peak_bin', 'max', 'mean', 'min', 'rms_window', 'std']


caracteristica,despliegue_id,ts_utc,count_samples,fft_energy_total,fft_peak_amp,fft_peak_bin,max,mean,min,rms_window,std
0,1,2025-08-11 04:25:00-05:00,8.0,0.000892,0.024724,2.0,0.1941,0.189125,0.179,0.189199,0.005643
1,1,2025-08-11 06:25:00-05:00,8.0,0.001152,0.021761,2.0,0.1948,0.1905,0.1775,0.190583,0.005997
2,1,2025-08-11 08:25:00-05:00,8.0,0.00116,0.023668,1.0,0.1899,0.180725,0.1729,0.180823,0.006364
3,1,2025-08-11 10:25:00-05:00,8.0,0.000788,0.018626,2.0,0.1895,0.183587,0.1721,0.183654,0.005298
4,1,2025-08-11 12:25:00-05:00,8.0,0.006467,0.070417,1.0,0.1925,0.170088,0.1543,0.170673,0.015096


In [27]:
# 3) Unir features (X) con mediciones (y_next)

# Nos quedamos solo con columnas relevantes de mediciones
df_med_sel = df_med[[
    "despliegue_id",
    "ts_utc",
    "valor",               # valor actual
    "y_next",              # target
    "indicador_calidad",   # calidad actual
    "quality_next",        # calidad de próxima medición
]]

df_dataset = df_feats_wide.merge(
    df_med_sel,
    how="inner",
    on=["despliegue_id", "ts_utc"],
)

print("Filas después del merge features + mediciones:", len(df_dataset))

# Quitamos filas sin destino (sin próxima medición)
df_dataset = df_dataset[~df_dataset["y_next"].isna()].copy()
print("Filas con y_next disponible:", len(df_dataset))

df_dataset.head()


Filas después del merge features + mediciones: 121
Filas con y_next disponible: 120


Unnamed: 0,despliegue_id,ts_utc,count_samples,fft_energy_total,fft_peak_amp,fft_peak_bin,max,mean,min,rms_window,std,valor,y_next,indicador_calidad,quality_next
0,1,2025-08-11 04:25:00-05:00,8.0,0.000892,0.024724,2.0,0.1941,0.189125,0.179,0.189199,0.005643,0.1941,0.1941,0,0.0
1,1,2025-08-11 06:25:00-05:00,8.0,0.001152,0.021761,2.0,0.1948,0.1905,0.1775,0.190583,0.005997,0.1775,0.1781,0,0.0
2,1,2025-08-11 08:25:00-05:00,8.0,0.00116,0.023668,1.0,0.1899,0.180725,0.1729,0.180823,0.006364,0.1886,0.1857,0,0.0
3,1,2025-08-11 10:25:00-05:00,8.0,0.000788,0.018626,2.0,0.1895,0.183587,0.1721,0.183654,0.005298,0.1876,0.191,0,0.0
4,1,2025-08-11 12:25:00-05:00,8.0,0.006467,0.070417,1.0,0.1925,0.170088,0.1543,0.170673,0.015096,0.1582,0.1589,0,0.0


Construir dataset “sucio” y “limpio”

In [28]:
# Dataset "sucio": solo exigimos que exista y_next
df_dirty = df_dataset.copy()
print("Dataset sucio:", df_dirty.shape)

# Dataset "limpio": calidad 0 en ventana y en próxima medición
df_clean = df_dataset[
    (df_dataset["indicador_calidad"] == 0)
    & (df_dataset["quality_next"] == 0)
].copy()

print("Dataset limpio:", df_clean.shape)


Dataset sucio: (120, 15)
Dataset limpio: (120, 15)


Separar features y target

In [29]:
# Columnas que no son features
non_feature_cols = {
    "despliegue_id",
    "ts_utc",
    "valor",              # valor actual
    "y_next",
    "indicador_calidad",
    "quality_next",
}

feature_cols = [c for c in df_clean.columns if c not in non_feature_cols]

print("Features usadas:", feature_cols)

X_clean = df_clean[feature_cols].values
y_clean = df_clean["y_next"].values
baseline_clean = df_clean["valor"].values   # baseline: próxima medición ≈ valor actual

# Para comparación, también preparamos el dataset sucio
X_dirty = df_dirty[feature_cols].values
y_dirty = df_dirty["y_next"].values
baseline_dirty = df_dirty["valor"].values


Features usadas: ['count_samples', 'fft_energy_total', 'fft_peak_amp', 'fft_peak_bin', 'max', 'mean', 'min', 'rms_window', 'std']


Split train/test respetando el orden temporal

In [30]:
def train_test_split_time_series(X, y, baseline, train_ratio=0.7):
    n = len(y)
    n_train = int(n * train_ratio)

    X_train = X[:n_train]
    y_train = y[:n_train]
    X_test  = X[n_train:]
    y_test  = y[n_train:]
    baseline_test = baseline[n_train:]

    return X_train, X_test, y_train, y_test, baseline_test

Xc_tr, Xc_te, yc_tr, yc_te, basec_te = train_test_split_time_series(
    X_clean, y_clean, baseline_clean, train_ratio=0.7
)

Xd_tr, Xd_te, yd_tr, yd_te, based_te = train_test_split_time_series(
    X_dirty, y_dirty, baseline_dirty, train_ratio=0.7
)

len(yc_tr), len(yc_te), len(yd_tr), len(yd_te)


(84, 36, 84, 36)

Definir modelos y métricas

In [31]:
def evaluate_regression(y_true, y_pred, label="modelo"):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print(f"[{label}] MAE = {mae:.4f}, RMSE = {rmse:.4f}")
    return mae, rmse


In [32]:
# Modelo 1: Baseline (persistencia) → y_pred = valor actual
# No necesita entrenamiento

# Modelo 2: Regresión lineal con escalado
lin_reg = Pipeline([
    ("scaler", StandardScaler()),
    ("reg", LinearRegression()),
])

# Modelo 3: Random Forest
rf_reg = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
    n_jobs=-1,
)


Entrenar y evaluar en dataset LIMPIO

In [33]:
print("=== ESCENARIO LIMPIO (quality_code == 0) ===\n")

# Baseline
evaluate_regression(yc_te, basec_te, label="Baseline (valor actual)")

# Linear Regression
lin_reg.fit(Xc_tr, yc_tr)
yc_pred_lin = lin_reg.predict(Xc_te)
evaluate_regression(yc_te, yc_pred_lin, label="LinearRegression")

# Random Forest
rf_reg.fit(Xc_tr, yc_tr)
yc_pred_rf = rf_reg.predict(Xc_te)
evaluate_regression(yc_te, yc_pred_rf, label="RandomForest")


=== ESCENARIO LIMPIO (quality_code == 0) ===

[Baseline (valor actual)] MAE = 0.0027, RMSE = 0.0034
[LinearRegression] MAE = 0.0069, RMSE = 0.0091
[RandomForest] MAE = 0.0070, RMSE = 0.0096


(np.float64(0.007025527777777796), np.float64(0.009554438275685776))

Entrenar y evaluar en dataset SUCIO (para comparar)

In [34]:
print("=== ESCENARIO SUCIO (sin filtrar calidad) ===\n")

# Baseline
evaluate_regression(yd_te, based_te, label="Baseline (valor actual)")

# Linear Regression
lin_reg.fit(Xd_tr, yd_tr)
yd_pred_lin = lin_reg.predict(Xd_te)
evaluate_regression(yd_te, yd_pred_lin, label="LinearRegression")

# Random Forest
rf_reg.fit(Xd_tr, yd_tr)
yd_pred_rf = rf_reg.predict(Xd_te)
evaluate_regression(yd_te, yd_pred_rf, label="RandomForest")


=== ESCENARIO SUCIO (sin filtrar calidad) ===

[Baseline (valor actual)] MAE = 0.0027, RMSE = 0.0034
[LinearRegression] MAE = 0.0069, RMSE = 0.0091
[RandomForest] MAE = 0.0070, RMSE = 0.0096


(np.float64(0.0070255277777778), np.float64(0.009554438275685782))