In [13]:
# ============================================================
# Limpieza y preparación de datos de calidad del aire (Colab)
# ============================================================
# Requisitos (en una celda aparte si hace falta):
# !pip install numbers-parser pandas numpy openpyxl xlrd scikit-learn pyjanitor python-dateutil

import os
import re
import math
import warnings
from typing import Dict, Tuple, List

import numpy as np
import pandas as pd
from dateutil import parser as dtparser
from sklearn.preprocessing import StandardScaler, MinMaxScaler, KBinsDiscretizer
from sklearn.impute import SimpleImputer
from IPython.display import display
from pandas.api.types import (
    is_numeric_dtype,
    is_datetime64_any_dtype,
    is_bool_dtype,
    is_categorical_dtype,
)

warnings.filterwarnings("ignore", category=FutureWarning)

# =========================================
# 0) Parámetros del usuario / rutas base
# =========================================
PATH_DATA_NUMBERS = "/content/DATOS HISTORICOS 2023_2024_TODAS ESTACIONES_ITESM.numbers"
PATH_DATA_XLSX    = "/content/DATOS HISTORICOS 2023_2024_TODAS ESTACIONES_ITESM.xlsx"  # usando Excel
PATH_ETIQUETAS    = "/content/Etiquetas.xlsx"

# Si tu archivo está en .numbers, usa READ_NUMBERS=True. Si ya exportaste a .xlsx, pon False.
READ_NUMBERS = False  # ← ahora que cambiaste a Excel

# ¿Crear variables dummy para categóricas?
CREATE_DUMMIES = True

# ¿Estrategia de imputación para NA al final del paso de limpieza?
#   None            -> no imputar (se deja NaN tras interpolar y ffill/bfill)
#   "median"        -> imputación por mediana
#   "mean"          -> imputación por media
IMPUTE_STRATEGY = None

# ¿Escalado de numéricas?
#   None | "standard" | "minmax"
SCALING = "standard"

# ¿Binning (discretización) de numéricas seleccionadas?
#   None | "quantile" | "uniform"   (usa KBinsDiscretizer)
BINNING = None
N_BINS  = 5

# Columnas esperadas de mediciones (nombres estándar que forzaremos)
EXPECTED_COLS = [
    "PM10","PM2.5","O3","NO","NO2","NOx","SO2","CO","RH","WS","TEMP","SR","BP","WD","RAINF"
]

# Nombre de columna de tiempo
TIME_COL = "DATETIME"

# =========================================
# 1) RANGOS por año (del PDF)
# =========================================
RANGOS_POR_ANIO: Dict[int, Dict[str, Tuple[float, float]]] = {
    2020: {"PM10":(0,800),"PM2.5":(0,205.94),"O3":(0,153),"NO":(0,500),"NO2":(0,200),"NOx":(0,500),
           "SO2":(0,200),"CO":(0,20),"RH":(0,100),"WS":(0,75),"TEMP":(0,41),"SR":(0,1),"BP":(690,750),
           "WD":(0,360),"RAINF":(0,30)},
    2021: {"PM10":(0,800),"PM2.5":(0,325),"O3":(0,175),"NO":(0,350),"NO2":(0,100),"NOx":(0,400),
           "SO2":(0,300),"CO":(0,10),"RH":(0,100),"WS":(0,40),"TEMP":(-6.5,45),"SR":(0,1),
           "BP":(690,740),"WD":(0,360),"RAINF":(0,80)},
    2022: {"PM10":(0,999),"PM2.5":(0,450),"O3":(0,160),"NO":(0,400),"NO2":(0,175),"NOx":(0,420),
           "SO2":(0,200),"CO":(0,8),"RH":(0,100),"WS":(0,35),"TEMP":(-5,45),"SR":(0,1.25),
           "BP":(700,740),"WD":(0,360),"RAINF":(0,25)},
    2023: {"PM10":(0,900),"PM2.5":(0,800),"O3":(0,175),"NO":(0,500),"NO2":(0,175),"NOx":(0,500),
           "SO2":(0,250),"CO":(0,14),"RH":(0,100),"WS":(0,40),"TEMP":(0,45),"SR":(0,1),
           "BP":(690,740),"WD":(0,360),"RAINF":(0,70)},
    2024: {"PM10":(0,999),"PM2.5":(0,999),"O3":(0,180),"NO":(0,400),"NO2":(0,130),"NOx":(0,500),
           "SO2":(0,150),"CO":(0,18),"RH":(0,100),"WS":(0,38),"TEMP":(-4,45.5),"SR":(0,1.26),
           "BP":(687.5,740),"WD":(0,360),"RAINF":(0,50)},
    2025: {"PM10":(0,820),"PM2.5":(0,350),"O3":(0,185),"NO":(0,350),"NO2":(0,175),"NOx":(0,400),
           "SO2":(0,405),"CO":(0,10),"RH":(0,100),"WS":(0,40),"TEMP":(-4.5,45),"SR":(0,1.2),
           "BP":(688,740),"WD":(0,360),"RAINF":(0,25)}
}

# =========================================
# 2) Utilidades
# =========================================

def try_parse_datetime(x):
    if pd.isna(x):
        return pd.NaT
    if isinstance(x, pd.Timestamp):
        return x
    try:
        return pd.to_datetime(x, errors="coerce")
    except Exception:
        try:
            return pd.to_datetime(dtparser.parse(str(x)), errors="coerce")
        except Exception:
            return pd.NaT

def coerce_numeric(series: pd.Series) -> pd.Series:
    """Convierte strings '1,234.5' o '1.234,5' y otros a float; deja NaN si no se puede."""
    s = series.astype(str).str.replace(r"[^\d\-\.,]", "", regex=True).str.strip()
    # Si hay comas y puntos: asume 1,234.56 (inglés) → quita comas
    s = pd.Series(np.where(s.str.contains(",") & s.str.contains(r"\."), s.str.replace(",", "", regex=False), s))
    # Si hay comas pero no puntos: asume decimal con coma (1.234,56 → 1234.56)
    s = pd.Series(np.where((~pd.isna(s)) & (pd.Series(s).str.contains(",") & ~pd.Series(s).str.contains(r"\.")),
                 pd.Series(s).str.replace(".", "", regex=False).str.replace(",", ".", regex=False),
                 s))
    out = pd.to_numeric(pd.Series(s), errors="coerce")
    return out

def standardize_column_names(cols: List[str]) -> List[str]:
    """Normaliza nombres típicos para mapear a EXPECTED_COLS."""
    mapping = {
        r"^pm\s*10$":"PM10",
        r"^pm10$":"PM10",
        r"^pm\s*2\.?5$":"PM2.5",
        r"^pm2\.?5$":"PM2.5",
        r"^o3$":"O3",
        r"^no$":"NO",
        r"^no2$":"NO2",
        r"^nox$":"NOx",
        r"^so2$":"SO2",
        r"^co$":"CO",
        r"^rh$":"RH",
        r"^ws$":"WS",
        r"^(temp|temperature)$":"TEMP",
        r"^sr$":"SR",
        r"^bp$":"BP",
        r"^wd$":"WD",
        r"^(rain|rainf|rainfall|precip.*)$":"RAINF",
        r"^(date|datetime|fecha.*hora|fecha_hora|time.*)$":TIME_COL,
        r"^(estacion|estación|station)$":"STATION",
    }
    new_cols = []
    for c in cols:
        c0 = str(c).strip()
        c1 = re.sub(r"\s+", " ", c0)
        c2 = c1.upper()
        new = c2
        for pat, tgt in mapping.items():
            if re.match(pat, c2, flags=re.IGNORECASE):
                new = tgt
                break
        new_cols.append(new)
    return new_cols

def add_station_if_missing(df: pd.DataFrame, station_name: str) -> pd.DataFrame:
    if "STATION" not in df.columns:
        df["STATION"] = station_name
    return df

def concat_sheets_with_station(reader, sheet_names):
    dfs = []
    for sh in sheet_names:
        try:
            df = reader.parse(sh)
        except Exception:
            df = reader.parse(sh, header=0)
        df.columns = standardize_column_names(list(df.columns))
        df = add_station_if_missing(df, sh)
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True)

def read_numbers_as_dataframe(path_numbers: str) -> pd.DataFrame:
    """Lee un archivo .numbers como tablas y concatena."""
    try:
        from numbers_parser import Document
    except ImportError as e:
        raise RuntimeError("Falta instalar 'numbers-parser'. Ejecuta: !pip install numbers-parser") from e

    doc = Document(path_numbers)
    frames = []
    for sheet in doc.sheets:
        for table in sheet.tables:
            data = table.rows(values_only=True)
            if not data:
                continue
            df = pd.DataFrame(data[1:], columns=data[0]) if len(data) > 1 else pd.DataFrame(columns=data[0])
            df.columns = standardize_column_names(df.columns.tolist())
            df = add_station_if_missing(df, sheet.name)
            frames.append(df)
    if not frames:
        return pd.DataFrame(columns=[TIME_COL, "STATION"] + EXPECTED_COLS)
    return pd.concat(frames, ignore_index=True)

def read_data() -> pd.DataFrame:
    if READ_NUMBERS and os.path.exists(PATH_DATA_NUMBERS):
        df = read_numbers_as_dataframe(PATH_DATA_NUMBERS)
    elif os.path.exists(PATH_DATA_XLSX):
        xls = pd.ExcelFile(PATH_DATA_XLSX)
        df = concat_sheets_with_station(xls, xls.sheet_names)
    else:
        raise FileNotFoundError("No se encontró el archivo .numbers ni el .xlsx. Sube uno de los dos con el nombre esperado.")
    # Normaliza nombres y asegura columnas esperadas
    df.columns = standardize_column_names(df.columns.tolist())
    for c in EXPECTED_COLS:
        if c not in df.columns:
            df[c] = np.nan
    if TIME_COL in df.columns:
        df[TIME_COL] = df[TIME_COL].apply(try_parse_datetime)
    return df

def read_etiquetas():
    """Lee un diccionario de etiquetas si existe; si no, retorna None."""
    if not os.path.exists(PATH_ETIQUETAS):
        return None
    try:
        et = pd.read_excel(PATH_ETIQUETAS)
        return et
    except Exception:
        return None

# =========================================
# 3) 1) Comprensión de los datos (versión robusta)
# =========================================

def describe_variables(df: pd.DataFrame, etiquetas: pd.DataFrame | None) -> pd.DataFrame:
    """
    Devuelve un dataframe con:
        nombre, descripcion, tipo, valores_posibles, nulos
    Soporta numéricas, categóricas, booleanas y fecha/hora.
    """
    summary = []
    for c in df.columns:
        ser = df[c]
        n_null = int(ser.isna().sum())

        if is_datetime64_any_dtype(ser):
            tipo = "Fecha/Hora"
            if ser.notna().any():
                vmin = pd.to_datetime(ser.min())
                vmax = pd.to_datetime(ser.max())
                vals = [vmin.isoformat(), vmax.isoformat()]
            else:
                vals = [np.nan, np.nan]

        elif is_bool_dtype(ser):
            tipo = "Categórico (bool)"
            uniq = ser.dropna().unique().tolist()
            vals = uniq[:20] + (["..."] if len(uniq) > 20 else [])

        elif is_categorical_dtype(ser) or ser.dtype == "object":
            tipo = "Categórico"
            uniq = ser.dropna().unique().tolist()
            vals = uniq[:20] + (["..."] if len(uniq) > 20 else [])

        elif is_numeric_dtype(ser):
            tipo = "Numérico"
            if ser.notna().any():
                vmin = np.nanmin(ser.astype(float))
                vmax = np.nanmax(ser.astype(float))
                vals = [float(vmin), float(vmax)]
            else:
                vals = [np.nan, np.nan]

        else:
            tipo = str(ser.dtype)
            uniq = ser.dropna().unique().tolist()
            vals = uniq[:20] + (["..."] if len(uniq) > 20 else [])

        summary.append({
            "variable": c,
            "descripcion": None,
            "tipo": tipo,
            "valores_posibles": vals,
            "nulos": n_null
        })

    desc_df = pd.DataFrame(summary)

    # Integración opcional con Etiquetas.xlsx
    if etiquetas is not None:
        et = etiquetas.copy()
        if "variable" not in et.columns:
            cand = [col for col in et.columns if "var" in col.lower() or "nombre" in col.lower()]
            if cand:
                et = et.rename(columns={cand[0]: "variable"})
        for candidate, target in [
            ("descripcion","descripcion"),
            ("descripción","descripcion"),
            ("tipo","tipo_usuario"),
            ("valores","valores_usuario"),
            ("valores_posibles","valores_usuario")
        ]:
            if candidate in et.columns and target not in et.columns:
                et = et.rename(columns={candidate: target})
        if "variable" in et.columns:
            desc_df = desc_df.merge(et, on="variable", how="left")
            if "tipo_usuario" in desc_df.columns:
                desc_df["tipo"] = desc_df["tipo_usuario"].fillna(desc_df["tipo"])
            if "valores_usuario" in desc_df.columns:
                desc_df["valores_posibles"] = desc_df["valores_usuario"].combine_first(desc_df["valores_posibles"])
            for col in ["tipo_usuario", "valores_usuario"]:
                if col in desc_df.columns:
                    desc_df.drop(columns=[col], inplace=True)

    return desc_df

def quality_checks(df: pd.DataFrame) -> dict:
    qc = {}
    qc["shape"] = df.shape
    qc["missing_by_col"] = df.isna().sum().sort_values(ascending=False).to_dict()
    qc["has_infinite"] = np.isinf(df.select_dtypes(include=[np.number]).to_numpy()).any()
    qc["duplicated_rows"] = int(df.duplicated().sum())
    return qc

# =========================================
# 4) 2) Preparación de los datos
# =========================================

def select_and_explain(df: pd.DataFrame) -> Tuple[pd.DataFrame, str]:
    """Selecciona columnas de tiempo, estación y variables estándar."""
    keep = [TIME_COL, "STATION"] + EXPECTED_COLS
    keep = [c for c in keep if c in df.columns]
    out = df[keep].copy()
    explanation = (
        "Se seleccionaron las columnas de tiempo (DATETIME), STATION y variables de medición estándar. "
        "Se excluyeron columnas auxiliares para evitar ruido y garantizar comparabilidad."
    )
    return out, explanation

def identify_targets(df: pd.DataFrame) -> List[str]:
    """Columnas objetivo sugeridas (ajústalo a tu análisis)."""
    return [c for c in ["PM2.5","PM10"] if c in df.columns]

def fix_types(df: pd.DataFrame) -> pd.DataFrame:
    # Convierte mediciones a numérico
    for c in EXPECTED_COLS:
        if c in df.columns:
            df[c] = coerce_numeric(df[c])
    # Asegura datetime y variables derivadas de tiempo
    if TIME_COL in df.columns:
        if not is_datetime64_any_dtype(df[TIME_COL]):
            df[TIME_COL] = pd.to_datetime(df[TIME_COL], errors="coerce")
        df["YEAR"] = df[TIME_COL].dt.year
        df["MONTH"] = df[TIME_COL].dt.month
        df["DAY"] = df[TIME_COL].dt.day
        df["HOUR"] = df[TIME_COL].dt.hour
    return df

def apply_yearly_ranges(df: pd.DataFrame) -> pd.DataFrame:
    """Marca como NaN los valores fuera de rango conforme al año de la fila."""
    if "YEAR" not in df.columns:
        return df
    def clip_row(row):
        yr = row["YEAR"]
        ranges = RANGOS_POR_ANIO.get(int(yr) if not pd.isna(yr) else yr, {})
        for col, (lo, hi) in ranges.items():
            if col in row and pd.notna(row[col]):
                v = row[col]
                if (v < lo) or (v > hi):
                    row[col] = np.nan
        return row
    return df.apply(clip_row, axis=1)

def drop_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    before = len(df)
    df = df.drop_duplicates()
    after = len(df)
    print(f"Duplicados eliminados (filas exactas): {before - after}")
    if TIME_COL in df.columns and "STATION" in df.columns:
        before = len(df)
        df = df.sort_values(by=[TIME_COL]).drop_duplicates(subset=["STATION", TIME_COL], keep="first")
        after = len(df)
        print(f"Duplicados por clave (STATION, DATETIME): {before - after}")
    return df

def handle_missing(df: pd.DataFrame) -> pd.DataFrame:
    """
    Manejo de NA:
      - Interpolación temporal por estación para series numéricas (con índice de tiempo).
      - ffill/bfill cortos.
      - Imputación global opcional (media/mediana).
    """
    num_cols = [c for c in df.columns if c in EXPECTED_COLS]
    if TIME_COL in df.columns and "STATION" in df.columns:
        # Operar por estación con índice de tiempo para usar method="time"
        def _interpolate_group(g):
            g = g.sort_values(TIME_COL)
            g_idx = g.set_index(TIME_COL)
            for c in num_cols:
                s = g_idx[c]
                s = s.interpolate(method="time")
                s = s.ffill().bfill()
                g_idx[c] = s
            return g_idx.reset_index()
        df = df.groupby("STATION", group_keys=False).apply(_interpolate_group)

    if IMPUTE_STRATEGY in ("mean","median"):
        imputer = SimpleImputer(strategy=IMPUTE_STRATEGY)
        df[num_cols] = imputer.fit_transform(df[num_cols])

    return df

def handle_categoricals(df: pd.DataFrame) -> pd.DataFrame:
    cat_cols = [c for c in df.columns if df[c].dtype == "object" or str(df[c].dtype).startswith("category")]
    if CREATE_DUMMIES and cat_cols:
        df = pd.get_dummies(df, columns=cat_cols, drop_first=True, dtype=int)
    return df

def iqr_outlier_mask(x: pd.Series, k: float = 1.5):
    q1, q3 = np.nanpercentile(x, [25, 75])
    iqr = q3 - q1
    lo, hi = q1 - k*iqr, q3 + k*iqr
    return (x < lo) | (x > hi)

def handle_outliers(df: pd.DataFrame, k: float = 1.5) -> pd.DataFrame:
    """Manejo de atípicos (IQR) por estación y año. Se marcan como NaN."""
    if "STATION" not in df.columns or "YEAR" not in df.columns:
        return df
    num_cols = [c for c in EXPECTED_COLS if c in df.columns]
    def _proc(group):
        for c in num_cols:
            s = group[c]
            if s.notna().sum() >= 12:
                mask = iqr_outlier_mask(s, k=k)
                group.loc[mask, c] = np.nan
        return group
    return df.groupby(["STATION","YEAR"], group_keys=False).apply(_proc)

# =========================================
# 5) 3) Transformación de Datos
# =========================================

def build_features(df: pd.DataFrame) -> pd.DataFrame:
    """Atributos derivados: componentes del viento y variables de calendario."""
    out = df.copy()
    if "WS" in out.columns and "WD" in out.columns:
        ws = out["WS"].astype(float)
        wd_rad = np.deg2rad(out["WD"].astype(float))
        out["WIND_U"] = -ws * np.sin(wd_rad)
        out["WIND_V"] = -ws * np.cos(wd_rad)
    if TIME_COL in out.columns:
        out["WEEKDAY"] = out[TIME_COL].dt.weekday  # 0=lun
        out["IS_WEEKEND"] = (out["WEEKDAY"] >= 5).astype(int)
    return out

def scale_numeric(df: pd.DataFrame) -> Tuple[pd.DataFrame, dict]:
    if SCALING not in ("standard","minmax"):
        return df, {}
    num_cols = [c for c in df.columns if c in EXPECTED_COLS or c in ["WIND_U","WIND_V"]]
    num_cols = [c for c in num_cols if c in df.columns]
    scaler = StandardScaler() if SCALING=="standard" else MinMaxScaler()
    scaled = df.copy()
    scaled[num_cols] = scaler.fit_transform(scaled[num_cols])
    info = {"scaler": SCALING, "columns": num_cols}
    return scaled, info

def bin_numeric(df: pd.DataFrame, cols: List[str]) -> Tuple[pd.DataFrame, dict]:
    if BINNING not in ("quantile","uniform") or not cols:
        return df, {}
    binner = KBinsDiscretizer(n_bins=N_BINS, encode="ordinal",
                              strategy=("quantile" if BINNING=="quantile" else "uniform"))
    x = df[cols].copy()
    mask = x.notna().all(axis=1)
    x_notna = x[mask]
    out = df.copy()
    out.loc[mask, [f"{c}_BIN" for c in cols]] = binner.fit_transform(x_notna)
    info = {"binner": BINNING, "columns": cols, "bins": N_BINS}
    return out, info

# =========================================
# 6) 4) Reformateo / Exportación
# =========================================

def tidy_and_export(df: pd.DataFrame, outdir="/content/clean"):
    os.makedirs(outdir, exist_ok=True)
    # Exporta completo
    full_path = os.path.join(outdir, "air_quality_clean_full.parquet")
    df.to_parquet(full_path, index=False)
    print(f"✅ Exportado dataset completo: {full_path}")

    # Por año
    if "YEAR" in df.columns:
        for yr, dfg in df.groupby("YEAR"):
            p = os.path.join(outdir, f"air_quality_clean_{int(yr)}.csv")
            dfg.to_csv(p, index=False)
    # Por estación y año
    if "STATION" in df.columns and "YEAR" in df.columns:
        for (st, yr), dfg in df.groupby(["STATION","YEAR"]):
            fname = re.sub(r"[^A-Za-z0-9\-]+","_", str(st))
            p = os.path.join(outdir, f"clean_{fname}_{int(yr)}.csv")
            dfg.to_csv(p, index=False)
    print("📦 Exportaciones por año y por estación listas en:", outdir)

# =========================================
# 7) RUN: pipeline maestro
# =========================================

def main():
    print("=== Cargando datos ===")
    df = read_data()
    print("Dimensión inicial:", df.shape)

    # -------- Comprensión de datos --------
    print("\n=== 1) Comprensión de los datos ===")
    etiquetas = read_etiquetas()
    desc = describe_variables(df, etiquetas)
    print("Dimensión del dataset:", df.shape)
    print("\nDiccionario de variables (primeras filas):")
    display(desc.head(30))

    qc = quality_checks(df)
    print("\nCalidad de datos:")
    print(" - Registros, Columnas:", qc["shape"])
    print(" - Faltantes por columna (top 10):", dict(list(qc["missing_by_col"].items())[:10]))
    print(" - Valores infinitos:", qc["has_infinite"])
    print(" - Filas duplicadas totales:", qc["duplicated_rows"])

    # -------- Preparación --------
    print("\n=== 2) Preparación de los datos ===")
    df, rationale = select_and_explain(df)
    print("Racional de selección:\n", rationale)

    df = fix_types(df)                # tipos + YEAR/MONTH/DAY/HOUR
    df = apply_yearly_ranges(df)      # rangos por año del PDF
    df = drop_duplicates(df)          # duplicados
    df = handle_outliers(df, k=1.5)   # outliers → NaN
    df = handle_missing(df)           # interpolación por estación + ffill/bfill + imputación opcional

    targets = identify_targets(df)
    print("Columnas objetivo sugeridas:", targets)

    df = handle_categoricals(df)      # dummies para categóricas (si aplica)

    # -------- Transformación --------
    print("\n=== 3) Transformación de Datos ===")
    df = build_features(df)

    df_scaled, scale_info = scale_numeric(df)
    if scale_info:
        print("Escalado aplicado:", scale_info)

    bin_cols = [c for c in ["PM2.5","PM10"] if c in df_scaled.columns]
    df_binned, bin_info = bin_numeric(df_scaled, bin_cols)
    if bin_info:
        print("Binning aplicado:", bin_info)

    # -------- Reformateo / Exportación --------
    print("\n=== 4) Reformateo / Exportación ===")
    tidy_and_export(df_binned)

    # Reporte final breve
    print("\n=== Resumen RÁPIDO ===")
    print("Filas/Columnas finales:", df_binned.shape)
    print("NA totales:", int(df_binned.isna().sum().sum()))
    print("Columnas creadas (derivadas):", [c for c in ["WIND_U","WIND_V","WEEKDAY","IS_WEEKEND"] if c in df_binned.columns])
    print("Listo ✅")

# Ejecutar
if __name__ == "__main__":
    main()


=== Cargando datos ===
Dimensión inicial: (13883, 260)

=== 1) Comprensión de los datos ===


  elif is_categorical_dtype(ser) or ser.dtype == "object":


Dimensión del dataset: (13883, 260)

Diccionario de variables (primeras filas):


Unnamed: 0,variable,descripcion,tipo,valores_posibles,nulos
0,UNNAMED: 0,,Numérico,"[nan, nan]",13883
1,UNNAMED: 1,,Categórico,[This document was exported from Numbers. Eac...,13879
2,UNNAMED: 2,,Categórico,"[Numbers Table Name, Table 1]",13880
3,UNNAMED: 3,,Categórico,"[Excel Worksheet Name, Param_horarios_Estacion...",13880
4,STATION,,Categórico,"[Export Summary, Param_horarios_Estaciones]",0
5,DATETIME,,Fecha/Hora,"[2023-01-01T00:00:00, 2024-07-31T23:00:00]",13
6,SURESTE,,Categórico,"[CO, ppm, 2.37, 2.12, 2.05, 2.5, 1.94, 1.35, 1...",196
7,SURESTE.1,,Categórico,"[NO, ppb, 54.5, 38.7, 60.5, 42.3, 10.2, 7.8, 5...",377
8,SURESTE.2,,Categórico,"[NO2, ppb, 32.6, 30.3, 28.8, 29.1, 25.7, 23.1,...",164
9,SURESTE.3,,Categórico,"[NOX, ppb, 87.1, 68.9, 67.4, 89.4, 67.7, 33.2,...",165



Calidad de datos:
 - Registros, Columnas: (13883, 260)
 - Faltantes por columna (top 10): {'RAINF': 13883, 'UNNAMED: 0': 13883, 'UNNAMED: 224': 13883, 'UNNAMED: 16': 13883, 'UNNAMED: 208': 13883, 'UNNAMED: 32': 13883, 'UNNAMED: 192': 13883, 'O3': 13883, 'SO2': 13883, 'UNNAMED: 112': 13883}
 - Valores infinitos: False
 - Filas duplicadas totales: 4

=== 2) Preparación de los datos ===
Racional de selección:
 Se seleccionaron las columnas de tiempo (DATETIME), STATION y variables de medición estándar. Se excluyeron columnas auxiliares para evitar ruido y garantizar comparabilidad.
Duplicados eliminados (filas exactas): 11
Duplicados por clave (STATION, DATETIME): 0
Columnas objetivo sugeridas: ['PM2.5', 'PM10']

=== 3) Transformación de Datos ===
Escalado aplicado: {'scaler': 'standard', 'columns': ['PM10', 'PM2.5', 'O3', 'NO', 'NO2', 'NOx', 'SO2', 'CO', 'RH', 'WS', 'TEMP', 'SR', 'BP', 'WD', 'RAINF', 'WIND_U', 'WIND_V']}

=== 4) Reformateo / Exportación ===
✅ Exportado dataset completo:

  return df.groupby(["STATION","YEAR"], group_keys=False).apply(_proc)
  df = df.groupby("STATION", group_keys=False).apply(_interpolate_group)
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


📦 Exportaciones por año y por estación listas en: /content/clean

=== Resumen RÁPIDO ===
Filas/Columnas finales: (13870, 24)
NA totales: 235790
Columnas creadas (derivadas): ['WIND_U', 'WIND_V', 'WEEKDAY', 'IS_WEEKEND']
Listo ✅
