In [8]:
# etl_clean_load.py
import os
import re
import unicodedata
from datetime import datetime
import pandas as pd
import numpy as np

# Directory that holds both the raw input and the cleaned output.
DATA_DIR = "data"
# Raw CSV consumed by this script.
RAW_PATH = os.path.join(DATA_DIR, "raw.csv")
# Pickle file the cleaned DataFrame is written to at the end of the script.
CLEAN_PATH = os.path.join(DATA_DIR, "clean.pkl")

# Create the data directory if it is missing (no error when it already exists).
os.makedirs(DATA_DIR, exist_ok=True)

# Fail early when the raw CSV is absent; the message tells the user to run
# generate_synthetic.py first.
if not os.path.exists(RAW_PATH):
    raise FileNotFoundError(f"No se encontró {RAW_PATH}. Ejecuta primero generate_synthetic.py")

# ------------------ helpers ------------------

def strip_accents(s: str) -> str:
    """Return ``str(s)`` with accents and other combining marks removed.

    The text is decomposed with NFKD normalization and every character that
    ``unicodedata.combining`` reports as a combining mark is discarded.
    Non-string input is converted with ``str`` first.
    """
    decomposed = unicodedata.normalize("NFKD", str(s))
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)

def parse_amount(x):
    """Convert a raw amount value to ``float``.

    Every character other than digits, ``.``, ``,`` and ``-`` is dropped
    first (currency symbols, spaces, letters).  When both ``.`` and ``,``
    are present, the right-most one is taken as the decimal separator and
    the other one as a thousands separator, so ``"1,234.56"`` and
    ``"1.234,56"`` both yield ``1234.56``.  When only commas are present,
    each one is replaced by ``.``.

    Args:
        x: Raw value; anything ``pd.isna`` reports as missing is accepted.

    Returns:
        The parsed ``float``, or ``np.nan`` when the value is missing or
        cannot be parsed.
    """
    if pd.isna(x):
        return np.nan
    s = re.sub(r"[^0-9.,-]", "", str(x))  # drop currency symbols, spaces, etc.
    if "," in s and "." in s:
        # Mixed separators: blindly turning "," into "." would leave several
        # dots and make float() fail, so remove the grouping separator.
        if s.rfind(",") > s.rfind("."):
            s = s.replace(".", "").replace(",", ".")
        else:
            s = s.replace(",", "")
    else:
        s = s.replace(",", ".")
    try:
        return float(s)
    except ValueError:
        return np.nan

def parse_date(x):
    """Parse a date written as ``YYYY-MM-DD``, ``DD-MM-YYYY`` or ``DD/MM/YYYY``.

    The value is converted with ``str`` and surrounding whitespace is
    removed before the formats are tried in that order; the first format
    that matches wins.

    Args:
        x: Raw value (typically a string read from the CSV).

    Returns:
        A ``datetime`` for the first matching format, or ``pd.NaT`` when no
        format matches (which includes missing values).
    """
    # strptime rejects leading/trailing whitespace, so strip it once up front.
    text = str(x).strip()
    for fmt in ("%Y-%m-%d", "%d-%m-%Y", "%d/%m/%Y"):
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            # Wrong layout for this format; try the next one.
            continue
    return pd.NaT

# ------------------ raw load ------------------

# Read every column as text so each field is parsed explicitly below.
df = pd.read_csv(RAW_PATH, dtype=str)

# ------------------ normalization ------------------

df.columns = [c.strip().lower() for c in df.columns]

# The columns are already text (dtype=str), so the .str accessor is applied
# directly.  Unlike astype(str), it leaves missing values as NaN instead of
# turning them into the literal string "nan" in the cleaned output.
df["name"] = df["name"].str.strip()
df["email"] = df["email"].str.strip().str.lower()

# Accent-free copy used only for validation; the stored email is not modified.
email_for_validation = df["email"].apply(strip_accents)

# Upper-case country codes; placeholder texts and empty strings become NaN.
df["country"] = (
    df["country"].str.strip().str.upper()
    .replace({"NONE": np.nan, "NAN": np.nan, "": np.nan})
)

# Numeric user_id; values that cannot be converted become NaN.
df["user_id"] = pd.to_numeric(df["user_id"], errors="coerce")

# Amounts and dates go through the tolerant parsers defined above.
df["amount"] = df["amount"].apply(parse_amount)
df["created_at"] = df["created_at"].apply(parse_date)

# ------------------ validations ------------------

# Permissive pattern: exactly one "@", no whitespace anywhere, and a final
# dot-separated label of at least two characters.  Non-ASCII characters are
# accepted on both sides of the "@".
email_re = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]{2,}$")


def _is_valid_email(candidate):
    """Return True when *candidate* is present and matches ``email_re``."""
    if pd.isna(candidate):
        return False
    return email_re.match(str(candidate)) is not None


df["email_valid"] = email_for_validation.apply(_is_valid_email)

# Country catalogue: any code outside it is replaced by NaN.
valid_countries = {"PE", "MX", "CO", "AR"}


def _known_country_or_nan(code):
    """Return *code* when it belongs to the catalogue, otherwise NaN."""
    if code in valid_countries:
        return code
    return np.nan


df["country"] = df["country"].apply(_known_country_or_nan)

# Drop repeated records, keeping the first row seen for each
# (user_id, email, created_at) combination.
dedup_keys = ["user_id", "email", "created_at"]
df = df.drop_duplicates(subset=dedup_keys, keep="first")

# ------------------ quality rules ------------------

# Rows must always carry a user_id and a creation date.
has_required_fields = df["user_id"].notna() & df["created_at"].notna()

# Strict rule: required fields plus a valid email.
filtered = df[has_required_fields & df["email_valid"]]

# Fallback: if the strict rule leaves nothing, drop the email requirement so
# the output is not empty.
if filtered.empty:
    print("[AVISO] 0 filas tras filtro estricto. Relajando condición de email_valid…")
    filtered = df[has_required_fields]

# ------------------ save ------------------

# Persist the cleaned rows as a pickle and report how many were written.
filtered.to_pickle(CLEAN_PATH, protocol=4)
print(f"Datos limpios guardados en {CLEAN_PATH} — filas: {len(filtered)}")


Datos limpios guardados en data\clean.pkl — filas: 414
