In [1]:
#1) Configuración y utilidades
# Celda 1 — Config y helpers
from pathlib import Path
import shutil, datetime, os, json

# --- Main configuration ---
# Repository root: the notebook is expected to be launched from the repo root.
REPO = Path.cwd()
# Simulate first (True); switch to False to actually touch the filesystem.
DRY_RUN = True
# Day-resolution stamp (YYYYMMDD) used in archive and report names.
STAMP = f"{datetime.datetime.now():%Y%m%d}"
ARCH = REPO.joinpath("archive", f"ARCH_{STAMP}")

# "Golden" folders: the standard layout, as base folder -> sub-folders.
GOLD_DIRS = dict(
    data=["raw", "interim", "curated", "external"],
    outputs=["models", "predictions", "tiles", "visuals", "metrics"],
    reports=["experiments", "logs", "monitoring"],
)

# Things to clean up / relocate (radical simplification approach)
NOTEBOOKS_MIRROR_DIRS = [
    "data",
    "models",
    "monitoring",
    "notebooks",
    "predictions",
    "reports",
    "tiles",
    "visuals",
]
H2O_PATTERNS = [
    "GBM_grid_",
    "StackedEnsemble_",
    "h2o-genmodel.jar",
]
# (source, destination) pairs, all relative to REPO.
MODEL_FOLDERS_TO_MOVE_OUT = [
    ("models/outputs", "outputs/models"),
]
ROOT_MONITORING = ("monitoring", "reports/monitoring")
DATA_LEGACY = [
    ("data/processed", "data/interim"),
    ("data/training", "data/curated"),
]

# Heavy files / artefacts that are usually left over (they get archived)
LARGE_EXTS = [".zip", ".jar", ".pkl"]  # looked for in models/
CATBOOST_ARTIFACTS_DIRS = ["notebooks/catboost_info"]
CATBOOST_ARTIFACTS_GLOBS = ["notebooks/catboost_*"]

# --- utilities ---
# Shared log of planned operations; the helper functions append to it.
actions = list()

def ensure_dirs():
    """Create the standard folder tree and the archive buckets for this run.

    Builds every ``REPO/<base>/<sub>`` listed in ``GOLD_DIRS`` plus the fixed
    sub-folders under ``ARCH``. Directories that already exist are left alone.

    Note: there is no ``DRY_RUN`` check here, so the folders are created even
    when the notebook is only simulating.
    """
    golden = [REPO / base / sub for base, subs in GOLD_DIRS.items() for sub in subs]
    archive_buckets = [
        ARCH / "models_misc",
        ARCH / "notebooks_artifacts" / "catboost",
        ARCH / "outputs_old",
        ARCH / "root_misc",
    ]
    for folder in golden + archive_buckets:
        folder.mkdir(parents=True, exist_ok=True)

def _merge_into(src: Path, dst: Path) -> None:
    """Move ``src`` onto ``dst``, merging when both are real directories.

    ``shutil.move`` places the source *inside* an already existing destination
    directory (``dst/src.name``). To land the contents at ``dst`` itself, the
    children are moved one by one (recursively) and the emptied source
    directory is removed afterwards. Any other combination is delegated to
    ``shutil.move`` unchanged.
    """
    if dst.is_dir() and src.is_dir() and not src.is_symlink():
        for child in sorted(src.iterdir()):
            _merge_into(child, dst / child.name)
        src.rmdir()
    else:
        shutil.move(str(src), str(dst))


def move_path(src: Path, dst: Path):
    """Move ``src`` to ``dst`` and record the operation in ``actions``.

    Args:
        src: Path to move; it must live under ``REPO``. A missing ``src`` is a
            silent no-op (nothing is logged).
        dst: Final location, also under ``REPO``. If it is an existing
            directory and ``src`` is a directory, the contents are merged into
            it instead of nesting ``src`` inside it.

    Raises:
        ValueError: if ``src`` or ``dst`` is not located under ``REPO``
            (raised by ``Path.relative_to``).

    While ``DRY_RUN`` is true the function only logs: it neither creates the
    parent folders of ``dst`` nor moves anything.
    """
    if not src.exists():
        return
    actions.append({"op":"move","from":str(src.relative_to(REPO)),"to":str(dst.relative_to(REPO))})
    if DRY_RUN:
        # Simulation must not leave traces on disk.
        return
    dst.parent.mkdir(parents=True, exist_ok=True)
    _merge_into(src, dst)

def rsync_dir(src: Path, dst: Path, delete_src=True):
    """Copy every file under ``src`` into ``dst`` and optionally remove ``src``.

    Args:
        src: Source directory under ``REPO``; a missing ``src`` is a no-op.
        dst: Destination directory under ``REPO``; the relative structure of
            ``src`` is reproduced below it. Existing files with the same
            relative path are overwritten by ``shutil.copy2``.
        delete_src: When true, ``src`` is removed after the copy (removal
            errors are ignored, as before).

    Every copy and the final removal are appended to ``actions``. While
    ``DRY_RUN`` is true nothing is created, copied or deleted: the function
    only logs what it would do.

    Only regular files are copied, so empty sub-directories of ``src`` are not
    recreated in ``dst``.
    """
    if not src.exists():
        return
    if not DRY_RUN:
        dst.mkdir(parents=True, exist_ok=True)
    # Snapshot the file list up front: stable (sorted) plan order, and the walk
    # cannot be affected by files written during the copy.
    files = sorted(p for p in src.glob("**/*") if p.is_file())
    for p in files:
        target = dst / p.relative_to(src)
        actions.append({"op":"copy","from":str(p.relative_to(REPO)),"to":str(target.relative_to(REPO))})
        if not DRY_RUN:
            target.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(p, target)
    if delete_src:
        actions.append({"op":"rmdir","path":str(src.relative_to(REPO))})
        if not DRY_RUN:
            shutil.rmtree(src, ignore_errors=True)

def add_gitignore_lines(lines):
    """Append the rules from ``lines`` that ``REPO/.gitignore`` does not have yet.

    Args:
        lines: Iterable of ignore rules (strings). A rule is considered present
            when it equals an existing non-blank line after stripping that line.
            Repeated rules in ``lines`` are added only once.

    The rules to add are logged in ``actions`` as one ``append_gitignore``
    entry; nothing is logged when there is nothing to add. While ``DRY_RUN``
    is true the file is not modified.
    """
    gi = REPO / ".gitignore"
    current = gi.read_text() if gi.exists() else ""
    existing = {l.strip() for l in current.splitlines() if l.strip()}
    new_lines = []
    for ln in lines:
        if ln not in existing and ln not in new_lines:
            new_lines.append(ln)
    if not new_lines:
        return
    actions.append({"op":"append_gitignore","lines":new_lines})
    if DRY_RUN:
        return
    with gi.open("a") as f:
        # If the file does not end with a newline, the first appended rule
        # would be glued onto the last existing one and corrupt both.
        if current and not current.endswith("\n"):
            f.write("\n")
        for ln in new_lines:
            f.write(ln+"\n")

In [2]:
#2) Plan de limpieza (definición de operaciones)
# Celda 2 — Definir el plan (no ejecuta nada todavía)

# Start from an empty plan so that re-running this cell does not pile the same
# operations on top of a previous run (the helpers append to this same list).
actions.clear()

# NOTE: ensure_dirs() has no DRY_RUN guard, so the folder skeleton is created
# even while simulating.
ensure_dirs()

# 1) Normalise legacy data folders → standard layout
#    (move_path itself ignores sources that do not exist)
for src_str, dst_str in DATA_LEGACY:
    move_path(REPO / src_str, REPO / dst_str)

# 2) models/outputs → outputs/models (copy contents, then drop the source)
for src_str, dst_str in MODEL_FOLDERS_TO_MOVE_OUT:
    rsync_dir(REPO / src_str, REPO / dst_str, delete_src=True)

# 3) monitoring (repo root) → reports/monitoring
rsync_dir(REPO / ROOT_MONITORING[0], REPO / ROOT_MONITORING[1], delete_src=True)

# 4) notebooks/* mirror folders → merge into the official ones, drop duplicates
mirror_targets = {
    "predictions": REPO / "outputs" / "predictions",
    "tiles": REPO / "outputs" / "tiles",
    "visuals": REPO / "outputs" / "visuals",
    "reports": REPO / "reports",
    "models": REPO / "models",
    "monitoring": REPO / "reports" / "monitoring",
    "data": REPO / "data",
}
for d in NOTEBOOKS_MIRROR_DIRS:
    src_dir = REPO / "notebooks" / d
    if not src_dir.exists():
        continue
    if d == "notebooks":
        # notebooks/notebooks loop → deleted outright (not archived)
        actions.append({"op":"rmdir","path":str(src_dir.relative_to(REPO))})
        if not DRY_RUN:
            shutil.rmtree(src_dir, ignore_errors=True)
        continue
    target = mirror_targets.get(d)
    if target:
        rsync_dir(src_dir, target, delete_src=True)

# 5) CatBoost artefacts → archive
#    Candidates are collected up front (the glob is not consumed while entries
#    are being moved) and de-duplicated: "notebooks/catboost_info" also matches
#    the "notebooks/catboost_*" glob and would otherwise be planned twice.
catboost_archive = ARCH / "notebooks_artifacts" / "catboost"
catboost_candidates = [REPO / d for d in CATBOOST_ARTIFACTS_DIRS]
for pattern in CATBOOST_ARTIFACTS_GLOBS:
    catboost_candidates.extend(sorted(REPO.glob(pattern)))
catboost_seen = set()
for p in catboost_candidates:
    if p in catboost_seen:
        continue
    catboost_seen.add(p)
    move_path(p, catboost_archive / p.name)

# 6) Large H2O/FLAML/PyCaret artefacts in models/ → archive
models_dir = REPO/"models"
if models_dir.exists():
    # Snapshot the listing: entries are moved away while we loop.
    for p in sorted(models_dir.iterdir()):
        if p.is_file() and p.suffix.lower() in LARGE_EXTS:
            move_path(p, ARCH/"models_misc"/p.name)
        # Typical H2O directories (GBM_grid_*, StackedEnsemble_*). The former
        # outer H2O_PATTERNS test was always true ("... or True"), so only this
        # prefix check ever decided anything.
        if p.is_dir() and p.name.startswith(("GBM_grid_", "StackedEnsemble_")):
            move_path(p, ARCH/"models_misc"/p.name)

# 7) Ignore rules
add_gitignore_lines([
    ".DS_Store",
    ".ipynb_checkpoints/",
    "data/",
    "outputs/",
    "mlruns/",
    "archive/",
    "*.log"
])

# 8) Save a preliminary "report" (planned actions)
REPORT = REPO/"docs"/"dev"/f"PLAN_02_2_{STAMP}.json"
if not DRY_RUN:
    # docs/dev may not exist yet; the JSON keeps non-ASCII characters
    # (ensure_ascii=False), so write it as UTF-8 explicitly.
    REPORT.parent.mkdir(parents=True, exist_ok=True)
    REPORT.write_text(json.dumps(actions, indent=2, ensure_ascii=False), encoding="utf-8")
else:
    # In DRY_RUN the plan is shown on screen instead
    print(f"Acciones planificadas (simulación): {len(actions)}")
    for a in actions[:30]:
        print(a)
    if len(actions) > 30:
        print(f"... (+{len(actions)-30} más)")

Acciones planificadas (simulación): 20
{'op': 'rmdir', 'path': 'notebooks/data'}
{'op': 'rmdir', 'path': 'notebooks/reports'}
{'op': 'move', 'from': 'notebooks/catboost_1760613551.673151', 'to': 'archive/ARCH_20251024/notebooks_artifacts/catboost/catboost_1760613551.673151'}
{'op': 'move', 'from': 'notebooks/catboost_1760613430.927209', 'to': 'archive/ARCH_20251024/notebooks_artifacts/catboost/catboost_1760613430.927209'}
{'op': 'move', 'from': 'notebooks/catboost_1760613579.323601', 'to': 'archive/ARCH_20251024/notebooks_artifacts/catboost/catboost_1760613579.323601'}
{'op': 'move', 'from': 'notebooks/catboost_1760613461.38568', 'to': 'archive/ARCH_20251024/notebooks_artifacts/catboost/catboost_1760613461.38568'}
{'op': 'move', 'from': 'notebooks/catboost_1760613474.00322', 'to': 'archive/ARCH_20251024/notebooks_artifacts/catboost/catboost_1760613474.00322'}
{'op': 'move', 'from': 'notebooks/catboost_1760613587.256147', 'to': 'archive/ARCH_20251024/notebooks_artifacts/catboost/catboos

In [3]:
# 3) Ejecutar de verdad (cambiar DRY_RUN = False y correr)
# Celda 3 — Ejecutar
# From this cell on the filesystem is really modified.
DRY_RUN = False

# The same plan is replayed below with executing variants of the helpers
# (the minimal definitions are repeated on purpose). Every operation that is
# actually performed is logged in this list.
actions_exec = list()

def _run_merge_into(src: Path, dst: Path) -> None:
    """Move ``src`` onto ``dst``, merging when both are real directories.

    ``shutil.move`` places the source *inside* an already existing destination
    directory (``dst/src.name``). To land the contents at ``dst`` itself, the
    children are moved one by one (recursively) and the emptied source
    directory is removed afterwards. Any other combination is delegated to
    ``shutil.move`` unchanged.
    """
    if dst.is_dir() and src.is_dir() and not src.is_symlink():
        for child in sorted(src.iterdir()):
            _run_merge_into(child, dst / child.name)
        src.rmdir()
    else:
        shutil.move(str(src), str(dst))


def run_move_path(src: Path, dst: Path):
    """Move ``src`` to ``dst`` for real and log the operation in ``actions_exec``.

    Args:
        src: Path to move, under ``REPO``; a missing ``src`` is a silent no-op.
        dst: Final location, under ``REPO``; its parent folders are created.
            If ``dst`` is an existing directory and ``src`` is a directory, the
            contents are merged into it instead of nesting ``src`` inside it
            (e.g. ``data/processed`` → ``data/interim`` must not end up as
            ``data/interim/processed``).

    Raises:
        ValueError: if ``src`` or ``dst`` is not located under ``REPO``
            (raised by ``Path.relative_to``).
    """
    if not src.exists():
        return
    dst.parent.mkdir(parents=True, exist_ok=True)
    actions_exec.append({"op":"move","from":str(src.relative_to(REPO)),"to":str(dst.relative_to(REPO))})
    _run_merge_into(src, dst)

def run_rsync_dir(src: Path, dst: Path, delete_src=True):
    """Copy every file under ``src`` into ``dst`` for real, then optionally drop ``src``.

    The relative layout below ``src`` is reproduced below ``dst`` (only regular
    files are copied). Each copy, and the final removal when ``delete_src`` is
    true, is logged in ``actions_exec``. A missing ``src`` is a no-op; errors
    while removing ``src`` are ignored.
    """
    if src.exists():
        dst.mkdir(parents=True, exist_ok=True)
        for item in src.glob("**/*"):
            if not item.is_file():
                continue
            destination = dst.joinpath(item.relative_to(src))
            destination.parent.mkdir(parents=True, exist_ok=True)
            entry = {
                "op": "copy",
                "from": str(item.relative_to(REPO)),
                "to": str(destination.relative_to(REPO)),
            }
            actions_exec.append(entry)
            shutil.copy2(item, destination)
        if delete_src:
            actions_exec.append({"op": "rmdir", "path": str(src.relative_to(REPO))})
            shutil.rmtree(src, ignore_errors=True)

# 1) Legacy data folders → standard layout
#    (run_move_path itself ignores sources that do not exist)
for src_str, dst_str in DATA_LEGACY:
    run_move_path(REPO / src_str, REPO / dst_str)

# 2) models/outputs → outputs/models
for src_str, dst_str in MODEL_FOLDERS_TO_MOVE_OUT:
    run_rsync_dir(REPO / src_str, REPO / dst_str, delete_src=True)

# 3) monitoring (repo root) → reports/monitoring
run_rsync_dir(REPO / ROOT_MONITORING[0], REPO / ROOT_MONITORING[1], delete_src=True)

# 4) notebooks/* mirror folders → merge into the official ones
mirror_targets_exec = {
    "predictions": REPO / "outputs" / "predictions",
    "tiles": REPO / "outputs" / "tiles",
    "visuals": REPO / "outputs" / "visuals",
    "reports": REPO / "reports",
    "models": REPO / "models",
    "monitoring": REPO / "reports" / "monitoring",
    "data": REPO / "data",
}
for d in NOTEBOOKS_MIRROR_DIRS:
    src_dir = REPO / "notebooks" / d
    if not src_dir.exists():
        continue
    if d == "notebooks":
        # notebooks/notebooks loop → deleted outright (not archived)
        actions_exec.append({"op":"rmdir","path":str(src_dir.relative_to(REPO))})
        shutil.rmtree(src_dir, ignore_errors=True)
        continue
    target = mirror_targets_exec.get(d)
    if target:
        run_rsync_dir(src_dir, target, delete_src=True)

# 5) CatBoost artefacts → archive
for d in CATBOOST_ARTIFACTS_DIRS:
    p = REPO/d
    run_move_path(p, ARCH/"notebooks_artifacts"/"catboost"/p.name)

for pattern in CATBOOST_ARTIFACTS_GLOBS:
    # Snapshot the matches: entries are moved out of the globbed folder while
    # we loop, so the lazy iterator must not be consumed during the moves.
    for p in sorted(REPO.glob(pattern)):
        run_move_path(p, ARCH/"notebooks_artifacts"/"catboost"/p.name)

# 6) models/: large artefacts and H2O directories → archive
models_dir = REPO/"models"
if models_dir.exists():
    for p in sorted(models_dir.iterdir()):
        if p.is_file() and p.suffix.lower() in LARGE_EXTS:
            run_move_path(p, ARCH/"models_misc"/p.name)
        if p.is_dir() and p.name.startswith(("GBM_grid_", "StackedEnsemble_")):
            run_move_path(p, ARCH/"models_misc"/p.name)

# 7) .gitignore — append the missing rules
gi_lines = [
    ".DS_Store",
    ".ipynb_checkpoints/",
    "data/",
    "outputs/",
    "mlruns/",
    "archive/",
    "*.log"
]
gi = REPO / ".gitignore"
gi_text = gi.read_text() if gi.exists() else ""
existing = {l.strip() for l in gi_text.splitlines() if l.strip()}
with gi.open("a") as f:
    missing = [ln for ln in gi_lines if ln not in existing]
    # If the file does not end with a newline, the first appended rule would be
    # glued onto the last existing one and corrupt both.
    if missing and gi_text and not gi_text.endswith("\n"):
        f.write("\n")
    for ln in missing:
        f.write(ln+"\n")

# 8) Save the execution report
REPORT = REPO/"docs"/"dev"/f"APLICACION_02_2_{STAMP}.json"
# docs/dev may not exist yet; the JSON keeps non-ASCII characters
# (ensure_ascii=False), so write it as UTF-8 explicitly.
REPORT.parent.mkdir(parents=True, exist_ok=True)
with REPORT.open("w", encoding="utf-8") as f:
    json.dump(actions_exec, f, indent=2, ensure_ascii=False)

print(f"✅ Limpieza aplicada. Acciones: {len(actions_exec)}\n→ Informe: {REPORT}")

✅ Limpieza aplicada. Acciones: 20
→ Informe: /Users/ri1965/Desktop/ecobici-automl/docs/dev/APLICACION_02_2_20251024.json


In [4]:
# 4) Post-chequeo (estructura y estado)
# Celda 4 — Verificación rápida
from subprocess import run, PIPE

def run_cmd(cmd):
    """Run ``cmd`` through the shell and return its output as stripped text.

    Args:
        cmd: Command line, interpreted by the shell (``shell=True``); only
            pass trusted, hard-coded strings.

    Returns:
        The command's stdout with surrounding whitespace removed. When the
        command exits with a non-zero status, a ``[exit N] <stderr>`` note is
        appended so that failures are visible instead of looking like empty
        output.
    """
    proc = run(cmd, shell=True, stdout=PIPE, stderr=PIPE, text=True)
    output = proc.stdout.strip()
    if proc.returncode != 0:
        note = f"[exit {proc.returncode}] {proc.stderr.strip()}".rstrip()
        output = f"{output}\n{note}" if output else note
    return output

# Directory overview. Listed in-process with pathlib instead of spawning a
# nested `python` through a shell heredoc, which depends on `python` being on
# PATH and prints nothing when it is not.
# NOTE: despite the "nivel 2" label, only the top-level folders of the current
# working directory are listed (same as before).
print("Árbol (nivel 2):")
for folder in sorted(d for d in Path(".").iterdir() if d.is_dir()):
    print("├──", folder.name)

print("\nGit status:")
print(run_cmd("git status"))

# First 120 lines of the short status including ignored paths; trimmed in
# Python so that no external `sed` is needed.
print("\nIgnorados (resumen):")
print("\n".join(run_cmd("git status --ignored -s").splitlines()[:120]))

Árbol (nivel 2):
├── .git
├── .ipynb_checkpoints
├── app
├── archive
├── config
├── data
├── docs
├── envs
├── models
├── notebooks
├── outputs
├── reports
├── scripts
├── src
├── tools

Git status:
En la rama master
Cambios no rastreados para el commit:
  (usa "git add/rm <archivo>..." para actualizar a lo que se le va a hacer commit)
  (usa "git restore <archivo>..." para descartar los cambios en el directorio de trabajo)
	modificados:     Makefile
	modificados:     docs/dev/ESTRUCTURA_ACTUAL.txt
	modificados:     docs/dev/repo_tree.txt
	borrados:        models/GBM_grid_1_AutoML_1_20251015_220355_model_14.zip
	borrados:        models/GBM_grid_1_AutoML_1_20251015_224927_model_1.zip
	borrados:        models/StackedEnsemble_AllModels_2_AutoML_2_20251015_231613.zip
	borrados:        models/best_model_ecobici_log.pkl
	borrados:        models/final_model_pycaret.pkl
	borrados:        models/final_model_pycaret_pipeline.pkl
	borrados:        models/flaml_automl_20251016_082101.pkl
	borrados