In [None]:
# -*- coding: utf-8 -*-
"""
make_h_exposure.py
Convenience script: calls ``compute_h_exposure`` (imported from the
``competitive_exposure`` module) and prints where the output CSV was written.
"""
import sys
from pathlib import Path

# Make the directory two levels above the current working directory importable
# so that the ``src`` package (``src.conf``) can be resolved.
notebook_path = Path().resolve()
sys.path.append(str(notebook_path.parent.parent))

from competitive_exposure import compute_h_exposure
from src.conf import config

if __name__ == "__main__":
    # Build the paths with pathlib instead of concatenating a hard-coded "\\":
    # the separator is then chosen by the OS, so the script is not Windows-only.
    # The values are converted back to ``str`` so compute_h_exposure receives
    # the same argument types as before.
    DATA_DIR = Path(config.DATA_DIR)
    TRAIN_CSV = str(DATA_DIR / "train_v2.csv")
    ITEMS_CSV = str(DATA_DIR / "items.csv")
    OUT_CSV = str(DATA_DIR / "feature_h_exposure_v2.csv")  # or "data/features_h_exposure.csv.gz"
    CHUNK_ROWS = 1_500_000

    # Option A: binary flag = at least one competitor is on promotion
    compute_h_exposure(
        train_csv=TRAIN_CSV,
        items_csv=ITEMS_CSV,
        out_csv=OUT_CSV,
        chunk_rows=CHUNK_ROWS,
        bin_mode="any",          # "any" or "threshold"
        bin_tau=0.30,            # only applies when bin_mode="threshold"
        keep_class_in_output=False,
        compression=None,        # e.g. "gzip" for OUT_CSV + ".gz"
    )

    print(f"Listo. CSV guardado en: {OUT_CSV}")

In [1]:
from pathlib import Path
import sys

# --- Locate the repo root, using the notebook's working directory as reference ---
NB_DIR = Path.cwd()  # when the notebook is opened from src/preprocess_data, this is that folder


def _first_existing(candidates, default=None):
    """Return the first path in ``candidates`` that exists on disk, else ``default``."""
    return next((c for c in candidates if c.exists()), default)


if (NB_DIR / "preprocess.ipynb").exists():
    SRC_DIR = NB_DIR
else:
    # The notebook was started from another cwd: walk up from the cwd looking
    # for <ancestor>/src/preprocess_data, falling back to the cwd itself.
    SRC_DIR = _first_existing(
        [p / "src" / "preprocess_data" for p in [NB_DIR, *NB_DIR.parents]],
        default=NB_DIR,
    )

REPO_ROOT = SRC_DIR.parents[1] if SRC_DIR.name == "preprocess_data" else SRC_DIR.parent

# SRC_DIR: modules placed next to the notebook; REPO_ROOT: modules left at the
# repo root. Guarded so that re-running this cell does not pile up duplicate
# sys.path entries.
for _dir in (SRC_DIR, REPO_ROOT):
    if str(_dir) not in sys.path:
        sys.path.append(str(_dir))

# --- Input/output paths ---
# The already-computed H file (tried with and without the leading dot).
H_CANDIDATES = [
    REPO_ROOT / "data" / "raw_data" / "feature_h_exposure.csv",
    REPO_ROOT / ".data" / "raw_data" / "feature_h_exposure.csv",
]
H_CSV = _first_existing(H_CANDIDATES)
if H_CSV is None:
    raise FileNotFoundError("No encontré feature_h_exposure.csv en data/raw_data o .data/raw_data")

# The remaining raw inputs were previously hard-wired to ".data/raw_data" even
# though H is searched in both folders. Probe both here as well, preferring the
# original ".data" location; when neither exists the original path is returned
# unchanged, so downstream behaviour for missing files stays the same.
_RAW_DIRS = (REPO_ROOT / ".data" / "raw_data", REPO_ROOT / "data" / "raw_data")


def _locate_raw(filename):
    """Return the path of ``filename`` in the first raw-data folder that has it."""
    candidates = [raw_dir / filename for raw_dir in _RAW_DIRS]
    return _first_existing(candidates, default=candidates[0])


TRAIN_CSV  = _locate_raw("train_v2.csv")
ITEMS_CSV  = _locate_raw("items.csv")
STORES_CSV = _locate_raw("stores.csv")

OUT_DIR = REPO_ROOT / "data" / "processed"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Last expression of the cell: display the resolved paths.
H_CSV, TRAIN_CSV, ITEMS_CSV, STORES_CSV, OUT_DIR

(WindowsPath('d:/repos/cannibalization_reatail/.data/raw_data/feature_h_exposure.csv'),
 WindowsPath('d:/repos/cannibalization_reatail/.data/raw_data/train_v2.csv'),
 WindowsPath('d:/repos/cannibalization_reatail/.data/raw_data/items.csv'),
 WindowsPath('d:/repos/cannibalization_reatail/.data/raw_data/stores.csv'),
 WindowsPath('d:/repos/cannibalization_reatail/data/processed'))

In [None]:
# Importa el módulo (colócalo en src/preprocess_data o en la raíz del repo)
import select_pairs_and_donors as spd

# (Optional) override the module-level parameters of `spd` without editing its file.
# NOTE(review): the captured log reports N_WORKERS=6; if those workers are
# separate processes that re-import the module (e.g. the "spawn" start method),
# they may not see these overrides -- confirm against select_pairs_and_donors.
_spd_overrides = {
    "N_CANNIBALS": 4,
    "N_VICTIMS_PER_I": 5,
    "N_DONORS_PER_J": 20,
    "PRE_DAYS": 60,
    "PRE_GAP": 7,
    "TREAT_MAX": 14,
    "POST_DAYS": 30,
    "MIN_ITEM_OBS": 200,
    "H_SD_MIN": 0.004,
    "P_ANY_MIN": 0.02,
    "P_ANY_MAX": 0.98,
    "P_PROMO_I_MIN": 0.03,
    "P_PROMO_I_MAX": 0.25,
    "P_PROMO_J_MAX": 0.10,
}
for _param, _value in _spd_overrides.items():
    setattr(spd, _param, _value)

# Run the selection; every path is handed over as a plain string.
pairs_path, donors_path = spd.select_pairs_and_donors(
    H_csv=str(H_CSV),
    train_csv=str(TRAIN_CSV),
    items_csv=str(ITEMS_CSV),
    stores_csv=str(STORES_CSV),
    outdir=str(OUT_DIR),
)



[2025-10-13 06:23:30][MainProcess][INFO] === Inicio select_pairs_and_donors (optimizado V2) ===
[2025-10-13 06:23:30][MainProcess][INFO] Parámetros: N_WORKERS=6, CHUNK_H=4,000,000, CHUNK_TRAIN=4,000,000
[2025-10-13 06:23:30][MainProcess][INFO] ENV sampling: SPD_SAMPLE_CI_PER_SC=50, SPD_SAMPLE_FRAC_CI=0.25, SPD_MAX_TASKS_SCORE=20000
[2025-10-13 06:23:30][MainProcess][INFO] [PROGRESS] Inicializando insumos ...
[2025-10-13 06:23:30][MainProcess][INFO] Leyendo items desde d:\repos\cannibalization_reatail\.data\raw_data\items.csv ...
[2025-10-13 06:23:30][MainProcess][INFO] Items: 4,100 filas (0.0 MB) en 0.01s
[2025-10-13 06:23:30][MainProcess][INFO] Leyendo stores desde d:\repos\cannibalization_reatail\.data\raw_data\stores.csv ...
[2025-10-13 06:23:30][MainProcess][INFO] Stores: 54 filas (0.0 MB) en 0.00s
[2025-10-13 06:23:30][MainProcess][INFO] H no contiene 'class'. Generando archivo temporal con 'class' ...
[2025-10-13 06:23:45][MainProcess][INFO] [PROGRESS] Anexando 'class' a H: 4,000

In [None]:
# In your main script:
import sys
from pathlib import Path
import pandas as pd

# Make the directory two levels above the current working directory importable
# so that the ``src`` package can be resolved.
notebook_path = Path().resolve()
sys.path.append(str(notebook_path.parent.parent))

from src.conf import config
from src.preprocess_data.feature_engineering import build_and_chunk

# Join paths with pathlib instead of concatenating a hard-coded "\\": the
# separator is then chosen by the OS, so the cell is not Windows-only.
_data_dir = Path(config.DATA_DIR)

# Key expected in `raw` -> CSV file name inside config.DATA_DIR.
# Keys and read order are the same as before.
_raw_files = {
    "ventas": "train.csv",
    "items": "items.csv",
    "stores": "stores.csv",
    "trans": "transactions.csv",
    "oil": "oil.csv",
    "hol": "holidays_events.csv",
}
raw = {key: pd.read_csv(_data_dir / filename) for key, filename in _raw_files.items()}

# Per the original note, the result looks like
# {'A': dfA, 'B': dfB, 'C': dfC, 'merged_core_only': df}
chunks = build_and_chunk(raw)

# Full daily panel (original note: recombined cores, without inconsistencies)
panel_daily = chunks["merged_core_only"]

In [None]:
# Debug: display the string form of notebook_path.parent.parent, i.e. the value
# that the setup cells append to sys.path.
str(notebook_path.parent.parent)

In [None]:
# Debug: display notebook_path (set from Path().resolve(), i.e. the resolved
# current working directory at the time the setup cell ran).
notebook_path

In [None]:

import os
import sys
from pathlib import Path

import pandas as pd

# Make the directory two levels above the current working directory importable
# before the project imports below are resolved.
notebook_path = Path().resolve()
sys.path.append(str(notebook_path.parent.parent))

from src.preprocess_data.competitive_exposure import build_competitive_exposure
from src.conf import config

# 1) Base panel, read from the weekly panel CSV in config.OUTPUT_DIR
#    (alternative left by the author: panel = build_full_panel(raw))
_panel_in = os.path.join(config.OUTPUT_DIR, "panel_semanal.csv")
panel = pd.read_csv(_panel_in)

# 2) Competitive exposure (per the original note, this is expected to add
#    E_cat_isw and E_bin_isw)
_exposure_options = {
    "scope_col": "family",              # or "class" for a more granular scope
    "exclude_self": True,               # "pure" cannibalization
    "theta_E": config.THETA_E_CAT,      # e.g. 0.2
    "compute_delta_version": True,      # original note: adds E_cat_delta_isw besides E_cat_isw
}
panel = build_competitive_exposure(panel, **_exposure_options)

# Persist the enriched panel next to the input, without the index column.
_panel_out = os.path.join(config.OUTPUT_DIR, "panel_semanal_competitive.csv")
panel.to_csv(_panel_out, index=False)
