# Análise Completa - Case iFood: Teste A/B Estratégia de Cupons

Notebook **único** para executar o *setup* (Colab ou local), **ETL** e análise dos dados.

- Clona/atualiza o repositório (no Colab)
- Instala dependências e faz o **download** dos dados brutos
- Sobe Spark e executa o **ETL** (orders/consumers/restaurants + mapa A/B)
- Mantém `orders_silver` e `users_silver` em memória
- Realiza a análise exploratória dos dados

> Dica: por padrão **não** salva Parquet (evita `winutils.exe` no Windows). Se quiser materializar, ative `SAVE_PARQUET=True` mais abaixo.


## Configuração de Ambiente e Download de Dados Brutos

In [None]:
# --- Bootstrap (Colab + Local) ---
import os, sys, subprocess, shutil
from pathlib import Path

# Repository coordinates; the clone URL is assembled from the two parts.
GITHUB_USER = "silvaniacorreia"
REPO_NAME = "ifood-case-cupons"
REPO_URL = f"https://github.com/{GITHUB_USER}/{REPO_NAME}.git"

# The session counts as Google Colab when either marker variable is present
# in the environment.
IN_COLAB = any(
    marker in os.environ for marker in ("COLAB_RELEASE_TAG", "COLAB_GPU")
)
# NOTE(review): this flag is not read by any visible cell — confirm its
# purpose before relying on it.
PERSIST_ON_DRIVE = False

def run(cmd):
    """Echo a command and execute it.

    Args:
        cmd: Sequence of strings forming the argument vector.

    Raises:
        subprocess.CalledProcessError: If the command exits with a
            non-zero status.
    """
    command_line = " ".join(cmd)
    print(">", command_line)
    subprocess.check_call(cmd)

def find_project_root(start: "Path | None" = None) -> Path:
    """Locate the project root by walking up from a starting directory.

    A directory qualifies as the root when it contains both
    ``requirements.txt`` and ``src/utils.py``.

    Args:
        start: Directory to start from; defaults to the current working
            directory. Relative paths are accepted.

    Returns:
        The nearest qualifying directory (the start itself or one of its
        ancestors), or the resolved start directory when none qualifies.
    """
    # Resolve first: a relative path such as Path(".") has no parents, so
    # without this the upward walk would never leave the start directory.
    origin = (start or Path.cwd()).resolve()
    for candidate in (origin, *origin.parents):
        has_requirements = (candidate / "requirements.txt").exists()
        has_utils = (candidate / "src" / "utils.py").exists()
        if has_requirements and has_utils:
            return candidate
    return origin

if IN_COLAB:
    # 1) Clone the repository on first run, otherwise refresh the checkout
    CONTENT_DIR = Path("/content")
    PROJECT_DIR = CONTENT_DIR / REPO_NAME
    if PROJECT_DIR.exists():
        # git needs to run from inside the existing working tree
        os.chdir(PROJECT_DIR)
        for _git_args in (
            ["fetch", "--all"],
            ["checkout", "main"],
            ["pull", "--rebase", "origin", "main"],
        ):
            run(["git", *_git_args])
    else:
        run(["git", "clone", REPO_URL, str(PROJECT_DIR)])
    os.chdir(PROJECT_DIR)

    # 2) Install dependencies, then download the raw data
    for _py_args in (
        ["-m", "pip", "install", "-r", "requirements.txt"],
        ["scripts/download_data.py"],
    ):
        run([sys.executable, *_py_args])
else:
    # Local run: use the directory located by find_project_root
    PROJECT_DIR = find_project_root(Path.cwd().resolve())
    os.chdir(PROJECT_DIR)

# 3) Put the project directory at the front of sys.path (both environments)
if str(PROJECT_DIR) not in sys.path:
    sys.path.insert(0, str(PROJECT_DIR))

if IN_COLAB:
    print("✔️ Bootstrap concluído (Colab). Projeto:", PROJECT_DIR)
else:
    print("Execução local. Raiz do projeto:", PROJECT_DIR)


## Teste Smoke do Spark

In [None]:
from src.utils import load_settings, get_spark

# Load the project settings and obtain a Spark session configured from them.
s = load_settings()
_spark_conf = s.runtime.spark
spark = get_spark(
    app_name=_spark_conf.app_name,
    shuffle_partitions=_spark_conf.shuffle_partitions,
)
print("✔️ Spark ativo - versão:", spark.version)

# Quick sanity check: build a five-row range and print it.
_smoke_df = spark.range(5)
_smoke_df.show()


## ETL end-to-end (mantém DataFrames em memória)

In [None]:
from src import etl
from pyspark.sql import functions as F
import importlib, os

# 1) Read the raw datasets
orders, consumers, restaurants, abmap = etl.load_raw(spark, s.data.raw_dir)

# 2) Analysis parameters (taken from settings when present)
# NOTE(review): the window is only honoured when it is a plain dict; if the
# settings loader exposes it as an attribute-style object, start/end silently
# stay None — confirm against load_settings.
win = getattr(s.analysis, "experiment_window", None)
start = win.get("start") if isinstance(win, dict) else None
end   = win.get("end")   if isinstance(win, dict) else None
auto  = getattr(s.analysis, "auto_infer_window", True)

# 3) Conform + joins + experiment window
df = etl.clean_and_conform(
    orders, consumers, restaurants, abmap,
    business_tz=getattr(s.analysis, "business_tz", "America/Sao_Paulo"),
    treat_is_target_null_as_control=getattr(s.analysis, "treat_is_target_null_as_control", False),
    experiment_start=start,
    experiment_end=end,
    auto_infer_window=auto,
)

# 4) Silver tables kept in memory.
# Spark DataFrames are lazy: without cache() every action below (max, count,
# write) and every later cell would re-run the whole ETL lineage.
orders_silver = etl.build_orders_silver(df).cache()
users_silver  = etl.build_user_aggregates(orders_silver)

# Recency in days, measured against the latest timestamp observed in the data
ref_ts = orders_silver.agg(F.max("event_ts_utc")).first()[0]
users_silver = users_silver.withColumn(
    "recency",
    F.when(F.col("last_order").isNotNull(), F.datediff(F.lit(ref_ts), F.col("last_order")))
).cache()

# 5) (Optional) save Parquet locally.
# Off by default, as the notebook introduction documents: writing Parquet on
# Windows requires winutils.exe. Set to True to materialise the silver tables.
SAVE_PARQUET = False
if SAVE_PARQUET:
    orders_silver.write.mode("overwrite").parquet(f"{s.data.processed_dir}/orders_silver.parquet")
    users_silver.write.mode("overwrite").parquet(f"{s.data.processed_dir}/users_silver.parquet")

# The counts are the first full actions on each table, so they also populate
# the caches.
print("orders_silver:", orders_silver.count(), "linhas")
print("users_silver :", users_silver.count(), "linhas")


## Checagem dos Dados

In [None]:
# --- Checks essenciais (nulos, janela, split A/B) ---
from pyspark.sql import functions as F

def nulls_by_col(df):
    """Return a one-row DataFrame holding the null count of every column.

    Args:
        df: Spark DataFrame to inspect.

    Returns:
        A DataFrame with the same column names as ``df``, where each value is
        the number of null entries in that column.
    """
    null_counts = []
    for column_name in df.columns:
        # Cast the null flag to 0/1 so that summing it counts the nulls
        null_flag = F.col(column_name).isNull().cast("int")
        null_counts.append(F.sum(null_flag).alias(column_name))
    return df.select(null_counts)

# Date range covered by the orders table
print("Faixa de datas (UTC) em orders_silver:")
orders_silver.agg(F.min("event_ts_utc").alias("min_utc"),
                  F.max("event_ts_utc").alias("max_utc")).show()

# Number of users in each experiment arm
print("Split A/B (users):")
users_silver.groupBy("is_target").count().show()

# Null count per column
print("Nulos em orders_silver:")
nulls_by_col(orders_silver).show(truncate=False)

# Previews: collect the five-row samples once, outside any try block, so a
# Spark or pandas failure surfaces directly instead of being retried.
orders_preview = orders_silver.limit(5).toPandas()
users_preview = users_silver.limit(5).toPandas()

# Only the optional IPython import is guarded; plain print is the fallback
# when rich display is unavailable.
try:
    from IPython.display import display
except ImportError:
    print("Preview (head) orders_silver:", orders_preview.head())
    print("Preview (head) users_silver :", users_preview.head())
else:
    display(orders_preview)
    display(users_preview)
