# Análise Completa - Case iFood: Teste A/B da Estratégia de Cupons

Notebook único para orquestrar as tarefas de execução de *setup*, **ETL** e análise dos dados, integrando os diferentes módulos do repositório de origem:

- Clona/atualiza o repositório do projeto, com as dependências, no Colab
- Instala dependências e faz o **download** dos dados brutos
- Sobe Spark e executa o **ETL** (orders/consumers/restaurants + mapa A/B)
- Mantém `orders_silver` e `users_silver` em memória
- Realiza a análise exploratória dos dados


## Configuração de Ambiente e Download de Dados Brutos

In [None]:
import os, sys, subprocess
from pathlib import Path

# Coordinates of the project repository that this notebook bootstraps.
GITHUB_USER = "silvaniacorreia"
REPO_NAME = "ifood-case-cupons"
REPO_URL = "https://github.com/" + GITHUB_USER + "/" + REPO_NAME + ".git"

# The notebook treats the presence of either environment variable as the
# signal that it is running on Google Colab, and refuses to continue otherwise.
_COLAB_ENV_MARKERS = ("COLAB_RELEASE_TAG", "COLAB_GPU")
assert any(marker in os.environ for marker in _COLAB_ENV_MARKERS), "Este notebook foi preparado para executar no Google Colab."

def run(cmd):
    """Echo a command and execute it.

    Args:
        cmd: Sequence of strings forming the argument vector.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
    """
    rendered = " ".join(cmd)
    print(f"> {rendered}")
    subprocess.run(cmd, check=True)

# 1) clone the repository on the first run, otherwise update the existing checkout
ROOT = Path("/content")
PROJECT_DIR = ROOT / REPO_NAME
if PROJECT_DIR.exists():
    # The git commands below act on the current directory, so enter the
    # checkout before running them.
    os.chdir(PROJECT_DIR)
    for git_args in (
        ["fetch", "--all"],
        ["checkout", "main"],
        ["pull", "--rebase", "origin", "main"],
    ):
        run(["git", *git_args])
else:
    run(["git", "clone", REPO_URL, str(PROJECT_DIR)])
# From here on every relative path is resolved against the project root.
os.chdir(PROJECT_DIR)

# 2) install dependencies and run the programmatic data download
pip_install = [sys.executable, "-m", "pip", "install", "-r", "requirements.txt"]
download_data = [sys.executable, "scripts/download_data.py"]
run(pip_install)
run(download_data)

# 3) make the project importable (e.g. `from src ...`) from this notebook
project_path = str(PROJECT_DIR)
if project_path not in sys.path:
    sys.path.insert(0, project_path)
print("✔️ Bootstrap concluído. Projeto:", PROJECT_DIR)


## Iniciando o Spark

In [None]:
from src.utils import load_settings, get_spark

# Load the project settings and start Spark from their runtime section.
s = load_settings()
spark_settings = s.runtime.spark
spark = get_spark(
    app_name=spark_settings.app_name,
    shuffle_partitions=spark_settings.shuffle_partitions,
    # "conf" is optional in the settings; fall back to no extra options.
    extra_conf=getattr(spark_settings, "conf", {}),
)
print("✔️ Spark ativo - versão:", spark.version)

# Quick smoke test: run a trivial job to confirm the session works.
smoke_df = spark.range(5)
smoke_df.show()


## Análises Pré-Flight

In [None]:
## Pré-flight (checagens dados brutos)
from src.checks import preflight
from pprint import pprint

# Run the raw-data pre-flight in non-strict mode and print a condensed report.
rep = preflight(s.data.raw_dir, strict=False)
print("Pré-flight (resumo):")

preflight_summary = {
    "raw_dir": rep["raw_dir"],
    "orders_format_guess": rep["orders_format_guess"],
}

# Keep only these per-file fields in the summary.
summary_fields = ("exists", "size_bytes", "gzip_ok", "tar_ok")
files_summary = {}
for file_key, file_report in rep["files"].items():
    files_summary[file_key] = {
        field: value
        for field, value in file_report.items()
        if field in summary_fields
    }
preflight_summary["files"] = files_summary

# Show at most the first three A/B CSV candidates.
preflight_summary["ab_csv_candidates"] = rep["ab_csv_candidates"][:3]
pprint(preflight_summary)


## ETL (Extração, Transformação e Carga)

In [None]:
from src import etl, checks
from pyspark.sql import functions as F
import importlib, os

def _window_bound(window, key):
    """Read one bound ("start" or "end") from the experiment-window setting.

    Accepts either a mapping-like object exposing ``.get`` or an object with
    plain attributes. Returns None when the window is unset/empty or the
    bound is missing.
    """
    if not window:
        return None
    if hasattr(window, "get"):
        return window.get(key)
    return getattr(window, key, None)


# 1) read the raw inputs and run the load profile check on them
orders, consumers, restaurants, abmap = etl.load_raw(spark, s.data.raw_dir)
checks.profile_loaded(orders, consumers, restaurants, abmap, n=5)

# 2) conform + joins + experiment window
analysis = s.analysis
# Resolve the optional window once instead of repeating the lookup for each
# bound; an empty/absent analysis section or window yields None for both.
experiment_window = getattr(analysis, "experiment_window", None) if analysis else None
df = etl.clean_and_conform(
    orders, consumers, restaurants, abmap,
    business_tz=getattr(analysis, "business_tz", "America/Sao_Paulo"),
    treat_is_target_null_as_control=getattr(analysis, "treat_is_target_null_as_control", False),
    experiment_start=_window_bound(experiment_window, "start"),
    experiment_end=_window_bound(experiment_window, "end"),
    auto_infer_window=getattr(analysis, "auto_infer_window", True),
)

# 3) build the silver DataFrames used by the rest of the notebook
orders_silver = etl.build_orders_silver(df)
users_silver  = etl.build_user_aggregates(orders_silver)

# Recency is measured against the latest event timestamp observed in the
# data: days from each user's last order to that reference. Users without a
# last order get NULL (no `otherwise` branch).
ref_ts = orders_silver.agg(F.max("event_ts_utc")).first()[0]
users_silver = users_silver.withColumn(
    "recency",
    F.when(F.col("last_order").isNotNull(), F.datediff(F.lit(ref_ts), F.col("last_order")))
)

# 4) optionally persist both silvers as local parquet
SAVE_PARQUET = True
if SAVE_PARQUET:
    # NOTE(review): clearCache() drops every cached table of this session;
    # confirm nothing upstream relies on a cached DataFrame at this point.
    spark.catalog.clearCache()
    orders_silver.coalesce(4).write.mode("overwrite").parquet(f"{s.data.processed_dir}/orders_silver.parquet")
    users_silver.coalesce(4).write.mode("overwrite").parquet(f"{s.data.processed_dir}/users_silver.parquet")

print("orders_silver:", orders_silver.count(), "linhas")
print("users_silver :", users_silver.count(), "linhas")


## Checagem dos Dados

In [None]:
# --- Checks essenciais (nulos, janela, split A/B) ---
from pyspark.sql import functions as F

def nulls_by_col(df):
    """Select, for every column of ``df``, the count of its NULL values.

    Each output column keeps the name of the input column it summarises.
    """
    null_counts = []
    for column_name in df.columns:
        # isNull() cast to int is 1 for NULL and 0 otherwise, so the sum is
        # the number of NULLs in that column.
        null_flag = F.col(column_name).isNull().cast("int")
        null_counts.append(F.sum(null_flag).alias(column_name))
    return df.select(null_counts)

# Date range covered by the orders (UTC event timestamps).
print("Faixa de datas (UTC) em orders_silver:")
orders_silver.agg(F.min("event_ts_utc").alias("min_utc"),
                  F.max("event_ts_utc").alias("max_utc")).show()

# Number of users per value of the A/B flag.
print("Split A/B (users):")
users_silver.groupBy("is_target").count().show()

# NULL count for every column of orders_silver.
print("Nulos em orders_silver:")
nulls_by_col(orders_silver).show(truncate=False)

# Previews: materialise each 5-row sample once, so a Spark failure surfaces
# directly instead of being swallowed and retried by the fallback path.
orders_preview = orders_silver.limit(5).toPandas()
users_preview = users_silver.limit(5).toPandas()
try:
    from IPython.display import display
except ImportError:
    # No IPython available: fall back to plain-text output.
    print("Preview (head) orders_silver:", orders_preview.head())
    print("Preview (head) users_silver :", users_preview.head())
else:
    display(orders_preview)
    display(users_preview)
