# 06 - Exportação Final

Este notebook realiza:
1. Seleção de colunas finais
2. Exportação para parquet:
   - consolidated_overral.parquet
   - consolidated_weights.parquet
   - consolidated_context.parquet
   - consolidated_normalized.parquet

In [None]:
import pandas as pd
import json
from pathlib import Path

# Project root and the folder that holds every parquet read or written below
BASE_DIR = Path("c:/jobs/botafogo/v3")
OUTPUT_DIR = BASE_DIR.joinpath("bases", "outputs")

## 1. Carregar Dados

In [None]:
# Intermediate (_temp_) artifacts read by this notebook; both live in OUTPUT_DIR
scored_path = OUTPUT_DIR / "_temp_scouts_scored.parquet"
weights_path = OUTPUT_DIR / "_temp_weights_active.parquet"

# Scored data: the source of every consolidated export below
df = pd.read_parquet(scored_path)
print(f"Dados carregados: {len(df)} jogadores, {len(df.columns)} colunas")

# Weights table, exported as-is further down
df_weights = pd.read_parquet(weights_path)
print(f"Pesos carregados: {len(df_weights)} indicadores")

## 2. Exportar consolidated_overral.parquet

In [None]:
# Per-category score columns: everything in df whose name starts with "score_"
score_cols = [name for name in df.columns if name.startswith("score_")]

# Core columns as defined in the plan, followed by the per-category scores
main_cols = [
    "player_id",
    "competition_id",
    "player_name",
    "competition_name",
    "team_name",
    "primary_position",
    "mapped_position",
    "position_group",
    "position_sub_group",
    "v_current",
    "overall_score",
    "rank_overall",
    "rank_position",
] + score_cols

# Keep only the requested columns that df really has, in the requested order
existing_columns = set(df.columns)
available_main_cols = [name for name in main_cols if name in existing_columns]
print(f"Colunas para overall: {len(available_main_cols)}")

In [None]:
# Build the overall table from the columns that actually exist in df
df_overall = df[available_main_cols].copy()

# Order by overall ranking. The column list is filtered by availability, so
# "rank_overall" may be missing: guard the sort instead of raising KeyError.
# mergesort is a stable algorithm, so rows with tied ranks keep their input
# order and the exported file is reproducible.
if "rank_overall" in df_overall.columns:
    df_overall = df_overall.sort_values("rank_overall", kind="mergesort")
else:
    print("Aviso: coluna 'rank_overall' ausente; ordem original mantida")

print(f"DataFrame overall: {len(df_overall)} linhas, {len(df_overall.columns)} colunas")
print(f"\nColunas: {df_overall.columns.tolist()}")

In [None]:
# Write the overall table without the pandas index
overall_path = OUTPUT_DIR / "consolidated_overral.parquet"
df_overall.to_parquet(overall_path, index=False)
print(f"Exportado: {OUTPUT_DIR / 'consolidated_overral.parquet'}")

## 3. Exportar consolidated_weights.parquet

In [None]:
# Persist the weights table that was loaded above, unchanged, without the index
weights_out_path = OUTPUT_DIR / "consolidated_weights.parquet"
df_weights.to_parquet(weights_out_path, index=False)
print(f"Exportado: {OUTPUT_DIR / 'consolidated_weights.parquet'}")
print(f"  {len(df_weights)} indicadores")

## 4. Exportar consolidated_context.parquet

In [None]:
# Context/metadata columns that lead the export when they exist in df
context_cols = [
    "player_id",
    "competition_id",
    "unique_key",
    "source_file",
    "v_current",
    "player_season_most_recent_match",
]

# Substrings (matched case-insensitively) that mark a column as contextual
context_markers = ("player_", "team_", "competition_", "season")

# Any other contextual column: matches a marker, is not a normalized value
# and is not already part of the overall export (main_cols)
extra_context = [
    c
    for c in df.columns
    if any(marker in c.lower() for marker in context_markers)
    and not c.endswith("_norm")
    and c not in main_cols
]
context_cols.extend(extra_context)

# context_cols can hold repeats (e.g. "player_season_most_recent_match" is
# listed explicitly and also matches a marker). dict.fromkeys drops the
# repeats while keeping first-seen order, so the exported column order is the
# same on every run; a set would make it arbitrary.
available_context_cols = [c for c in dict.fromkeys(context_cols) if c in df.columns]

print(f"Colunas de contexto: {len(available_context_cols)}")

In [None]:
# Slice the context columns into an independent frame and write it out
context_path = OUTPUT_DIR / "consolidated_context.parquet"
df_context = df.loc[:, available_context_cols].copy()
df_context.to_parquet(context_path, index=False)
print(f"Exportado: {OUTPUT_DIR / 'consolidated_context.parquet'}")
print(f"  {len(df_context)} linhas, {len(df_context.columns)} colunas")

## 5. Exportar consolidated_normalized.parquet

In [None]:
# Identifier columns that prefix the normalized export
id_cols = [
    "player_id",
    "competition_id",
    "unique_key",
    "mapped_position",
    "v_current",
]

# Normalized values are the columns whose name ends with "_norm"
norm_cols = [name for name in df.columns if name.endswith("_norm")]

# Identifiers that exist in df come first, then every normalized column
present_id_cols = [name for name in id_cols if name in df.columns]
normalized_cols = present_id_cols + norm_cols
print(f"Colunas normalizadas: {len(norm_cols)}")

In [None]:
# Slice the id + normalized columns into an independent frame and write it out
normalized_path = OUTPUT_DIR / "consolidated_normalized.parquet"
df_normalized = df.loc[:, normalized_cols].copy()
df_normalized.to_parquet(normalized_path, index=False)
print(f"Exportado: {OUTPUT_DIR / 'consolidated_normalized.parquet'}")
print(f"  {len(df_normalized)} linhas, {len(df_normalized.columns)} colunas")

## 6. Limpar Arquivos Temporários

In [None]:
# Gather every entry in OUTPUT_DIR whose name starts with "_temp_" and
# report the count followed by one name per line
temp_files = [*OUTPUT_DIR.glob("_temp_*")]
print(f"Arquivos temporários: {len(temp_files)}")
for f in temp_files:
    print(f"  {f.name}")

In [None]:
# Uncomment to remove the temporary files.
# NOTE: unlink() deletes permanently, and the "_temp_*" pattern also matches
# the two parquet files this notebook reads at the top, so the notebook cannot
# be re-run afterwards until those inputs exist again.
# for f in temp_files:
#     f.unlink()
#     print(f"Removido: {f.name}")
# print("\nArquivos temporários removidos!")

## 7. Resumo Final

In [None]:
# Files this notebook is expected to have produced
final_files = [
    "consolidated_overral.parquet",
    "consolidated_weights.parquet",
    "consolidated_context.parquet",
    "consolidated_normalized.parquet",
]

separator = "=" * 60
bytes_per_mb = 1024 * 1024

print(separator)
print("EXPORTAÇÃO CONCLUÍDA")
print(separator)

for filename in final_files:
    filepath = OUTPUT_DIR / filename

    # Missing file: report it and move on to the next one
    if not filepath.exists():
        print(f"\n{filename}: NÃO ENCONTRADO")
        continue

    # Size on disk, plus a read-back to report the stored row/column counts
    size_mb = filepath.stat().st_size / bytes_per_mb
    df_check = pd.read_parquet(filepath)
    print(f"\n{filename}")
    print(f"  Tamanho: {size_mb:.2f} MB")
    print(f"  Linhas: {len(df_check):,}")
    print(f"  Colunas: {len(df_check.columns)}")

print("\n" + separator)

In [None]:
# Read the main export back and show its first 10 rows, restricted to a few
# columns (the trailing bare expression is what the notebook renders)
print("\nPreview - Top 10 Overall:")
overall_file = OUTPUT_DIR / "consolidated_overral.parquet"
df_final = pd.read_parquet(overall_file)
display_cols = ["player_name", "mapped_position", "overall_score", "rank_overall"]
df_final[display_cols].head(10)