In [None]:
"""
ETL Pipeline - PokéAPI
Arquitectura Medallion: Bronze → Silver → Gold
"""

import requests
import pandas as pd
import json
from datetime import datetime
from typing import Dict, List
import time

In [None]:


# ============================================
# CAPA BRONZE - Extracción de datos raw
# ============================================

class BronzeLayer:
    """Extract raw pokémon records from PokéAPI and keep them untransformed."""

    def __init__(self, session=None, timeout: float = 10.0, request_delay: float = 0.1):
        """Configure the extractor.

        Args:
            session: Optional HTTP client exposing ``get(url, timeout=...)``
                and returning objects with ``status_code``, ``json()`` and
                ``raise_for_status()`` (the ``requests`` response interface).
                When omitted, a ``requests.Session`` is opened for the
                duration of each extraction so connections are reused.
            timeout: Seconds to wait for each HTTP response. Without a
                timeout a stalled connection would block the pipeline forever.
            request_delay: Seconds to sleep after each detail request
                (client-side rate limiting). Use 0 to disable.
        """
        self.base_url = "https://pokeapi.co/api/v2"
        # Last successful extraction payload; stays {} until one succeeds.
        self.bronze_data = {}
        self.timeout = timeout
        self.request_delay = request_delay
        self._session = session

    def extract_pokemon_data(self, limit: int = 50) -> Dict:
        """Download the pokémon list and the detail document of each entry.

        Args:
            limit: Maximum number of pokémon requested from the list endpoint.

        Returns:
            A dict with ``extraction_timestamp``, ``source``,
            ``pokemon_count`` and ``raw_data`` (list of detail documents),
            or ``{}`` when a request raised a ``requests`` exception.
            Detail requests answering with a non-200 status are skipped and
            reported, so ``pokemon_count`` may be lower than ``limit``.
        """
        print(f"🔹 BRONZE: Extrayendo datos de {limit} pokémon...")

        try:
            if self._session is not None:
                pokemon_details = self._fetch_details(self._session, limit)
            else:
                # One session for the whole run reuses the TCP connection.
                with requests.Session() as session:
                    pokemon_details = self._fetch_details(session, limit)
        except requests.exceptions.RequestException as e:
            print(f"❌ Error en extracción: {e}")
            return {}

        print("\n✅ BRONZE: Extracción completada")

        self.bronze_data = {
            'extraction_timestamp': datetime.now().isoformat(),
            'source': 'pokeapi.co',
            'pokemon_count': len(pokemon_details),
            'raw_data': pokemon_details
        }

        return self.bronze_data

    def _fetch_details(self, session, limit: int) -> list:
        """Return the detail documents for the first ``limit`` listed pokémon."""
        response = session.get(
            f"{self.base_url}/pokemon?limit={limit}", timeout=self.timeout
        )
        response.raise_for_status()
        pokemon_list = response.json()['results']

        # The API may return fewer entries than requested; report progress
        # against what was actually listed.
        total = len(pokemon_list)
        pokemon_details = []
        for i, pokemon in enumerate(pokemon_list, 1):
            print(f"  Extrayendo {i}/{total}: {pokemon['name']}", end='\r')
            detail_response = session.get(pokemon['url'], timeout=self.timeout)
            if detail_response.status_code == 200:
                pokemon_details.append(detail_response.json())
            else:
                # Keep the best-effort behaviour, but make the gap visible.
                print(
                    f"\n⚠️ BRONZE: {pokemon['name']} omitido "
                    f"(HTTP {detail_response.status_code})"
                )
            if self.request_delay > 0:
                time.sleep(self.request_delay)  # Rate limiting

        return pokemon_details

    def save_bronze(self, filename: str = 'pokemon_bronze.json'):
        """Write the current ``bronze_data`` to ``filename`` as indented JSON."""
        # Explicit encoding keeps the output independent of the platform locale.
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(self.bronze_data, f, indent=2)
        print(f"💾 BRONZE: Datos guardados en {filename}")


In [None]:
# ============================================
# CAPA SILVER - Limpieza y normalización
# ============================================

class SilverLayer:
    """Flatten the raw bronze records into tidy, typed-by-column DataFrames."""

    # Column order of each silver table. Declared explicitly so that an empty
    # extraction still yields DataFrames with the full schema; otherwise
    # downstream code grouping by 'pokemon_id' would fail on a missing column.
    _BASE_COLUMNS = ['pokemon_id', 'name', 'height', 'weight',
                     'base_experience', 'is_default']
    _TYPE_COLUMNS = ['pokemon_id', 'pokemon_name', 'type', 'slot']
    _STAT_COLUMNS = ['pokemon_id', 'pokemon_name', 'stat_name',
                     'base_stat', 'effort']
    _ABILITY_COLUMNS = ['pokemon_id', 'pokemon_name', 'ability',
                        'is_hidden', 'slot']

    def __init__(self, bronze_data: Dict):
        """Store the bronze payload; its ``raw_data`` list is what gets transformed."""
        self.bronze_data = bronze_data
        self.silver_dfs = {}

    def _raw_records(self) -> list:
        """Return the list of raw pokémon documents, or ``[]`` when absent."""
        return self.bronze_data.get('raw_data') or []

    def transform_pokemon_base(self) -> pd.DataFrame:
        """Build one row per pokémon with its scalar attributes.

        ``height`` and ``weight`` are the raw values divided by 10 (metres and
        kilograms according to the original conversion comments).
        """
        print("\n🔸 SILVER: Transformando datos base...")

        rows = [
            {
                'pokemon_id': pokemon['id'],
                'name': pokemon['name'].title(),
                'height': pokemon['height'] / 10,
                'weight': pokemon['weight'] / 10,
                'base_experience': pokemon['base_experience'],
                'is_default': pokemon['is_default'],
            }
            for pokemon in self._raw_records()
        ]

        df = pd.DataFrame(rows, columns=self._BASE_COLUMNS)
        print(f"✅ SILVER: {len(df)} pokémon procesados")
        return df

    def transform_pokemon_types(self) -> pd.DataFrame:
        """Build one row per (pokémon, type) pair."""
        print("🔸 SILVER: Transformando tipos...")

        rows = [
            {
                'pokemon_id': pokemon['id'],
                'pokemon_name': pokemon['name'].title(),
                'type': type_info['type']['name'].title(),
                'slot': type_info['slot'],
            }
            for pokemon in self._raw_records()
            for type_info in pokemon['types']
        ]

        df = pd.DataFrame(rows, columns=self._TYPE_COLUMNS)
        print(f"✅ SILVER: {len(df)} relaciones de tipo procesadas")
        return df

    def transform_pokemon_stats(self) -> pd.DataFrame:
        """Build one row per (pokémon, stat) pair; stat names become Title Case."""
        print("🔸 SILVER: Transformando estadísticas...")

        rows = [
            {
                'pokemon_id': pokemon['id'],
                'pokemon_name': pokemon['name'].title(),
                'stat_name': stat['stat']['name'].replace('-', ' ').title(),
                'base_stat': stat['base_stat'],
                'effort': stat['effort'],
            }
            for pokemon in self._raw_records()
            for stat in pokemon['stats']
        ]

        df = pd.DataFrame(rows, columns=self._STAT_COLUMNS)
        print(f"✅ SILVER: {len(df)} estadísticas procesadas")
        return df

    def transform_pokemon_abilities(self) -> pd.DataFrame:
        """Build one row per (pokémon, ability) pair; ability names become Title Case."""
        print("🔸 SILVER: Transformando habilidades...")

        rows = [
            {
                'pokemon_id': pokemon['id'],
                'pokemon_name': pokemon['name'].title(),
                'ability': ability['ability']['name'].replace('-', ' ').title(),
                'is_hidden': ability['is_hidden'],
                'slot': ability['slot'],
            }
            for pokemon in self._raw_records()
            for ability in pokemon['abilities']
        ]

        df = pd.DataFrame(rows, columns=self._ABILITY_COLUMNS)
        print(f"✅ SILVER: {len(df)} habilidades procesadas")
        return df

    def process_all(self) -> Dict[str, pd.DataFrame]:
        """Run every transform and cache the results in ``silver_dfs``.

        Returns:
            Mapping of table name to DataFrame, with the keys
            ``pokemon_base``, ``pokemon_types``, ``pokemon_stats`` and
            ``pokemon_abilities``.
        """
        self.silver_dfs = {
            'pokemon_base': self.transform_pokemon_base(),
            'pokemon_types': self.transform_pokemon_types(),
            'pokemon_stats': self.transform_pokemon_stats(),
            'pokemon_abilities': self.transform_pokemon_abilities()
        }
        return self.silver_dfs

    def save_silver(self, prefix: str = 'pokemon_silver'):
        """Write each cached table to ``{prefix}_{name}.csv`` without the index."""
        for name, df in self.silver_dfs.items():
            filename = f"{prefix}_{name}.csv"
            df.to_csv(filename, index=False)
            print(f"💾 SILVER: {name} guardado en {filename}")

In [None]:
# ============================================
# CAPA GOLD - Agregaciones y análisis
# ============================================

class GoldLayer:
    """Build analysis-ready aggregate tables from the silver DataFrames."""

    def __init__(self, silver_dfs: Dict[str, pd.DataFrame]):
        """Store the silver tables.

        The methods read the keys ``pokemon_base``, ``pokemon_types``,
        ``pokemon_stats`` and ``pokemon_abilities``.
        """
        self.silver_dfs = silver_dfs
        self.gold_dfs = {}

    def create_pokemon_summary(self) -> pd.DataFrame:
        """Build one row per pokémon with joined types, stat aggregates and categories.

        Returns:
            The base table extended with ``types``, ``total_stats``,
            ``avg_stat``, ``max_stat``, ``abilities_count``,
            ``size_category`` and ``power_level``.
        """
        print("\n🔶 GOLD: Creando resumen de pokémon...")

        base = self.silver_dfs['pokemon_base'].copy()

        # All types of a pokémon, alphabetically sorted, in a single string.
        types_joined = (
            self.silver_dfs['pokemon_types']
            .groupby('pokemon_id')['type']
            .agg(lambda names: ' / '.join(sorted(names)))
            .rename('types')
            .reset_index()
        )

        # Named aggregation labels the output columns directly instead of
        # renaming a MultiIndex by position.
        stats_agg = (
            self.silver_dfs['pokemon_stats']
            .groupby('pokemon_id')['base_stat']
            .agg(total_stats='sum', avg_stat='mean', max_stat='max')
            .reset_index()
        )

        abilities_count = (
            self.silver_dfs['pokemon_abilities']
            .groupby('pokemon_id')
            .size()
            .reset_index(name='abilities_count')
        )

        summary = base.merge(types_joined, on='pokemon_id', how='left')
        summary = summary.merge(stats_agg, on='pokemon_id', how='left')
        summary = summary.merge(abilities_count, on='pokemon_id', how='left')

        # A pokémon without ability rows has no match in the left merge; that
        # means zero abilities, not "unknown", and the count must stay integral.
        summary['abilities_count'] = summary['abilities_count'].fillna(0).astype(int)

        # pd.cut uses right-closed intervals by default: (0, 1], (1, 2], (2, inf).
        summary['size_category'] = pd.cut(
            summary['height'],
            bins=[0, 1, 2, float('inf')],
            labels=['Small', 'Medium', 'Large']
        )

        summary['power_level'] = pd.cut(
            summary['total_stats'],
            bins=[0, 300, 400, 500, float('inf')],
            labels=['Low', 'Medium', 'High', 'Elite']
        )

        print(f"✅ GOLD: Resumen creado con {len(summary)} pokémon")
        return summary

    def create_type_analysis(self) -> pd.DataFrame:
        """Aggregate pokémon count and mean height/weight/experience per type.

        Returns:
            One row per type with ``pokemon_count``, ``avg_height_m``,
            ``avg_weight_kg`` and ``avg_experience``, sorted by
            ``pokemon_count`` descending.
        """
        print("🔶 GOLD: Creando análisis por tipo...")

        types = self.silver_dfs['pokemon_types']
        base = self.silver_dfs['pokemon_base']

        # Attach the base attributes to every (pokémon, type) row.
        type_data = types.merge(base, on='pokemon_id', how='left')

        type_analysis = (
            type_data.groupby('type')
            .agg(
                pokemon_count=('pokemon_id', 'count'),
                avg_height_m=('height', 'mean'),
                avg_weight_kg=('weight', 'mean'),
                avg_experience=('base_experience', 'mean'),
            )
            .reset_index()
            .sort_values('pokemon_count', ascending=False)
        )

        print(f"✅ GOLD: Análisis de {len(type_analysis)} tipos creado")
        return type_analysis

    def create_stats_pivot(self) -> pd.DataFrame:
        """Pivot stats into one column per stat name plus a ``Total`` column."""
        print("🔶 GOLD: Creando pivot de estadísticas...")

        stats = self.silver_dfs['pokemon_stats']

        stats_pivot = stats.pivot_table(
            index=['pokemon_id', 'pokemon_name'],
            columns='stat_name',
            values='base_stat',
            aggfunc='first'
        ).reset_index()

        # Every column except the two identifiers is a stat value.
        id_cols = ('pokemon_id', 'pokemon_name')
        stat_cols = [col for col in stats_pivot.columns if col not in id_cols]
        stats_pivot['Total'] = stats_pivot[stat_cols].sum(axis=1)

        print(f"✅ GOLD: Pivot creado con {len(stats_pivot)} pokémon")
        return stats_pivot

    def process_all(self) -> Dict[str, pd.DataFrame]:
        """Build every gold table and cache the results in ``gold_dfs``.

        Returns:
            Mapping with the keys ``pokemon_summary``, ``type_analysis`` and
            ``stats_pivot``.
        """
        self.gold_dfs = {
            'pokemon_summary': self.create_pokemon_summary(),
            'type_analysis': self.create_type_analysis(),
            'stats_pivot': self.create_stats_pivot()
        }
        return self.gold_dfs

    def save_gold(self, prefix: str = 'pokemon_gold'):
        """Write each cached table to ``{prefix}_{name}.csv`` without the index."""
        for name, df in self.gold_dfs.items():
            filename = f"{prefix}_{name}.csv"
            df.to_csv(filename, index=False)
            print(f"💾 GOLD: {name} guardado en {filename}")

In [None]:


# ============================================
# PIPELINE PRINCIPAL
# ============================================

def run_etl_pipeline(pokemon_limit: int = 50):
    """Run the Bronze → Silver → Gold pipeline end to end.

    Each layer's output is written to disk with that layer's default file
    names, and a short run summary is printed.

    Args:
        pokemon_limit: Maximum number of pokémon to request in the bronze step.

    Returns:
        A dict with the keys ``bronze`` (raw payload), ``silver`` and ``gold``
        (mappings of table name to DataFrame), or ``None`` when the extraction
        produced no records and the pipeline was aborted.
    """
    separator = "=" * 60

    print(separator)
    print("🚀 INICIANDO ETL PIPELINE - POKÉMON")
    print(separator)

    start_time = time.time()

    # BRONZE: extraction
    bronze = BronzeLayer()
    bronze_data = bronze.extract_pokemon_data(limit=pokemon_limit)

    # Validate before persisting: saving first would overwrite a previously
    # good bronze file with an empty payload. An extraction that succeeded
    # but yielded zero records is also unusable by the later layers.
    if not bronze_data or not bronze_data.get('raw_data'):
        print("❌ Pipeline abortado: No se pudieron extraer datos")
        return None

    bronze.save_bronze()

    # SILVER: transformation
    silver = SilverLayer(bronze_data)
    silver_dfs = silver.process_all()
    silver.save_silver()

    # GOLD: aggregation
    gold = GoldLayer(silver_dfs)
    gold_dfs = gold.process_all()
    gold.save_gold()

    # Final summary
    elapsed_time = time.time() - start_time

    print("\n" + separator)
    print("✨ ETL PIPELINE COMPLETADO")
    print(separator)
    print(f"⏱️  Tiempo de ejecución: {elapsed_time:.2f} segundos")
    print(f"📊 Pokémon procesados: {len(silver_dfs['pokemon_base'])}")
    print("📈 Datasets generados:")
    print("   - Bronze: 1 archivo JSON")
    print(f"   - Silver: {len(silver_dfs)} archivos CSV")
    print(f"   - Gold: {len(gold_dfs)} archivos CSV")
    print(separator)

    return {
        'bronze': bronze_data,
        'silver': silver_dfs,
        'gold': gold_dfs
    }


# ============================================
# EJECUCIÓN Y VISUALIZACIÓN
# ============================================

if __name__ == "__main__":
    # Run the whole pipeline with a limit of 50 pokémon.
    results = run_etl_pipeline(pokemon_limit=50)

    # The pipeline returns nothing when it aborts; only report on success.
    if results:
        print("\n📋 EJEMPLOS DE DATOS:")

        # (heading, layer key, table name, rows to preview)
        for heading, layer, dataset, rows in (
            ("\n--- SILVER: Pokemon Base ---", 'silver', 'pokemon_base', 5),
            ("\n--- GOLD: Pokemon Summary ---", 'gold', 'pokemon_summary', 5),
            ("\n--- GOLD: Type Analysis ---", 'gold', 'type_analysis', 10),
        ):
            print(heading)
            print(results[layer][dataset].head(rows))

        # Overall figures taken from the gold summary table.
        print("\n📊 ESTADÍSTICAS GENERALES:")
        summary = results['gold']['pokemon_summary']
        print("Total de pokémon: {}".format(len(summary)))
        print("Altura promedio: {:.2f} m".format(summary['height'].mean()))
        print("Peso promedio: {:.2f} kg".format(summary['weight'].mean()))
        print("Experiencia base promedio: {:.0f}".format(
            summary['base_experience'].mean()))
        print("\nDistribución por nivel de poder:")
        print(summary['power_level'].value_counts())