# 📊 ETL con PySpark

Proceso ETL con PySpark para procesamiento distribuido de datos.


In [None]:
# Imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import sqlite3
import pandas as pd
import os

# Build (or reuse) the SparkSession used by every cell of this ETL notebook.
session_builder = (
    SparkSession.builder
    .appName("GymLiftersETL")
    .config("spark.sql.warehouse.dir", "warehouse")
)
spark = session_builder.getOrCreate()

# Set Spark's log threshold to WARN so the cell output stays readable.
spark.sparkContext.setLogLevel("WARN")
print(f"‚úÖ SparkSession creada - Version: {spark.version}")


## Extracción - Cargar Dataset

Cargar el dataset limpio generado con Pandas.


In [None]:
# Load the cleaned dataset, preferring the most recently modified versioned export.
import glob

data_path = None

# Versioned exports (gym_lifters_clean_*.csv) are searched first in ../data, then in data.
all_files = [
    found
    for pattern in ("../data/gym_lifters_clean_*.csv", "data/gym_lifters_clean_*.csv")
    for found in glob.glob(pattern)
]

if all_files:
    # Newest modification time first; the head of the list is the file to load.
    all_files.sort(key=os.path.getmtime, reverse=True)
    data_path = all_files[0]
    print(f"üìÇ Archivo encontrado: {data_path}")
else:
    # Fall back to the legacy, un-versioned file name in the same two locations.
    for legacy_path in ("../data/gym_lifters_clean.csv", "data/gym_lifters_clean.csv"):
        if os.path.exists(legacy_path):
            data_path = legacy_path
            break
    else:
        raise FileNotFoundError("No se encuentra gym_lifters_clean.csv. Ejecuta primero la celda de limpieza en el notebook de Pandas")

# Read the CSV with a header row and let Spark infer the column types.
df = spark.read.csv(data_path, header=True, inferSchema=True)
print(f"‚úÖ Dataset cargado: {df.count():,} filas, {len(df.columns)} columnas")
print(f"üìÅ Archivo: {os.path.basename(data_path)}")
df.show(5, truncate=False)


## Transformaciones

Aplicar filtrado, creación de columnas derivadas y agregaciones.


In [None]:
# Transformation 1: filtering
# Keep rows with a positive total and a year inside 2010-2025 (both bounds inclusive).
has_valid_total = col("total_kg").isNotNull() & (col("total_kg") > 0)
year_in_range = (
    col("year").isNotNull()
    & (col("year") >= 2010)
    & (col("year") <= 2025)
)

df_filtered = df.where(has_valid_total & year_in_range)
print(f"‚úÖ Filtrado: {df_filtered.count():,} registros")
df_filtered.select("name", "country", "year", "total_kg").show(5)


In [None]:
# Transformation 2: derived columns

# total / body weight; left NULL when the body weight is not strictly positive.
efficiency_ratio_expr = when(
    col("body_weight_kg") > 0, col("total_kg") / col("body_weight_kg")
).otherwise(None)

# How much heavier the clean & jerk is than the snatch.
lift_difference_expr = col("clean_and_jerk_kg") - col("snatch_kg")

# Bucket lifters by total; thresholds are checked from the highest down.
performance_category_expr = (
    when(col("total_kg") >= 350, "Elite")
    .when(col("total_kg") >= 300, "Advanced")
    .when(col("total_kg") >= 250, "Intermediate")
    .otherwise("Beginner")
)

df_with_metrics = (
    df_filtered
    .withColumn("efficiency_ratio", efficiency_ratio_expr)
    .withColumn("lift_difference", lift_difference_expr)
    .withColumn("performance_category", performance_category_expr)
)
print("‚úÖ Columnas derivadas creadas")
df_with_metrics.select("name", "total_kg", "efficiency_ratio", "performance_category").show(5)


In [None]:
# Transformation 3: aggregations
# Per (country, category): lifter count, mean total and best total, best average first.
# NOTE: `count`, `avg` and `max` here are the Spark functions brought in by the star import.
country_category_metrics = [
    count("*").alias("total_lifters"),
    avg("total_kg").alias("avg_total_kg"),
    max("total_kg").alias("max_total_kg"),
]

agg_by_country = (
    df_with_metrics
    .groupBy("country", "category")
    .agg(*country_category_metrics)
    .orderBy(col("avg_total_kg").desc())
)
print("‚úÖ Agregaciones calculadas")
agg_by_country.show(10)


## ETL Completo - Modelo Dimensional

Crear tablas de dimensiones y tabla de hechos.


In [None]:
# Build dim_athlete: one row per distinct (athlete_id, name, gender, age, country).
print("üî® Creando dim_athlete...")

# Columns that identify a dimension row. The surrogate-key window below orders by
# ALL of them: ordering by a subset leaves ties, and because Spark recomputes this
# lazy plan for every action, ties could be numbered differently each time -- so
# the ids written for dim_athlete could disagree with the ones baked into the fact table.
athlete_key = ["athlete_id_clean", "name", "gender", "age", "country"]

dim_athlete = (
    df_with_metrics
    .filter(col("name").isNotNull() & (col("name") != ""))
    # Normalise a blank athlete_id to NULL *before* de-duplicating, so a NULL id
    # and an empty-string id for the same athlete collapse into a single row.
    .withColumn(
        "athlete_id_clean",
        when((col("athlete_id").isNull()) | (col("athlete_id") == ""), None).otherwise(col("athlete_id")),
    )
    .select(*athlete_key)
    .distinct()
    # Sequential surrogate key. An un-partitioned window is evaluated on a single
    # partition, which is acceptable for a dimension-sized table.
    .withColumn("id_athlete", row_number().over(Window.orderBy(*athlete_key)))
    .select(
        "id_athlete",
        # Athletes without a source id get a synthetic "ath_<id_athlete>" identifier.
        coalesce(col("athlete_id_clean"), concat(lit("ath_"), col("id_athlete"))).alias("athlete_id"),
        "name", "gender", "age", "country",
    )
)

print(f"‚úÖ dim_athlete: {dim_athlete.count():,} atletas √∫nicos")
dim_athlete.show(5)


In [None]:
# Build dim_competition: one row per distinct (competition, year, category).
print("\nüî® Creando dim_competition...")

competition_rows = (
    df_with_metrics
    .select("competition", "year", "category")
    .distinct()
    .where(col("competition").isNotNull() & col("year").isNotNull())
)

# Sequential surrogate key, numbered by year, then competition, then category.
competition_order = Window.orderBy("year", "competition", "category")
dim_competition = competition_rows.select(
    row_number().over(competition_order).alias("id_competition"),
    "competition",
    "year",
    "category",
)

print(f"‚úÖ dim_competition: {dim_competition.count():,} competencias √∫nicas")
dim_competition.show(5)


In [None]:
# Build dim_team: one row per distinct (team, coach) pair with a non-null team.
print("\nüî® Creando dim_team...")

team_rows = (
    df_with_metrics
    .select("team", "coach")
    .distinct()
    .where(col("team").isNotNull())
)

# Sequential surrogate key, numbered by team and then coach.
team_order = Window.orderBy("team", "coach")
dim_team = team_rows.select(
    row_number().over(team_order).alias("id_team"),
    "team",
    "coach",
)

print(f"‚úÖ dim_team: {dim_team.count():,} equipos √∫nicos")
dim_team.show(5)


### Tabla de Hechos

Crear fact_lifting relacionando dimensiones con métricas.


In [None]:
# Build fact_lifting: one row per lift record, carrying the three dimension keys.
#
# Every frame joined here derives from df_with_metrics, so columns are referenced
# through DataFrame aliases ("f" = fact side, "a"/"c"/"t" = dimension side) rather
# than df.col / df["col"], which Spark can reject or mis-resolve as ambiguous
# when both join sides share the same lineage.
print("\nüî® Creando fact_lifting...")

# Measures and descriptive attributes carried unchanged into the fact table.
measure_cols = [
    "snatch_kg", "clean_and_jerk_kg", "total_kg", "body_weight_kg",
    "event_rank", "medal", "record_status", "lifting_style",
    "efficiency_ratio", "lift_difference", "performance_category",
]

# gender and age are included because they are part of dim_athlete's row identity.
fact_base = df_with_metrics.select(
    "athlete_id", "name", "gender", "age", "country",
    "competition", "year", "category", "team", "coach",
    *measure_cols,
)

# --- Join with dim_athlete -------------------------------------------------
# dim_athlete is distinct on (athlete_id, name, gender, age, country), so matching
# on name + country alone would pair one lift with several dimension rows and
# duplicate it. Match on the full identity instead; nullable attributes use
# null-safe equality so lifts with, e.g., a NULL country are not dropped.
# Assumes dim_athlete holds one row per cleaned (athlete_id, name, gender, age, country).
source_athlete_id = when(
    col("f.athlete_id").isNull() | (col("f.athlete_id") == ""), None
).otherwise(col("f.athlete_id"))

athlete_match = (
    (col("f.name") == col("a.name"))
    & col("f.country").eqNullSafe(col("a.country"))
    & col("f.gender").eqNullSafe(col("a.gender"))
    & col("f.age").eqNullSafe(col("a.age"))
    # Rows without a source id were given "ath_<id_athlete>" in the dimension;
    # rebuild that value on the fly so they match exactly their own dimension row.
    & (
        coalesce(source_athlete_id, concat(lit("ath_"), col("a.id_athlete")))
        == col("a.athlete_id")
    )
)

fact_with_athlete = (
    fact_base.alias("f")
    .join(dim_athlete.alias("a"), athlete_match, "inner")
    .select(
        *[col(f"f.{c}") for c in ["competition", "year", "category", "team", "coach", *measure_cols]],
        col("a.id_athlete"),
    )
)

# --- Join with dim_competition ---------------------------------------------
# category may be NULL; eqNullSafe matches NULL to NULL without also treating
# NULL and "" as the same value (which could match two dimension rows).
competition_match = (
    (col("f.competition") == col("c.competition"))
    & (col("f.year") == col("c.year"))
    & col("f.category").eqNullSafe(col("c.category"))
)

fact_with_competition = (
    fact_with_athlete.alias("f")
    .join(dim_competition.alias("c"), competition_match, "inner")
    .select(
        col("f.id_athlete"), col("f.team"), col("f.coach"),
        *[col(f"f.{c}") for c in measure_cols],
        col("c.id_competition"),
    )
)

# --- Join with dim_team ----------------------------------------------------
# coach may be NULL; team itself must be present (dim_team excludes NULL teams,
# so lifts without a team are dropped by this inner join).
team_match = (
    (col("f.team") == col("t.team"))
    & col("f.coach").eqNullSafe(col("t.coach"))
)

fact_lifting = (
    fact_with_competition.alias("f")
    .join(dim_team.alias("t"), team_match, "inner")
    .select(
        col("f.id_athlete"), col("f.id_competition"), col("t.id_team"),
        *[col(f"f.{c}") for c in measure_cols],
    )
)

print(f"‚úÖ fact_lifting: {fact_lifting.count():,} registros")
fact_lifting.show(5)


## Carga - Guardar en SQLite


In [None]:
## Load - persist the dimensional model to SQLite

# Use the parent-level warehouse directory when it already exists; otherwise
# create (or reuse) one next to the notebook.
warehouse_dir = "../warehouse" if os.path.exists("../warehouse") else "warehouse"
os.makedirs(warehouse_dir, exist_ok=True)
db_path = f"{warehouse_dir}/warehouse_pyspark.db"

# Build the database in a temporary file and swap it in only after every table
# has been written, so a failure in toPandas()/to_sql() never leaves the
# warehouse without a database or with a half-written one.
tmp_db_path = db_path + ".tmp"
if os.path.exists(tmp_db_path):
    os.remove(tmp_db_path)  # leftover from a previous failed run

# (table name, Spark DataFrame) pairs, dimensions first.
tables = [
    ("dim_athlete", dim_athlete),
    ("dim_competition", dim_competition),
    ("dim_team", dim_team),
    ("fact_lifting", fact_lifting)
]

print("üìä Cargando tablas en SQLite...")
conn = sqlite3.connect(tmp_db_path)
try:
    for table_name, df_spark in tables:
        # Collect to the driver as a pandas frame, then let pandas write the table.
        df_pd = df_spark.toPandas()
        df_pd.to_sql(table_name, conn, if_exists="replace", index=False)
        print(f"   ‚úÖ {table_name}: {len(df_pd):,} registros")
finally:
    # Always release the file handle, even when a table fails to load.
    conn.close()

# Replace any previous database with the freshly built one.
os.replace(tmp_db_path, db_path)
print(f"\n‚úÖ ETL completado! Base de datos: {db_path}")


### Verificación

Verificar datos cargados en SQLite.


In [None]:
# Final verification: row counts and a three-row sample of every loaded table.
conn = sqlite3.connect(db_path)
tables = ["dim_athlete", "dim_competition", "dim_team", "fact_lifting"]

try:
    print("üìä Resumen de tablas:")
    for table in tables:
        # Deliberately NOT named `count`: that would overwrite the Spark `count`
        # function from the star import and break the aggregation cell on re-run.
        n_rows = pd.read_sql_query(f"SELECT COUNT(*) as count FROM {table}", conn)['count'].iloc[0]
        print(f"   ‚úÖ {table}: {n_rows:,} registros")

    print("\nüìù Muestras:")
    for table in tables:
        display(pd.read_sql_query(f"SELECT * FROM {table} LIMIT 3", conn))
finally:
    # Close the connection even if a query fails (e.g. a table is missing).
    conn.close()


### Consultas de Ejemplo

Ejemplos de consultas SQL sobre el data warehouse.


In [None]:
# Example queries against the SQLite warehouse.

# Query 1: per-athlete competition count, average and best total; best total first.
query1 = """
SELECT a.name, a.country, a.gender,
    COUNT(f.id_athlete) as total_competitions,
    AVG(f.total_kg) as avg_total_kg,
    MAX(f.total_kg) as max_total_kg
FROM fact_lifting f
JOIN dim_athlete a ON f.id_athlete = a.id_athlete
GROUP BY a.id_athlete, a.name, a.country, a.gender
ORDER BY max_total_kg DESC LIMIT 10
"""

# Query 2: per-country athlete count, average and best total; best average first.
query2 = """
SELECT a.country,
    COUNT(DISTINCT a.id_athlete) as num_athletes,
    AVG(f.total_kg) as avg_total_kg,
    MAX(f.total_kg) as max_total_kg
FROM fact_lifting f
JOIN dim_athlete a ON f.id_athlete = a.id_athlete
GROUP BY a.country
ORDER BY avg_total_kg DESC LIMIT 10
"""

# Query 3: per-competition participant count and average total; most participants first.
query3 = """
SELECT c.competition, c.year, c.category,
    COUNT(f.id_athlete) as num_participants,
    AVG(f.total_kg) as avg_total_kg
FROM fact_lifting f
JOIN dim_competition c ON f.id_competition = c.id_competition
GROUP BY c.id_competition, c.competition, c.year, c.category
ORDER BY num_participants DESC LIMIT 10
"""

conn = sqlite3.connect(db_path)
try:
    print("üìä Consulta 1: Top 10 atletas por total_kg")
    display(pd.read_sql_query(query1, conn))

    print("\nüìä Consulta 2: Promedio por pa√≠s")
    display(pd.read_sql_query(query2, conn))

    print("\nüìä Consulta 3: Competencias con m√°s participantes")
    display(pd.read_sql_query(query3, conn))
finally:
    # Release the database handle even when one of the queries raises.
    conn.close()


In [None]:
# Stop the SparkSession now that the ETL run and the checks are finished.
# Cells that use `spark` will fail after this until the session cell is run again.
spark.stop()
print("‚úÖ SparkSession cerrada")
