In [0]:
# LIBRERÍAS NECESARIAS
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    current_date, lit, col, trim, regexp_replace, when, to_date,
    current_timestamp, date_format, min as spark_min, max as spark_max
)
from delta.tables import DeltaTable
import pytz
from datetime import datetime

In [0]:
# Widget configuration: a dropdown named "Ambiente" selects the target
# environment; "Produccion" is the default choice.
dbutils.widgets.dropdown(
    "Ambiente",
    "Produccion",
    ["Desarrollo", "Produccion"],
)
environment = dbutils.widgets.get("Ambiente")

In [0]:
# Resolve the storage account, catalog names and bucket names for the
# selected environment.
#
# NOTE(review): "Desarrollo" currently resolves to exactly the same resources
# as "Produccion" (including the `stuaoprod003` storage account), so a
# development run writes to production tables. Confirm whether dedicated
# development resources exist and, if so, override them below.
_PRODUCTION_SETTINGS = {
    "storage_account": "stuaoprod003",
    "catalog_gold": "gold-shir",
    "catalog_silver": "silver-shir",
    "catalog_bronze": "bronze-shir",
    "bucket_gold": "gold",
    "bucket_silver": "silver",
    "bucket_bronze": "bronze",
}

_ENVIRONMENT_SETTINGS = {
    "Produccion": _PRODUCTION_SETTINGS,
    # Copy so development values can be overridden without touching production.
    "Desarrollo": dict(_PRODUCTION_SETTINGS),
}

# Fail fast on an unexpected value; otherwise the names below would stay
# undefined and the notebook would only break later with a NameError.
if environment not in _ENVIRONMENT_SETTINGS:
    raise ValueError(
        f"Unknown environment {environment!r}; "
        f"expected one of {sorted(_ENVIRONMENT_SETTINGS)}"
    )

_settings = _ENVIRONMENT_SETTINGS[environment]

storage_account = _settings["storage_account"]
catalog_gold    = _settings["catalog_gold"]
catalog_silver  = _settings["catalog_silver"]
catalog_bronze  = _settings["catalog_bronze"]
bucket_gold     = _settings["bucket_gold"]
bucket_silver   = _settings["bucket_silver"]
bucket_bronze   = _settings["bucket_bronze"]

In [0]:
# Current calendar date in the America/Bogota time zone, as a YYYY-MM-DD string.
tzInfo = pytz.timezone("America/Bogota")
today = datetime.now(tzInfo).date().isoformat()

In [0]:
# Spark session: get (or create) a session named "GoldAgg" with the Delta Lake
# SQL extension and Delta catalog configured.
session_builder = SparkSession.builder.appName("GoldAgg")
session_builder = session_builder.config(
    "spark.sql.extensions",
    "io.delta.sql.DeltaSparkSessionExtension",
)
session_builder = session_builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)
spark = session_builder.getOrCreate()

In [0]:
from pyspark.sql import functions as F

# Silver-layer inputs for the metric: the votes table and the posts table,
# read from the silver catalog selected for the current environment.
# NOTE(review): despite the `_2020` suffix, no year filter is applied when
# loading these tables — confirm the silver tables only contain 2020 data.
df_votes_2020 = spark.table(f"`{catalog_silver}`.votes.votes_silver")
df_posts_2020 = spark.table(f"`{catalog_silver}`.posts.posts_silver")

In [0]:
# Quick look at the first five rows of each silver input.
df_votes_2020.show(n=5)
df_posts_2020.show(n=5)

+---------+--------+----------+------------+------+------------+------------+----------+
|       Id|  PostId|VoteTypeId|CreationDate|UserId|BountyAmount|VoteTypeName|  f_cargue|
+---------+--------+----------+------------+------+------------+------------+----------+
|202051180|12353288|         2|  2020-01-25|     0|           0|       UpMod|2025-11-16|
|202078426|15302448|         2|  2020-01-25|     0|           0|       UpMod|2025-11-16|
|202068088|17914105|         2|  2020-01-25|     0|           0|       UpMod|2025-11-16|
|202069635|18919091|         2|  2020-01-25|     0|           0|       UpMod|2025-11-16|
|202082244|23331548|         2|  2020-01-25|     0|           0|       UpMod|2025-11-16|
+---------+--------+----------+------------+------+------------+------------+----------+
only showing top 5 rows
+--------+----------+----------------+------------+-----+-----------+--------------------+----------+
|      Id|PostTypeId|AcceptedAnswerId|CreationDate|Score|OwnerUserId|    

In [0]:
# ==========================================================
# METRIC: vote_stats_per_post (year 2020)
# ==========================================================
schema_gold = "vote_stats_per_post"   # gold schema that holds the metric table
year_value = 2020                     # year computed by this run and stamped on every row

print(f"\nGenerando métrica {year_value}: vote_stats_per_post (votos positivos/negativos por post)...")

# 1. Build the metrics dataframe.
# Restrict the votes to `year_value` before aggregating: every output row is
# labelled with that year and merged on (PostId, year), so votes from any other
# year must not leak into the counts. If the silver table already holds only
# that year, this filter is a no-op.
df_votes_year = df_votes_2020.filter(F.year(F.col("CreationDate")) == year_value)

df_vote_stats = (
    df_votes_year.groupBy("PostId")
    .agg(
        # `when` without `otherwise` yields NULL for non-matching rows, and
        # `count` skips NULLs, so each expression counts only matching votes.
        F.count(F.when(F.col("VoteTypeId") == 2, True)).alias("upvotes"),      # positive votes
        F.count(F.when(F.col("VoteTypeId") == 3, True)).alias("downvotes"),    # negative votes
        F.count("*").alias("total_votes"),                                     # all votes, any type
        F.countDistinct("UserId").alias("unique_voters")                       # distinct voting users
    )
    # Left join keeps posts that have votes but no matching row in the posts
    # table; their PostTypeId ends up NULL.
    .join(
        df_posts_2020.select("Id", "PostTypeId"),
        F.col("PostId") == F.col("Id"),
        "left"
    )
    .drop("Id")
    .withColumn("year", F.lit(year_value))
    .withColumn("fecha_cargue", F.current_timestamp())
)



Generando métrica 2020: vote_stats_per_post (votos positivos/negativos por post)...


In [0]:
# 2. Fully qualified name of the GOLD table: catalog (backtick-quoted),
#    schema and table name joined with dots.
gold_table = ".".join([f"`{catalog_gold}`", schema_gold, "vote_stats_per_post"])
# e.g. `gold-shir`.vote_stats_per_post.vote_stats_per_post

In [0]:

# 3. Create the schema and the Delta table if they do not exist yet
create_schema_sql = f"""
CREATE SCHEMA IF NOT EXISTS `{catalog_gold}`.{schema_gold}
"""
spark.sql(create_schema_sql)

create_table_sql = f"""
CREATE TABLE IF NOT EXISTS {gold_table} (
    PostId BIGINT,
    upvotes BIGINT,
    downvotes BIGINT,
    total_votes BIGINT,
    unique_voters BIGINT,
    PostTypeId STRING,
    year INT,
    fecha_cargue TIMESTAMP
)
USING DELTA
"""
spark.sql(create_table_sql)

# ==========================================================
# PRE-MERGE VALIDATION: new vs. updated records
# ==========================================================
# Columns that identify a record; the same pair is used in the MERGE condition.
merge_keys = ["PostId", "year"]

df_target = spark.table(gold_table)

# Distinct keys present in the target table and in the freshly computed source
df_target_keys = df_target.select(*merge_keys).distinct()
df_source_keys = df_vote_stats.select(*merge_keys).distinct()

# New: keys in the source that are missing from the target
df_nuevos = df_source_keys.join(df_target_keys, on=merge_keys, how="left_anti")

# Updated: keys present on both sides
df_actualizados = df_source_keys.join(df_target_keys, on=merge_keys, how="inner")

nuevos = df_nuevos.count()
actualizados = df_actualizados.count()

print(f"Registros nuevos: {nuevos}")
print(f"Registros a actualizar: {actualizados}")

Registros nuevos: 0
Registros a actualizar: 1775669


In [0]:
# 4. Expose the metrics dataframe as a temporary view so the SQL MERGE can
#    reference it by name.
df_vote_stats.createOrReplaceTempView(name="temp_vote_stats")

In [0]:
# 5. Incremental MERGE into GOLD, matched on (PostId, year):
#    - matched rows get their metric columns and load timestamp refreshed,
#    - unmatched source rows are inserted as-is.
merge_sql = f"""
MERGE INTO {gold_table} t
USING temp_vote_stats s
ON t.PostId = s.PostId AND t.year = s.year
WHEN MATCHED THEN UPDATE SET
  t.upvotes       = s.upvotes,
  t.downvotes     = s.downvotes,
  t.total_votes   = s.total_votes,
  t.unique_voters = s.unique_voters,
  t.PostTypeId    = s.PostTypeId,
  t.fecha_cargue  = s.fecha_cargue
WHEN NOT MATCHED THEN INSERT *
"""
spark.sql(merge_sql)

print("Tabla GOLD 'vote_stats_per_post' actualizada correctamente para 2020")

Tabla GOLD 'vote_stats_per_post' actualizada correctamente para 2020


In [0]:

# 6. Quick sanity check: print the first five rows of the GOLD table without
#    truncating column values.
print("\n--- VALIDACIÓN: gold-shir.vote_stats_per_post.vote_stats_per_post ---")
df_gold_preview = spark.table(gold_table)
df_gold_preview.show(n=5, truncate=False)


--- VALIDACIÓN: gold-shir.vote_stats_per_post.vote_stats_per_post ---
+--------+-------+---------+-----------+-------------+----------+----+--------------------------+
|PostId  |upvotes|downvotes|total_votes|unique_voters|PostTypeId|year|fecha_cargue              |
+--------+-------+---------+-----------+-------------+----------+----+--------------------------+
|59561526|1      |0        |1          |1            |Answer    |2020|2025-11-16 20:23:05.039341|
|59621109|3      |0        |4          |1            |Answer    |2020|2025-11-16 20:23:05.039341|
|57376924|0      |0        |1          |1            |NULL      |2020|2025-11-16 20:23:05.039341|
|42438709|10     |0        |10         |1            |NULL      |2020|2025-11-16 20:23:05.039341|
|12166187|1      |0        |1          |1            |NULL      |2020|2025-11-16 20:23:05.039341|
+--------+-------+---------+-----------+-------------+----------+----+--------------------------+
only showing top 5 rows
