In [0]:
# Librerías generales
#=========================
import json
from json import dumps
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import lit, explode, from_json, col, current_date
from pyspark.sql.types import StringType, StructType, StructField, ArrayType
import datetime 
import requests
import urllib3
import matplotlib as mtl
from datetime import datetime,date
import pytz


In [0]:
# Notebook parameter: a dropdown widget named "Ambiente" offering
# "Desarrollo" and "Produccion", created with "Produccion" as its default.
dbutils.widgets.dropdown(
    "Ambiente",
    "Produccion",
    ["Desarrollo", "Produccion"],
)

# Current widget selection; later cells branch on this value.
environment = dbutils.widgets.get("Ambiente")

In [0]:
# Resolve storage settings for the selected execution environment
# ================================================================
# Sets the storage account, the catalog names and the container ("bucket")
# names used to build abfss:// paths later in the notebook.
#
# NOTE(review): both environments currently resolve to identical values;
# confirm whether "Desarrollo" is meant to point at separate resources.

if environment == "Produccion":
    storage_account = "santiag0r"
    catalog_gold   = "gold-santiag0r"
    catalog_silver = "silver-santiag0r"
    catalog_bronze = "bronze-santiag0r"
    bucket_gold    = "gold"
    bucket_silver  = "silver"
    bucket_bronze  = "bronze"

elif environment == "Desarrollo":
    storage_account = "santiag0r"
    catalog_gold   = "gold-santiag0r"
    catalog_silver = "silver-santiag0r"
    catalog_bronze = "bronze-santiag0r"
    bucket_gold    = "gold"
    bucket_silver  = "silver"
    bucket_bronze  = "bronze"

else:
    # Without this branch an unexpected value would leave every setting
    # undefined and only surface as a NameError when a path is built.
    raise ValueError(
        f"Unsupported environment {environment!r}; "
        "expected 'Produccion' or 'Desarrollo'."
    )


In [0]:
import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
from datetime import datetime

# Step 1: read the remote parquet file over HTTPS into a pandas DataFrame.
url = "https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/votes/2020.parquet"
df_votes = pd.read_parquet(url, engine="pyarrow")

# Step 2: persist it as a local parquet file so it can be opened as a
# pyarrow Dataset (the path is relative to the current working directory).
temp_path = "votes_temp.parquet"
df_votes.to_parquet(temp_path)

# Step 3: open the local file as a Dataset and show its schema.
dataset_votes = ds.dataset(temp_path, format="parquet")

print("Esquema:")
print(dataset_votes.schema)

# Step 4: date bounds. Both scalars are built with the same timestamp type
# (nanosecond unit, tz=UTC) that the schema reports for CreationDate.
_utc_ns = pa.timestamp("ns", tz="UTC")
start_date = pa.scalar(datetime(2020, 1, 1), type=_utc_ns)
end_date = pa.scalar(datetime(2020, 3, 1), type=_utc_ns)

# Step 5: keep rows in the half-open range [start_date, end_date),
# i.e. January and February 2020.
_creation_date = ds.field("CreationDate")
_in_range = (_creation_date >= start_date) & (_creation_date < end_date)
votes_filtered = dataset_votes.to_table(filter=_in_range)

print("Filas filtradas:", votes_filtered.num_rows)




Esquema:
Id: int64
PostId: int64
VoteTypeId: int64
CreationDate: timestamp[ns, tz=UTC]
UserId: int64
BountyAmount: uint64
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 975
Filas filtradas: 3223079


In [0]:
from pyspark.sql import SparkSession

# Step 1: materialise the filtered Arrow table as a pandas DataFrame.
df_votes_pandas = votes_filtered.to_pandas()

# Step 2: cast the unsigned 64-bit BountyAmount column to signed int64
# before handing the frame to Spark (Spark has no unsigned 64-bit type).
_bounty_col = "BountyAmount"
if _bounty_col in df_votes_pandas.columns:
    df_votes_pandas[_bounty_col] = df_votes_pandas[_bounty_col].astype("int64")

# Step 3: get (or create) the Spark session and build the Spark DataFrame.
spark = SparkSession.builder.getOrCreate()
df_votes_spark = spark.createDataFrame(df_votes_pandas)

# Step 4: destination in the Bronze container, built from the
# environment-specific container and storage account names.
_bronze_root = f"abfss://{bucket_bronze}@{storage_account}.dfs.core.windows.net/"
ruta_bronze = _bronze_root + "votes/votes_2020.parquet"

# Step 5: write as parquet from a single partition, replacing any
# existing data at the destination.
_bronze_writer = (
    df_votes_spark
    .coalesce(1)
    .write
    .format("parquet")
    .mode("overwrite")
)
_bronze_writer.save(ruta_bronze)

print("✅ Archivo guardado en Bronze:", ruta_bronze)


✅ Archivo guardado en Bronze: abfss://bronze@santiag0r.dfs.core.windows.net/votes/votes_2020.parquet
