In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pandas as pd
import matplotlib.pyplot as plt

spark = SparkSession.builder \
    .appName("HackathonForecast") \
    .master("local[*]") \
    .config("spark.driver.memory", "12g") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .getOrCreate()

print("SparkSession creada exitosamente!")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/09/18 10:34:28 WARN Utils: Your hostname, QUIN-DAT-A0012, resolves to a loopback address: 127.0.1.1; using 192.168.1.12 instead (on interface wlp2s0)
25/09/18 10:34:28 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/18 10:34:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


SparkSession creada exitosamente!


In [2]:
from pyspark.sql.window import Window
from pyspark.sql.functions import lag, avg, stddev, col, sum, weekofyear, month, floor, quarter, to_date, lit

# --- 1. CARGAR DATOS LIMPIOS DE LA CAPA SILVER ---
base_path = "/home/quind/GIT/Desafio-Tecnico-Hackathon-Forecast-Big-Data-2025/"
silver_path = f"{base_path}silver/datos_limpios"

print("Cargando datos limpios desde la capa Silver...")
df_final = spark.read.parquet(silver_path)

# --- NUEVA FEATURE: CALCULAR PRECIO PROMEDIO POR PRODUCTO ---
print("Calculando feature de precio promedio por producto...")
# CAMBIO: Se reemplazó "Faturamento" por "gross_value"
precios_producto = df_final.withColumn("precio", col("gross_value") / col("quantity")) \
                           .groupBy("produto").agg(avg("precio").alias("precio_promedio_prod"))

# --- NUEVA FEATURE: SEMANAS IMPORTANTES (FESTIVOS/EVENTOS) ---
print("Creando feature de semanas importantes...")
fechas_importantes_2022 = [
    ("2022-01-01",), # Año Nuevo
    ("2022-02-14",), # San Valentín
    ("2022-05-08",), # Día de la Madre (ejemplo, puede variar por país)
    ("2022-07-04",), # 4 de Julio (ejemplo USA)
    ("2022-11-24",), # Semana de Thanksgiving / Black Friday
    ("2022-12-25",)  # Navidad
]
df_fechas = spark.createDataFrame(fechas_importantes_2022, ["fecha_str"])
df_semanas_importantes = df_fechas.withColumn("semana", weekofyear(to_date(col("fecha_str")))) \
                                .withColumn("es_semana_importante", lit(1)) \
                                .select("semana", "es_semana_importante").distinct()

# --- 2. CREAR TABLAS DE DIMENSIONES (DESCRIPTIVAS) ---
features_produto = df_final.select("produto", "categoria", "label", "subcategoria", "marca").distinct()
features_pdv = df_final.select("pdv", "premise", "categoria_pdv").distinct()

# --- 3. AGREGACIÓN SEMANAL ---
print("Agregando transacciones a nivel semanal...")
df_semanal = df_final.groupBy("pdv", "produto", weekofyear("transaction_date").alias("semana")) \
    .agg(
        sum("quantity").alias("cantidad_total_semanal"),
        avg(quarter("transaction_date")).alias("trimestre")
    )

# --- 4. ENRIQUECER DATOS SEMANALES CON LAS DIMENSIONES ---
print("Reincorporando las features descriptivas...")
df_enriquecido = df_semanal.join(features_produto, "produto", "left") \
                           .join(features_pdv, "pdv", "left")

# --- 5. CREACIÓN DE FEATURES DE LAG Y VENTANA MÓVIL ---
print("Creando features de Lag y Ventana Móvil...")
windowSpec = Window.partitionBy("pdv", "produto").orderBy("semana")

df_con_features = df_enriquecido \
    .withColumn("lag_1", lag("cantidad_total_semanal", 1, 0).over(windowSpec)) \
    .withColumn("lag_2", lag("cantidad_total_semanal", 2, 0).over(windowSpec)) \
    .withColumn("lag_4", lag("cantidad_total_semanal", 4, 0).over(windowSpec)) \
    .withColumn("media_movil_4_semanas", avg("cantidad_total_semanal").over(windowSpec.rowsBetween(-3, 0))) \
    .withColumn("stddev_movil_4_semanas", stddev("cantidad_total_semanal").over(windowSpec.rowsBetween(-3, 0)))

# --- 6. CREACIÓN DE FEATURES DE CALENDARIO ---
print("Creando features de calendario...")
df_con_features = df_con_features.withColumn("mes", floor((col("semana") - 1) / 4.34) + 1)

# --- 7. UNIR FEATURES ADICIONALES ---
print("Uniendo características de precio y semanas importantes...")
df_con_features = df_con_features.join(precios_producto, "produto", "left")
df_con_features = df_con_features.join(df_semanas_importantes, "semana", "left")

# Rellenar nulos al final para todas las columnas
df_listo_para_modelo = df_con_features.fillna(0)

print("\n¡Ingeniería de Features completada! (Ahora con todas las columnas)")
print("Muestra del DataFrame final:")
df_listo_para_modelo.show()

print("\nEsquema final del DataFrame:")
df_listo_para_modelo.printSchema()

Cargando datos limpios desde la capa Silver...


                                                                                

Calculando feature de precio promedio por producto...
Creando feature de semanas importantes...
Agregando transacciones a nivel semanal...
Reincorporando las features descriptivas...
Creando features de Lag y Ventana Móvil...
Creando features de calendario...
Uniendo características de precio y semanas importantes...

¡Ingeniería de Features completada! (Ahora con todas las columnas)
Muestra del DataFrame final:


[Stage 21:>                                                         (0 + 1) / 1]

+------+-------------------+-------------------+----------------------+---------+---------+------+------------+--------------------+----------+-------------+-----+-----+-----+---------------------+----------------------+---+--------------------+--------------------+
|semana|            produto|                pdv|cantidad_total_semanal|trimestre|categoria| label|subcategoria|               marca|   premise|categoria_pdv|lag_1|lag_2|lag_4|media_movil_4_semanas|stddev_movil_4_semanas|mes|precio_promedio_prod|es_semana_importante|
+------+-------------------+-------------------+----------------------+---------+---------+------+------------+--------------------+----------+-------------+-----+-----+-----+---------------------+----------------------+---+--------------------+--------------------+
|     6|1837429607327399565|1000237487041964405|                   1.0|      1.0|  PACKAGE|  CORE|         IPA|FIREMAKERPERFECTM...| ONPREMISE|       WINERY|  0.0|  0.0|  0.0|                  1.0|  

                                                                                

In [7]:
from pyspark.ml.feature import StringIndexer, VectorAssembler, FeatureHasher
from pyspark.ml.regression import GBTRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col, when, sum as spark_sum, abs as spark_abs
# Importar las herramientas para el ajuste de hiperparámetros
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# --- 1. IDENTIFICAR TIPOS DE FEATURES ---
TARGET_COL = "cantidad_total_semanal"
CATEGORICAL_COLS = [
    "pdv", "produto", "categoria", "label", "subcategoria",
    "marca", "premise", "categoria_pdv"
]
# Asegúrate de incluir TODAS tus nuevas features aquí
NUMERICAL_COLS = [
    "semana", "lag_1", "lag_2", "lag_4",
    "media_movil_4_semanas", "stddev_movil_4_semanas", "mes",
    "precio_promedio_prod", "trimestre", "es_semana_importante"
]

# --- 2. DEFINIR EL PIPELINE (EL "BLUEPRINT" DE NUESTRO MODELO) ---
indexers = [StringIndexer(inputCol=c, outputCol=f"{c}_idx", handleInvalid="keep") for c in CATEGORICAL_COLS]
indexed_cols = [f"{c}_idx" for c in CATEGORICAL_COLS]
hasher = FeatureHasher(inputCols=indexed_cols, outputCol="hashed_features", numFeatures=1024)
feature_sources = ["hashed_features"] + NUMERICAL_COLS
assembler = VectorAssembler(inputCols=feature_sources, outputCol="features")
gbt = GBTRegressor(featuresCol="features", labelCol=TARGET_COL, seed=42) # Añadir una semilla para reproducibilidad
pipeline = Pipeline(stages=indexers + [hasher, assembler, gbt])

# --- 3. DIVIDIR LOS DATOS ---
print("Dividiendo los datos en entrenamiento y prueba...")
train_data = df_listo_para_modelo.filter(col("semana") <= 40)
test_data = df_listo_para_modelo.filter( (col("semana") > 40) & (col("semana") < 50) )

# --- 4. BÚSQUEDA DE HIPERPARÁMETROS (GRID SEARCH) ---
print("\nIniciando la búsqueda de hiperparámetros en una muestra...")

# Paso 1: Definir la cuadrícula de parámetros a probar.
# Empieza con pocas opciones para que sea rápido. Puedes ampliarla después.
# Paso 1: Definir la cuadrícula de parámetros a probar.
paramGrid = ParamGridBuilder() \
    .addGrid(gbt.maxDepth, [4, 6]) \
    .addGrid(gbt.maxBins, [32, 64]) \
    .addGrid(gbt.stepSize, [0.1, 0.05]) \
    .build()

# Paso 2: Definir el evaluador. Usaremos RMSE para la búsqueda.
evaluator = RegressionEvaluator(labelCol=TARGET_COL, predictionCol="prediction", metricName="rmse")

# Paso 3: Configurar el CrossValidator.
# Dividirá la muestra en 3 partes, entrenará en 2 y validará en 1, rotando las partes.
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3,
                          parallelism=4) # Aumenta el paralelismo si tienes múltiples núcleos de CPU

# Paso 4: Tomar una muestra de los datos para que la búsqueda sea rápida.
print("Creando una muestra del 10% de los datos de entrenamiento...")
train_sample = train_data.sample(withReplacement=False, fraction=0.1, seed=42).cache()
print(f"Iniciando CrossValidator en una muestra de {train_sample.count()} filas...")

# Paso 5: Ejecutar la búsqueda. Este es el paso que entrena múltiples modelos.
cvModel = crossval.fit(train_sample)
train_sample.unpersist() # Liberar la memoria
print("¡Búsqueda de hiperparámetros completada!")

# Puedes inspeccionar los mejores parámetros encontrados
best_gbt_model = cvModel.bestModel.stages[-1]
print("\nMejores Hiperparámetros encontrados:")
print(f"- maxDepth: {best_gbt_model.getMaxDepth()}")
print(f"- maxBins: {best_gbt_model.getMaxBins()}")
print(f"- stepSize: {best_gbt_model.getStepSize()}")

# Ahora puedes proceder a re-entrenar tu modelo con estos parámetros y el dataset completo.

Dividiendo los datos en entrenamiento y prueba...

Iniciando la búsqueda de hiperparámetros en una muestra...
Creando una muestra del 10% de los datos de entrenamiento...


                                                                                

Iniciando CrossValidator en una muestra de 429754 filas...


25/09/17 14:52:48 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/09/17 14:52:48 WARN DAGScheduler: Broadcasting large task binary with size 1037.8 KiB
25/09/17 14:52:50 WARN DAGScheduler: Broadcasting large task binary with size 1037.7 KiB
25/09/17 14:52:50 WARN DAGScheduler: Broadcasting large task binary with size 1037.8 KiB
25/09/17 14:52:51 WARN DAGScheduler: Broadcasting large task binary with size 1037.8 KiB
25/09/17 14:52:52 WARN DAGScheduler: Broadcasting large task binary with size 1037.7 KiB
25/09/17 14:52:52 WARN DAGScheduler: Broadcasting large task binary with size 1037.7 KiB
25/09/17 14:52:53 WARN DAGScheduler: Broadcasting large task binary with size 1051.7 KiB
25/09/17 14:52:53 WARN DAGScheduler: Broadcasting large task binary with size 1051.7 KiB
25/09/17 14:52:55 WARN DAGScheduler: Broadcasting large task binary with size 1051.7 KiB
25/09/17 1

¡Búsqueda de hiperparámetros completada!

Mejores Hiperparámetros encontrados:
- maxDepth: 4
- maxBins: 64
- stepSize: 0.05


                                                                                

In [8]:
from pyspark.ml.feature import StringIndexer, VectorAssembler, FeatureHasher
from pyspark.ml.regression import GBTRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col, when, round, sum as spark_sum, abs as spark_abs, lit, avg, stddev, countDistinct

# --- 1. IDENTIFICAR TIPOS DE FEATURES ---
# Asegúrate de que df_listo_para_modelo esté disponible desde tu notebook anterior
TARGET_COL = "cantidad_total_semanal"
CATEGORICAL_COLS = [
    "pdv", "produto", "categoria", "label", "subcategoria",
    "marca", "premise", "categoria_pdv"
]
NUMERICAL_COLS = [
    "semana", "lag_1", "lag_2", "lag_4",
    "media_movil_4_semanas", "stddev_movil_4_semanas", "mes",
    "precio_promedio_prod", "trimestre", "es_semana_importante"
]

# --- 2. DEFINIR EL PIPELINE FINAL CON LOS MEJORES HIPERPARÁMETROS ---
print("Configurando el pipeline final con los mejores hiperparámetros...")

indexers = [StringIndexer(inputCol=c, outputCol=f"{c}_idx", handleInvalid="keep") for c in CATEGORICAL_COLS]
indexed_cols = [f"{c}_idx" for c in CATEGORICAL_COLS]
hasher = FeatureHasher(inputCols=indexed_cols, outputCol="hashed_features", numFeatures=1024)
feature_sources = ["hashed_features"] + NUMERICAL_COLS
assembler = VectorAssembler(inputCols=feature_sources, outputCol="features")

# Aplicamos los mejores parámetros que encontraste
gbt_final = GBTRegressor(featuresCol="features", labelCol=TARGET_COL,
                         maxDepth=4,
                         maxBins=64,
                         stepSize=0.05,
                         seed=42)

final_pipeline = Pipeline(stages=indexers + [hasher, assembler, gbt_final])

# --- 3. ENTRENAR EL MODELO FINAL CON TODOS LOS DATOS ---
print("Dividiendo los datos...")
train_data = df_listo_para_modelo.filter(col("semana") <= 40)
test_data = df_listo_para_modelo.filter( (col("semana") > 40) & (col("semana") < 50) )

print(f"Entrenando el modelo final en las {train_data.count()} filas completas de entrenamiento...")
final_model = final_pipeline.fit(train_data)
print("¡Modelo final entrenado!")

# --- 4. EVALUAR EL MODELO FINAL ---
print("\nEvaluando el modelo final en el conjunto de prueba...")
predictions = final_model.transform(test_data)
predictions = predictions.withColumn("prediction", when(col("prediction") < 0, 0).otherwise(col("prediction")))

evaluator = RegressionEvaluator(labelCol=TARGET_COL, predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"RMSE del modelo final = {rmse}")

wmape_df = predictions.agg(
    (spark_sum(spark_abs(col(TARGET_COL) - col("prediction"))) / spark_sum(col(TARGET_COL))).alias("wmape")
)
wmape = wmape_df.collect()[0]["wmape"]
print(f"WMAPE del modelo final = {wmape * 100:.2f}%")



Configurando el pipeline final con los mejores hiperparámetros...
Dividiendo los datos...


                                                                                

Entrenando el modelo final en las 4296393 filas completas de entrenamiento...


25/09/17 16:18:29 WARN DAGScheduler: Broadcasting large task binary with size 1392.0 KiB
25/09/17 16:18:30 WARN DAGScheduler: Broadcasting large task binary with size 1391.9 KiB
25/09/17 16:18:40 WARN DAGScheduler: Broadcasting large task binary with size 1405.9 KiB
25/09/17 16:18:49 WARN DAGScheduler: Broadcasting large task binary with size 1419.9 KiB
25/09/17 16:18:53 WARN MemoryStore: Not enough space to cache rdd_14744_8 in memory! (computed 161.8 MiB so far)
25/09/17 16:18:53 WARN MemoryStore: Not enough space to cache rdd_14744_3 in memory! (computed 242.8 MiB so far)
25/09/17 16:18:53 WARN MemoryStore: Not enough space to cache rdd_14744_9 in memory! (computed 161.8 MiB so far)
25/09/17 16:18:53 WARN MemoryStore: Not enough space to cache rdd_14744_1 in memory! (computed 242.8 MiB so far)
25/09/17 16:18:53 WARN MemoryStore: Not enough space to cache rdd_14744_7 in memory! (computed 242.8 MiB so far)
25/09/17 16:18:53 WARN MemoryStore: Not enough space to cache rdd_14744_2 in me



25/09/17 16:25:45 WARN MemoryStore: Not enough space to cache rdd_14744_14 in memory! (computed 576.2 MiB so far)
25/09/17 16:25:47 WARN MemoryStore: Not enough space to cache rdd_14744_2 in memory! (computed 576.2 MiB so far)
25/09/17 16:25:50 WARN DAGScheduler: Broadcasting large task binary with size 1479.1 KiB
25/09/17 16:25:50 WARN MemoryStore: Not enough space to cache rdd_14744_15 in memory! (computed 71.9 MiB so far)
25/09/17 16:25:50 WARN MemoryStore: Not enough space to cache rdd_14744_11 in memory! (computed 107.9 MiB so far)
25/09/17 16:25:50 WARN MemoryStore: Not enough space to cache rdd_14744_12 in memory! (computed 107.9 MiB so far)
25/09/17 16:25:50 WARN MemoryStore: Not enough space to cache rdd_14744_2 in memory! (computed 107.9 MiB so far)
25/09/17 16:25:50 WARN MemoryStore: Not enough space to cache rdd_14744_10 in memory! (computed 30.5 MiB so far)
25/09/17 16:25:50 WARN MemoryStore: Not enough space to cache rdd_14744_5 in memory! (computed 107.9 MiB so far)
25/0



25/09/17 16:26:25 WARN MemoryStore: Not enough space to cache rdd_14744_0 in memory! (computed 364.2 MiB so far)
25/09/17 16:26:25 WARN MemoryStore: Not enough space to cache rdd_14744_5 in memory! (computed 364.2 MiB so far)
25/09/17 16:26:25 WARN MemoryStore: Not enough space to cache rdd_14744_9 in memory! (computed 161.8 MiB so far)
25/09/17 16:26:25 WARN MemoryStore: Not enough space to cache rdd_14744_14 in memory! (computed 364.2 MiB so far)
25/09/17 16:26:25 WARN MemoryStore: Not enough space to cache rdd_14744_7 in memory! (computed 107.9 MiB so far)
25/09/17 16:26:25 WARN MemoryStore: Not enough space to cache rdd_14744_15 in memory! (computed 161.8 MiB so far)
25/09/17 16:26:25 WARN MemoryStore: Not enough space to cache rdd_14744_8 in memory! (computed 242.8 MiB so far)
25/09/17 16:26:25 WARN MemoryStore: Not enough space to cache rdd_14744_13 in memory! (computed 242.8 MiB so far)
25/09/17 16:26:25 WARN MemoryStore: Not enough space to cache rdd_14744_2 in memory! (compute

¡Modelo final entrenado!

Evaluando el modelo final en el conjunto de prueba...


25/09/17 16:30:44 WARN DAGScheduler: Broadcasting large task binary with size 1394.7 KiB
25/09/17 16:30:49 WARN DAGScheduler: Broadcasting large task binary with size 1395.8 KiB
                                                                                

RMSE del modelo final = 7.88536696216952


25/09/17 16:31:03 WARN DAGScheduler: Broadcasting large task binary with size 1164.3 KiB

WMAPE del modelo final = 30.59%


                                                                                

In [9]:

# --- 5. PREPARAR DATOS PARA LA SUMISIÓN DE ENERO 2023 ---
print("\nFiltrando combinaciones de PDV/Produto con actividad frecuente y reciente...")

df_reciente = df_listo_para_modelo.filter(col("semana") > 42)
pdv_produto_frecuentes = df_reciente.groupBy("pdv", "produto") \
                                    .agg(countDistinct("semana").alias("num_semanas_vendidas"))
pdv_produto_recientes = pdv_produto_frecuentes.filter(col("num_semanas_vendidas") >= 2) \
                                              .select("pdv", "produto")
pdv_produto_recientes.cache()
print(f"Se encontraron {pdv_produto_recientes.count()} combinaciones activas y frecuentes para la predicción.")

semanas_enero = spark.createDataFrame([(1,), (2,), (3,), (4,), (5,)], ["semana"])
df_enero_2023 = pdv_produto_recientes.crossJoin(semanas_enero)

print("Generando características para las predicciones de Enero...")
df_ultimas_semanas = df_listo_para_modelo.filter(col("semana") >= 49)
last_features = df_ultimas_semanas.groupBy("pdv", "produto") \
    .agg(
        avg("cantidad_total_semanal").alias("media_movil_4_semanas"),
        stddev("cantidad_total_semanal").alias("stddev_movil_4_semanas"),
        avg("cantidad_total_semanal").alias("lag_1"),
        avg("cantidad_total_semanal").alias("lag_2"),
        avg("cantidad_total_semanal").alias("lag_4"),
        # Necesitamos llevar las nuevas features al futuro
        lit(1).alias("trimestre"),
        lit(1).alias("es_semana_importante") # Asumimos que la primera semana de Enero es importante
    ).fillna(0)

df_enero_con_features = df_enero_2023.join(last_features, ["pdv", "produto"], "left") \
                                     .withColumn("mes", lit(1))

features_produto_desc = df_listo_para_modelo.select("produto", "categoria", "label", "subcategoria", "marca", "precio_promedio_prod").distinct()
features_pdv_desc = df_listo_para_modelo.select("pdv", "premise", "categoria_pdv").distinct()

df_enero_final = df_enero_con_features.join(features_produto_desc, "produto", "left") \
                                      .join(features_pdv_desc, "pdv", "left") \
                                      .fillna(0)
pdv_produto_recientes.unpersist()

# --- 6. GENERAR Y GUARDAR ARCHIVO DE SUMISIÓN ---
print("Realizando predicciones para Enero 2023...")
predicciones_enero = final_model.transform(df_enero_final)

df_submission = predicciones_enero.select(
    col("semana"),
    col("pdv"),
    col("produto"),
    when(col("prediction") < 0, 0).otherwise(round(col("prediction"))).cast("integer").alias("quantidade")
)

df_submission = df_submission.filter(col("quantidade") > 0)

print(f"Número final de filas a guardar: {df_submission.count()}")
df_submission.show(10)

submission_path = "/home/quind/GIT/Desafio-Tecnico-Hackathon-Forecast-Big-Data-2025/submission_parquet_2"
print(f"Guardando archivo de submisión en formato Parquet en: {submission_path}")

df_submission.repartition(1).write.mode("overwrite").parquet(submission_path)

print("¡Archivo de submisión final generado exitosamente!")


Filtrando combinaciones de PDV/Produto con actividad frecuente y reciente...


                                                                                

Se encontraron 272186 combinaciones activas y frecuentes para la predicción.
Generando características para las predicciones de Enero...
Realizando predicciones para Enero 2023...


25/09/17 16:32:07 WARN DAGScheduler: Broadcasting large task binary with size 1548.0 KiB
                                                                                

Número final de filas a guardar: 1360930


25/09/17 16:33:02 WARN DAGScheduler: Broadcasting large task binary with size 1546.2 KiB
                                                                                

+------+-------------------+-------------------+----------+
|semana|                pdv|            produto|quantidade|
+------+-------------------+-------------------+----------+
|     1|1004779246734143594|1029370090212151375|         2|
|     2|1004779246734143594|1029370090212151375|         2|
|     3|1004779246734143594|1029370090212151375|         2|
|     4|1004779246734143594|1029370090212151375|         2|
|     5|1004779246734143594|1029370090212151375|         2|
|     1|1004779246734143594|1657665165780983454|        11|
|     2|1004779246734143594|1657665165780983454|        11|
|     3|1004779246734143594|1657665165780983454|        11|
|     4|1004779246734143594|1657665165780983454|        11|
|     5|1004779246734143594|1657665165780983454|        11|
+------+-------------------+-------------------+----------+
only showing top 10 rows
Guardando archivo de submisión en formato Parquet en: /home/quind/GIT/Desafio-Tecnico-Hackathon-Forecast-Big-Data-2025/submission_parqu

25/09/17 16:33:48 WARN DAGScheduler: Broadcasting large task binary with size 1539.3 KiB
[Stage 7556:>                                                       (0 + 1) / 1]

¡Archivo de submisión final generado exitosamente!


                                                                                

## XGBoost

In [3]:
pip install xgboost


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install pyarrow

Collecting pyarrow
  Using cached pyarrow-21.0.0-cp313-cp313-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Using cached pyarrow-21.0.0-cp313-cp313-manylinux_2_28_x86_64.whl (42.8 MB)
Installing collected packages: pyarrow
Successfully installed pyarrow-21.0.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
from pyspark.ml.feature import StringIndexer, VectorAssembler, FeatureHasher
from xgboost.spark import SparkXGBRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col, when, round, sum as spark_sum, abs as spark_abs, lit, avg, stddev, countDistinct
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# --- 1. IDENTIFICAR TIPOS DE FEATURES ---
TARGET_COL = "cantidad_total_semanal"
CATEGORICAL_COLS = [
    "pdv", "produto", "categoria", "label", "subcategoria",
    "marca", "premise", "categoria_pdv"
]
NUMERICAL_COLS = [
    "semana", "lag_1", "lag_2", "lag_4",
    "media_movil_4_semanas", "stddev_movil_4_semanas", "mes",
    "precio_promedio_prod", "trimestre", "es_semana_importante"
]

# --- 2. DEFINIR EL PIPELINE CON XGBOOST ---
indexers = [StringIndexer(inputCol=c, outputCol=f"{c}_idx", handleInvalid="keep") for c in CATEGORICAL_COLS]
indexed_cols = [f"{c}_idx" for c in CATEGORICAL_COLS]
hasher = FeatureHasher(inputCols=indexed_cols, outputCol="hashed_features", numFeatures=1024)
feature_sources = ["hashed_features"] + NUMERICAL_COLS
assembler = VectorAssembler(inputCols=feature_sources, outputCol="features")

xgb = SparkXGBRegressor(
    features_col="features",
    label_col=TARGET_COL,
    seed=42
)

pipeline_xgb = Pipeline(stages=indexers + [hasher, assembler, xgb])

# --- 3. BÚSQUEDA DE HIPERPARÁMETROS PARA XGBOOST ---
print("\nIniciando la búsqueda de hiperparámetros para XGBoost en una muestra...")

# CAMBIO CRÍTICO: Se reemplazó 'eta' por 'learning_rate'
paramGrid_xgb = ParamGridBuilder() \
    .addGrid(xgb.max_depth, [4, 6]) \
    .addGrid(xgb.n_estimators, [20, 50]) \
    .addGrid(xgb.learning_rate, [0.1, 0.05]) \
    .build()

evaluator = RegressionEvaluator(labelCol=TARGET_COL, predictionCol="prediction", metricName="rmse")
crossval_xgb = CrossValidator(estimator=pipeline_xgb,
                              estimatorParamMaps=paramGrid_xgb,
                              evaluator=evaluator,
                              numFolds=3)

train_data = df_listo_para_modelo.filter(col("semana") <= 40)
test_data = df_listo_para_modelo.filter( (col("semana") > 40) & (col("semana") < 50) )

train_sample = train_data.sample(withReplacement=False, fraction=0.1, seed=42).cache()
print(f"Iniciando CrossValidator en una muestra de {train_sample.count()} filas...")

cvModel_xgb = crossval_xgb.fit(train_sample)
train_sample.unpersist()
print("¡Búsqueda de hiperparámetros para XGBoost completada!")

# Imprimir los mejores hiperparámetros encontrados
best_xgb_model = cvModel_xgb.bestModel.stages[-1]
print("\nMejores Hiperparámetros encontrados para XGBoost:")
print(f"- max_depth: {best_xgb_model.getOrDefault('max_depth')}")
print(f"- n_estimators: {best_xgb_model.getOrDefault('n_estimators')}")
print(f"- learning_rate: {best_xgb_model.getOrDefault('learning_rate')}") # CAMBIO: Imprimir 'learning_rate'

# --- 4. ENTRENAR EL MODELO FINAL DE XGBOOST ---
print("\nEntrenando el modelo final de XGBoost con los mejores parámetros...")

best_params = best_xgb_model.extractParamMap()
final_xgb_estimator = xgb.copy(extra=best_params)
final_pipeline_xgb = Pipeline(stages=[*indexers, hasher, assembler, final_xgb_estimator])

print(f"Entrenando en el dataset completo de {train_data.count()} filas...")
final_model_xgb = final_pipeline_xgb.fit(train_data)
print("¡Modelo final de XGBoost entrenado!")

# --- 5. EVALUAR Y GENERAR SUMISIÓN ---
# (Esta parte no necesita cambios)
print("\nEvaluando el modelo final de XGBoost en el conjunto de prueba...")
predictions = final_model_xgb.transform(test_data)
predictions = predictions.withColumn("prediction", when(col("prediction") < 0, 0).otherwise(col("prediction")))
rmse = evaluator.evaluate(predictions)
print(f"RMSE del modelo final XGBoost = {rmse}")
wmape_df = predictions.agg(
    (spark_sum(spark_abs(col(TARGET_COL) - col("prediction"))) / spark_sum(col(TARGET_COL))).alias("wmape")
)
wmape = wmape_df.collect()[0]["wmape"]
print(f"WMAPE del modelo final XGBoost = {wmape * 100:.2f}%")




Iniciando la búsqueda de hiperparámetros para XGBoost en una muestra...


25/09/17 21:07:33 WARN CacheManager: Asked to cache already cached data.
25/09/17 21:07:35 WARN CacheManager: Asked to cache already cached data.        
25/09/17 21:07:35 WARN CacheManager: Asked to cache already cached data.


Iniciando CrossValidator en una muestra de 429754 filas...


INFO:XGBoost-PySpark:Running xgboost-3.0.5 on 1 workers with                    
	booster params: {'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 4, 'objective': 'reg:squarederror', 'seed': 42, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 20}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2025-09-17 21:08:31,874 INFO XGBoost-PySpark: _train_booster Training on CPUs 1]
[21:08:32] Task 0 got rank 0
[21:08:52] [0]	training-rmse:7.48334
[21:08:52] [1]	training-rmse:6.91248
[21:08:53] [2]	training-rmse:6.41026
[21:08:53] [3]	training-rmse:5.97096
[21:08:53] [4]	training-rmse:5.58715
[21:08:54] [5]	training-rmse:5.25433
[21:08:54] [6]	training-rmse:4.96709
[21:08:54] [7]	training-rmse:4.71980
[21:08:55] [8]	training-rmse:4.50759
[21:08:55] [9]	training-rmse:4.32633
[21:08:55] [10]	training-rmse:4.17145
[21:08:56] [11]	training-rmse:4.04028
[21:08:56] [12]	training-rmse:3.92912
[21:08:56] [13]	training-rmse:3.83157
[21:08:57] [14]	training-rmse:3.75

¡Búsqueda de hiperparámetros para XGBoost completada!

Mejores Hiperparámetros encontrados para XGBoost:
- max_depth: 6
- n_estimators: 50
- learning_rate: 0.1

Entrenando el modelo final de XGBoost con los mejores parámetros...


                                                                                

Entrenando en el dataset completo de 4296393 filas...


INFO:XGBoost-PySpark:Running xgboost-3.0.5 on 1 workers with      (15 + 2) / 17]
	booster params: {'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 6, 'objective': 'reg:squarederror', 'seed': 42, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
25/09/17 21:48:09 WARN DAGScheduler: Broadcasting large task binary with size 1158.3 KiB
2025-09-17 21:48:32,371 INFO XGBoost-PySpark: _train_booster Training on CPUs 1]
[21:48:33] Task 0 got rank 0


In [None]:
from pyspark.ml.feature import StringIndexer, VectorAssembler, FeatureHasher
from xgboost.spark import SparkXGBRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col, when, round, sum as spark_sum, abs as spark_abs, lit, avg, stddev, countDistinct

# --- 0. CREAR UNA MUESTRA PARA UN PROCESO MÁS RÁPIDO ---
# (Asegúrate de que df_listo_para_modelo esté disponible desde tu notebook de ingeniería de características)
print("Creando una muestra del 30% del dataset completo...")
df_listo_para_modelo_sample = df_listo_para_modelo.sample(withReplacement=False, fraction=0.35, seed=42)
df_listo_para_modelo_sample.cache() # Cachear la muestra para acelerar los siguientes pasos
print(f"La muestra contiene {df_listo_para_modelo_sample.count()} filas.")


# --- 1. IDENTIFICAR TIPOS DE FEATURES ---
TARGET_COL = "cantidad_total_semanal"
CATEGORICAL_COLS = [
    "pdv", "produto", "categoria", "label", "subcategoria",
    "marca", "premise", "categoria_pdv"
]
NUMERICAL_COLS = [
    "semana", "lag_1", "lag_2", "lag_4",
    "media_movil_4_semanas", "stddev_movil_4_semanas", "mes",
    "precio_promedio_prod", "trimestre", "es_semana_importante"
]

# --- 2. DEFINIR EL PIPELINE FINAL CON LOS MEJORES HIPERPARÁMETROS ---
print("Configurando el pipeline final con los mejores hiperparámetros...")
indexers = [StringIndexer(inputCol=c, outputCol=f"{c}_idx", handleInvalid="keep") for c in CATEGORICAL_COLS]
indexed_cols = [f"{c}_idx" for c in CATEGORICAL_COLS]
hasher = FeatureHasher(inputCols=indexed_cols, outputCol="hashed_features", numFeatures=1024)
feature_sources = ["hashed_features"] + NUMERICAL_COLS
assembler = VectorAssembler(inputCols=feature_sources, outputCol="features")

final_xgb_estimator = SparkXGBRegressor(
    features_col="features",
    label_col=TARGET_COL,
    max_depth=6,
    n_estimators=50,
    learning_rate=0.1,
    seed=42,
    tree_method='hist'
)
final_pipeline = Pipeline(stages=indexers + [hasher, assembler, final_xgb_estimator])

# --- 3. ENTRENAR EL MODELO FINAL (SOBRE LA MUESTRA) ---
print("Dividiendo la muestra para entrenamiento y prueba...")
# CAMBIO: Usamos el DataFrame de la muestra
train_data = df_listo_para_modelo_sample.filter(col("semana") <= 40)
test_data = df_listo_para_modelo_sample.filter( (col("semana") > 40) & (col("semana") < 50) )

print(f"Entrenando el modelo final en las {train_data.count()} filas de la muestra de entrenamiento...")
final_model = final_pipeline.fit(train_data)
print("¡Modelo final de XGBoost entrenado!")

# --- 4. EVALUAR EL MODELO FINAL ---
print("\nEvaluando el modelo final en el conjunto de prueba de la muestra...")
predictions = final_model.transform(test_data)
predictions = predictions.withColumn("prediction", when(col("prediction") < 0, 0).otherwise(col("prediction")))
evaluator = RegressionEvaluator(labelCol=TARGET_COL, predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"RMSE del modelo final XGBoost = {rmse}")
wmape_df = predictions.agg(
    (spark_sum(spark_abs(col(TARGET_COL) - col("prediction"))) / spark_sum(col(TARGET_COL))).alias("wmape")
)
wmape = wmape_df.collect()[0]["wmape"]
print(f"WMAPE del modelo final XGBoost = {wmape * 100:.2f}%")



Creando una muestra del 30% del dataset completo...


                                                                                

La muestra contiene 2001841 filas.
Configurando el pipeline final con los mejores hiperparámetros...
Dividiendo la muestra para entrenamiento y prueba...


                                                                                

Entrenando el modelo final en las 1503685 filas de la muestra de entrenamiento...


2025-09-18 10:36:22,580 INFO XGBoost-PySpark: _fit Running xgboost-3.0.5 on 1 workers with
	booster params: {'objective': 'reg:squarederror', 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 6, 'tree_method': 'hist', 'seed': 42, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
25/09/18 10:36:22 WARN DAGScheduler: Broadcasting large task binary with size 1134.6 KiB
2025-09-18 10:36:34,149 INFO XGBoost-PySpark: _train_booster Training on CPUs 1]
[10:36:35] Task 0 got rank 0
[Stage 159:>                                                        (0 + 1) / 1]

In [None]:
# --- 5. GUARDAR EL MODELO ENTRENADO ---
# (Ejecuta este bloque después de entrenar y evaluar tu 'final_model')

print("\nGuardando el pipeline del modelo entrenado...")

# Define la ruta donde quieres guardar el modelo.
# Es buena práctica crear una carpeta específica para los modelos.
model_path = "/home/quind/GIT/Desafio-Tecnico-Hackathon-Forecast-Big-Data-2025/models/xgboost_model_on_sample_2"

# Usa el método .save() para guardar el pipeline completo.
# El método .overwrite() se asegura de que puedas volver a guardarlo si haces cambios.
final_model.write().overwrite().save(model_path)

print(f"¡Modelo guardado exitosamente en: {model_path}!")


Guardando el pipeline del modelo entrenado...


                                                                                

¡Modelo guardado exitosamente en: /home/quind/GIT/Desafio-Tecnico-Hackathon-Forecast-Big-Data-2025/models/xgboost_model_on_sample!


## Probar modelo

In [6]:
from pyspark.ml import PipelineModel
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col, when, sum as spark_sum, abs as spark_abs

# --- 1. CARGAR EL MODELO ENTRENADO ---
print("Cargando el pipeline del modelo entrenado...")
model_path = "/home/quind/GIT/Desafio-Tecnico-Hackathon-Forecast-Big-Data-2025/models/xgboost_model_on_sample"
final_model = PipelineModel.load(model_path)
print("¡Modelo cargado exitosamente!")

# --- 2. PREPARAR EL CONJUNTO DE PRUEBA (ÚLTIMAS 5 SEMANAS DE 2022) ---
# (Asegúrate de que df_listo_para_modelo esté disponible desde tu notebook anterior)
print("\nFiltrando los datos para obtener las últimas 5 semanas de 2022...")
test_data_final_2022 = df_listo_para_modelo.filter(col("semana") >= 48)
test_data_final_2022.cache() # Cachear para acelerar las siguientes operaciones
print(f"El conjunto de prueba tiene {test_data_final_2022.count()} filas.")

# --- 3. REALIZAR PREDICCIONES ---
print("\nRealizando predicciones sobre el conjunto de prueba...")
predictions = final_model.transform(test_data_final_2022)

# Limpiar las predicciones (asegurarse de que no sean negativas)
predictions = predictions.withColumn("prediction", when(col("prediction") < 0, 0).otherwise(col("prediction")))

# --- 4. EVALUAR EL RENDIMIENTO ---
print("\nCalculando métricas de rendimiento (RMSE y WMAPE)...")

# Calcular RMSE
evaluator = RegressionEvaluator(labelCol="cantidad_total_semanal", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"✅ Root Mean Squared Error (RMSE) = {rmse}")

# Calcular WMAPE
wmape_df = predictions.agg(
    (spark_sum(spark_abs(col("cantidad_total_semanal") - col("prediction"))) / spark_sum(col("cantidad_total_semanal"))).alias("wmape")
)
wmape = wmape_df.collect()[0]["wmape"]
print(f"✅ Weighted Mean Absolute Percentage Error (WMAPE) = {wmape * 100:.2f}%")

# --- 5. MOSTRAR MUESTRA DE PREDICCIONES VS. REALES ---
print("\nMostrando 15 ejemplos de predicciones vs. valores reales:")
predictions.select("semana", "pdv", "produto", "cantidad_total_semanal", "prediction").show(15)

# Liberar la memoria del DataFrame de prueba
test_data_final_2022.unpersist()

Cargando el pipeline del modelo entrenado...
¡Modelo cargado exitosamente!

Filtrando los datos para obtener las últimas 5 semanas de 2022...


                                                                                

El conjunto de prueba tiene 603571 filas.

Realizando predicciones sobre el conjunto de prueba...

Calculando métricas de rendimiento (RMSE y WMAPE)...


25/09/18 09:54:49 WARN DAGScheduler: Broadcasting large task binary with size 1520.0 KiB
2025-09-18 09:54:54,333 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
25/09/18 09:55:00 WARN DAGScheduler: Broadcasting large task binary with size 1521.1 KiB
25/09/18 09:55:01 WARN DAGScheduler: Broadcasting large task binary with size 1512.5 KiB


✅ Root Mean Squared Error (RMSE) = 7.745593322802074


2025-09-18 09:55:01,170 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
                                                                                

✅ Weighted Mean Absolute Percentage Error (WMAPE) = 23.73%

Mostrando 15 ejemplos de predicciones vs. valores reales:


25/09/18 09:55:08 WARN DAGScheduler: Broadcasting large task binary with size 1510.5 KiB
2025-09-18 09:55:08,882 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs


+------+-------------------+-------------------+----------------------+------------------+
|semana|                pdv|            produto|cantidad_total_semanal|        prediction|
+------+-------------------+-------------------+----------------------+------------------+
|    48|1036357982208856619|1029370090212151375|                   2.0|1.8734204769134521|
|    49|1036357982208856619|1029370090212151375|                   2.0|3.3112733364105225|
|    50|1036357982208856619|1029370090212151375|                   1.0|1.1412255764007568|
|    51|1036357982208856619|1029370090212151375|                   1.0| 1.525991439819336|
|    52|1036357982208856619|1029370090212151375|                   1.0|1.3955345153808594|
|    49|1036357982208856619|1288112997726060694|                   1.0|1.0409570932388306|
|    52|1036357982208856619|1288112997726060694|                   1.0|1.0409570932388306|
|    50|1036357982208856619| 145852603040678098|                   1.0|1.0409570932388306|

DataFrame[semana: int, produto: string, pdv: string, cantidad_total_semanal: double, trimestre: double, categoria: string, label: string, subcategoria: string, marca: string, premise: string, categoria_pdv: string, lag_1: double, lag_2: double, lag_4: double, media_movil_4_semanas: double, stddev_movil_4_semanas: double, mes: bigint, precio_promedio_prod: double, es_semana_importante: int]

## Realizar predicciones con XGB para 2023
fue necesario particionar en 10 tandas para que no ocurriera un OOM

In [None]:
from pyspark.sql.functions import lit, col, when, round, avg, stddev, countDistinct, sum as spark_sum, abs as spark_abs, coalesce
from pyspark.sql.window import Window

# --- 5. PREPARACIÓN PARA PREDICCIÓN ITERATIVA ---
print("\nIniciando preparación para la predicción iterativa de Enero 2023...")

# 1. Seleccionar los candidatos a predecir (igual que antes)
df_reciente = df_listo_para_modelo.filter(col("semana") > 42)
pdv_produto_frecuentes = df_reciente.groupBy("pdv", "produto").agg(countDistinct("semana").alias("num_semanas_vendidas"))
pdv_produto_recientes = pdv_produto_frecuentes.filter(col("num_semanas_vendidas") >= 2).select("pdv", "produto")
pdv_produto_recientes.cache()
print(f"Se encontraron {pdv_produto_recientes.count()} combinaciones activas y frecuentes para la predicción.")

# 2. Preparar el historial de ventas de 2022
# Necesitamos una tabla con [pdv, produto, semana, cantidad] para poder calcular los lags
historial_ventas = df_listo_para_modelo.select("pdv", "produto", "semana", col("cantidad_total_semanal").alias("cantidad"))

# 3. Inicializar el DataFrame de predicciones
predicciones_finales = spark.createDataFrame([], historial_ventas.schema) # DataFrame vacío con la misma estructura

# --- 6. BUCLE DE PREDICCIÓN ITERATIVA ---
for semana_a_predecir in range(1, 6):
    print(f"\n--- Generando predicciones para la Semana {semana_a_predecir} de 2023 ---")
    
    # DataFrame para la semana actual que queremos predecir
    df_semana_actual = pdv_produto_recientes.withColumn("semana", lit(semana_a_predecir))

    # Combinar historial real con las predicciones ya hechas en semanas anteriores
    historial_actualizado = historial_ventas.union(predicciones_finales)

    # Crear la ventana para calcular lags sobre el historial combinado
    windowSpec = Window.partitionBy("pdv", "produto").orderBy(col("semana").desc())

    # Calcular lags dinámicamente
    lags = historial_actualizado.withColumn("row_num", row_number().over(windowSpec)) \
        .groupBy("pdv", "produto") \
        .pivot("row_num", [1, 2, 4]) \
        .agg(first("cantidad")) \
        .withColumnRenamed("1", "lag_1") \
        .withColumnRenamed("2", "lag_2") \
        .withColumnRenamed("4", "lag_4")
        
    # Calcular medias móviles dinámicamente
    window_movil = Window.partitionBy("pdv", "produto").orderBy(col("semana").desc()).rowsBetween(0, 3)
    medias_moviles = historial_actualizado.withColumn("media_movil_4_semanas", avg("cantidad").over(window_movil)) \
                                          .withColumn("stddev_movil_4_semanas", stddev("cantidad").over(window_movil)) \
                                          .groupBy("pdv", "produto") \
                                          .agg(first("media_movil_4_semanas").alias("media_movil_4_semanas"),
                                               first("stddev_movil_4_semanas").alias("stddev_movil_4_semanas"))

    # Ensamblar todas las características para la semana actual
    df_para_predecir = df_semana_actual.join(lags, ["pdv", "produto"], "left") \
                                       .join(medias_moviles, ["pdv", "produto"], "left") \
                                       .withColumn("mes", lit(1)) \
                                       .withColumn("trimestre", lit(1)) \
                                       .withColumn("es_semana_importante", when(col("semana") == 1, 1).otherwise(0))
    
    # Unir features descriptivas
    features_produto_desc = df_listo_para_modelo.select("produto", "categoria", "label", "subcategoria", "marca", "precio_promedio_prod").distinct()
    features_pdv_desc = df_listo_para_modelo.select("pdv", "premise", "categoria_pdv").distinct()
    df_para_predecir = df_para_predecir.join(features_produto_desc, "produto", "left").join(features_pdv_desc, "pdv", "left").fillna(0)
    
    # Realizar la predicción
    predicciones_semana = final_model.transform(df_para_predecir)
    
    # Formatear y guardar la predicción de esta semana
    prediccion_actual = predicciones_semana.select(
        "pdv", "produto", "semana",
        when(col("prediction") < 0, 0).otherwise(round(col("prediction"))).cast("integer").alias("cantidad")
    )
    
    # Añadir la predicción de esta semana al histórico de predicciones
    predicciones_finales = predicciones_finales.union(prediccion_actual)

print("\n--- Predicciones para todas las semanas generadas ---")
pdv_produto_recientes.unpersist()

# --- 7. GUARDAR ARCHIVO DE SUMISIÓN FINAL ---
df_submission = predicciones_finales.filter(col("cantidad") > 0).withColumnRenamed("cantidad", "quantidade")
print(f"Número total de filas a guardar: {df_submission.count()}")
submission_path = "/home/quind/GIT/Desafio-Tecnico-Hackathon-Forecast-Big-Data-2025/submission_iterativa_parquet"
print(f"Guardando archivo de submisión en: {submission_path}")

df_submission.repartition(1).write.mode("overwrite").parquet(submission_path)
print("¡Archivo de submisión final generado exitosamente!")

Cargando el pipeline del modelo entrenado...
¡Modelo cargado exitosamente!

Preparando el DataFrame completo para la predicción de Enero...
DataFrame de Enero preparado.
Dividiendo el DataFrame de predicción en 10 partes...

--- Procesando la Parte 1/10 ---


                                                                                

Procesando la Parte 1...


25/09/18 09:34:48 WARN DAGScheduler: Broadcasting large task binary with size 1722.2 KiB
2025-09-18 09:34:56,316 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2025-09-18 09:34:56,605 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
25/09/18 09:35:03 WARN DAGScheduler: Broadcasting large task binary with size 1726.7 KiB
                                                                                

Guardando 136415 filas de la Parte 1 en: /home/quind/GIT/Desafio-Tecnico-Hackathon-Forecast-Big-Data-2025/submission_temp/part1


25/09/18 09:35:06 WARN DAGScheduler: Broadcasting large task binary with size 1947.5 KiB
                                                                                

Parte 1 guardada.

--- Procesando la Parte 2/10 ---


                                                                                

Procesando la Parte 2...


25/09/18 09:36:12 WARN DAGScheduler: Broadcasting large task binary with size 1730.6 KiB
2025-09-18 09:36:19,573 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2025-09-18 09:36:20,298 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
25/09/18 09:36:27 WARN DAGScheduler: Broadcasting large task binary with size 1735.2 KiB
                                                                                

Guardando 136471 filas de la Parte 2 en: /home/quind/GIT/Desafio-Tecnico-Hackathon-Forecast-Big-Data-2025/submission_temp/part2


25/09/18 09:36:31 WARN DAGScheduler: Broadcasting large task binary with size 1956.0 KiB
                                                                                

Parte 2 guardada.

--- Procesando la Parte 3/10 ---


                                                                                

Procesando la Parte 3...


25/09/18 09:37:35 WARN DAGScheduler: Broadcasting large task binary with size 1722.2 KiB
2025-09-18 09:37:41,880 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2025-09-18 09:37:41,969 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
25/09/18 09:37:49 WARN DAGScheduler: Broadcasting large task binary with size 1726.8 KiB
25/09/18 09:37:51 WARN DAGScheduler: Broadcasting large task binary with size 1947.6 KiB


Guardando 135618 filas de la Parte 3 en: /home/quind/GIT/Desafio-Tecnico-Hackathon-Forecast-Big-Data-2025/submission_temp/part3


                                                                                

Parte 3 guardada.

--- Procesando la Parte 4/10 ---


                                                                                

Procesando la Parte 4...


25/09/18 09:38:45 WARN DAGScheduler: Broadcasting large task binary with size 1710.7 KiB
2025-09-18 09:38:45,823 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2025-09-18 09:38:45,824 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
25/09/18 09:38:52 WARN DAGScheduler: Broadcasting large task binary with size 1715.2 KiB
                                                                                

Guardando 135913 filas de la Parte 4 en: /home/quind/GIT/Desafio-Tecnico-Hackathon-Forecast-Big-Data-2025/submission_temp/part4


25/09/18 09:38:54 WARN DAGScheduler: Broadcasting large task binary with size 1936.0 KiB
                                                                                

Parte 4 guardada.

--- Procesando la Parte 5/10 ---


                                                                                

Procesando la Parte 5...


25/09/18 09:39:59 WARN DAGScheduler: Broadcasting large task binary with size 1723.7 KiB
2025-09-18 09:40:13,850 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2025-09-18 09:40:14,979 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
25/09/18 09:40:20 WARN DAGScheduler: Broadcasting large task binary with size 1728.2 KiB
                                                                                

Guardando 136085 filas de la Parte 5 en: /home/quind/GIT/Desafio-Tecnico-Hackathon-Forecast-Big-Data-2025/submission_temp/part5


25/09/18 09:40:23 WARN DAGScheduler: Broadcasting large task binary with size 1949.0 KiB
                                                                                

Parte 5 guardada.

--- Procesando la Parte 6/10 ---


                                                                                

Procesando la Parte 6...


25/09/18 09:41:22 WARN DAGScheduler: Broadcasting large task binary with size 1724.5 KiB
2025-09-18 09:41:22,364 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2025-09-18 09:41:22,369 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
25/09/18 09:41:28 WARN DAGScheduler: Broadcasting large task binary with size 1729.1 KiB
25/09/18 09:41:31 WARN DAGScheduler: Broadcasting large task binary with size 1949.9 KiB


Guardando 136126 filas de la Parte 6 en: /home/quind/GIT/Desafio-Tecnico-Hackathon-Forecast-Big-Data-2025/submission_temp/part6


                                                                                

Parte 6 guardada.

--- Procesando la Parte 7/10 ---


                                                                                

Procesando la Parte 7...


25/09/18 09:42:35 WARN DAGScheduler: Broadcasting large task binary with size 1727.3 KiB
2025-09-18 09:42:35,399 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2025-09-18 09:42:35,405 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
25/09/18 09:42:41 WARN DAGScheduler: Broadcasting large task binary with size 1731.9 KiB
                                                                                

Guardando 136035 filas de la Parte 7 en: /home/quind/GIT/Desafio-Tecnico-Hackathon-Forecast-Big-Data-2025/submission_temp/part7


25/09/18 09:42:45 WARN DAGScheduler: Broadcasting large task binary with size 1952.7 KiB
                                                                                

Parte 7 guardada.

--- Procesando la Parte 8/10 ---


                                                                                ]

Procesando la Parte 8...


25/09/18 09:43:38 WARN DAGScheduler: Broadcasting large task binary with size 1684.2 KiB
2025-09-18 09:43:38,962 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2025-09-18 09:43:38,962 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
25/09/18 09:43:45 WARN DAGScheduler: Broadcasting large task binary with size 1688.8 KiB
25/09/18 09:43:47 WARN DAGScheduler: Broadcasting large task binary with size 1909.6 KiB


Guardando 136063 filas de la Parte 8 en: /home/quind/GIT/Desafio-Tecnico-Hackathon-Forecast-Big-Data-2025/submission_temp/part8


                                                                                

Parte 8 guardada.

--- Procesando la Parte 9/10 ---


                                                                                ]

Procesando la Parte 9...


25/09/18 09:44:45 WARN DAGScheduler: Broadcasting large task binary with size 1727.3 KiB
2025-09-18 09:44:45,377 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2025-09-18 09:44:45,383 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
25/09/18 09:44:52 WARN DAGScheduler: Broadcasting large task binary with size 1731.9 KiB
25/09/18 09:44:54 WARN DAGScheduler: Broadcasting large task binary with size 1952.6 KiB


Guardando 136012 filas de la Parte 9 en: /home/quind/GIT/Desafio-Tecnico-Hackathon-Forecast-Big-Data-2025/submission_temp/part9


                                                                                

Parte 9 guardada.

--- Procesando la Parte 10/10 ---


                                                                                ]

Procesando la Parte 10...


25/09/18 09:46:00 WARN DAGScheduler: Broadcasting large task binary with size 1727.3 KiB
2025-09-18 09:46:15,236 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2025-09-18 09:46:16,076 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
25/09/18 09:46:21 WARN DAGScheduler: Broadcasting large task binary with size 1731.9 KiB
25/09/18 09:46:24 WARN DAGScheduler: Broadcasting large task binary with size 1952.6 KiB


Guardando 136192 filas de la Parte 10 en: /home/quind/GIT/Desafio-Tecnico-Hackathon-Forecast-Big-Data-2025/submission_temp/part10


                                                                                

Parte 10 guardada.

¡Todas las partes han sido procesadas y guardadas exitosamente!


In [5]:
from pyspark.sql import DataFrame
from functools import reduce

print("Iniciando la unificación de los archivos de predicción...")

# Directorio base que contiene las carpetas de las partes
base_path_temp = "/home/quind/GIT/Desafio-Tecnico-Hackathon-Forecast-Big-Data-2025/submission_temp/"

# CAMBIO CLAVE: Usamos un patrón de comodín (*) para leer todas las carpetas 'part'.
# Esto es más robusto y evita problemas si alguna parte está vacía.
path_pattern = f"{base_path_temp}/part*"

# Ruta final para el archivo de sumisión
submission_path_final = "/home/quind/GIT/Desafio-Tecnico-Hackathon-Forecast-Big-Data-2025/submission_final_unificada"

# Leer todos los archivos Parquet que coinciden con el patrón
print(f"Leyendo todas las partes desde: {path_pattern}")
df_unificado = spark.read.parquet(path_pattern)

print(f"Total de filas unificadas: {df_unificado.count()}")

# Guardar como un único archivo Parquet usando repartition(1)
print(f"Guardando archivo final unificado en: {submission_path_final}")
df_unificado.repartition(1).write.mode("overwrite").parquet(submission_path_final)

print("¡Archivo de sumisión final unificado y generado exitosamente!")

Iniciando la unificación de los archivos de predicción...
Leyendo todas las partes desde: /home/quind/GIT/Desafio-Tecnico-Hackathon-Forecast-Big-Data-2025/submission_temp//part*


25/09/18 09:50:58 WARN FileStreamSink: Assume no metadata directory. Error while looking for metadata directory in the path: /home/quind/GIT/Desafio-Tecnico-Hackathon-Forecast-Big-Data-2025/submission_temp//part*.
java.io.FileNotFoundException: File /home/quind/GIT/Desafio-Tecnico-Hackathon-Forecast-Big-Data-2025/submission_temp/part* does not exist
	at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:917)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:1238)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:907)
	at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:462)
	at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:56)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:381)
	at org.apache.spark.sql.catalyst.analysis.ResolveDataSource.org$apache$spark$sql$catal

Total de filas unificadas: 1360930
Guardando archivo final unificado en: /home/quind/GIT/Desafio-Tecnico-Hackathon-Forecast-Big-Data-2025/submission_final_unificada


[Stage 1580:>                                                       (0 + 1) / 1]

¡Archivo de sumisión final unificado y generado exitosamente!


                                                                                