In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
import utils as utils
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from xgboost.spark import SparkXGBClassifier
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from builtins import min as python_min

In [2]:
spark = utils.create_context()

In [3]:
# Load weather data
#print("  Loading weather data...")
#df_weather = utils.read_iceberg_table( spark, "trusted", "aemetTrustedDiario")
#
#weather_count = df_weather.count()
#print(f"    Weather records: {weather_count}")

# Load hotel occupancy data
print("  Loading hotel occupancy data...")
df_hotels = utils.read_iceberg_table(spark=spark, db_name="exploitation", table_name="f_ocupacion_barcelona")
hotel_count = df_hotels.count()
print(f"    Hotel records: {hotel_count}")

print("✅ Data loaded successfully")

  Loading hotel occupancy data...
    Hotel records: 265
✅ Data loaded successfully


In [4]:
def create_hotel_features(df_hotels):
        """Create hotel-based features."""
        print("  Creating hotel features...")
        
        df_hotel_features = df_hotels.groupBy(
            col('año'),
            col('mes')
        ).agg(
            sum('viajeros').alias('hotel_viajeros'),
            sum('pernoctaciones').alias('hotel_pernoctaciones'),
            avg('estanciaMedia').alias('hotel_estancia_media'),
            avg('gradoOcupaPlazas').alias('avg_ocupacion')
        ).withColumn(
            # Hotel availability score
            'hotel_availability_score',
            100 - col('avg_ocupacion')
        )
        
        return df_hotel_features

In [7]:
#2. Generar features a partir de df_hotels
df_hotel_features = create_hotel_features(df_hotels)

# 🚨 Aquí necesitas una etiqueta (label).
# Ejemplo: 1 si es "buen momento", 0 si no. 
# Esto normalmente lo defines con criterios propios.
# Supongamos que un buen momento es cuando hotel_availability_score > 40
df_labeled = df_hotel_features.withColumn(
    "label", (col("hotel_availability_score") > 40).cast("int")
)

# 3. Seleccionar las features para el modelo
feature_cols = [
    "hotel_viajeros",
    "hotel_pernoctaciones",
    "hotel_estancia_media",
    "avg_ocupacion",
    "hotel_availability_score"
]

# VectorAssembler para convertir a vector de features
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features"
)
# Contar positivos (label=1) y negativos (label=0)
counts = df_labeled.groupBy("label").count().collect()

# Inicializamos variables
num_positivos = 0
num_negativos = 0

for row in counts:
    if row['label'] == 1:
        num_positivos = row['count']
    else:
        num_negativos = row['count']

# Ratio para scale_pos_weight
ratio_negativos_sobre_positivos = num_negativos / num_positivos
# 4. Definir modelo XGBoost
xgb = SparkXGBClassifier(
    features_col="features",       # <- snake_case
    label_col="label",
    prediction_col="prediction",
    probability_col="probability",
    num_round=50,
    max_depth=5,
    eta=0.1,
    scale_pos_weight=ratio_negativos_sobre_positivos
)

# 5. Construir pipeline
pipeline = Pipeline(stages=[assembler, xgb])

# 6. Entrenar modelo
train, test = df_labeled.randomSplit([0.8, 0.2], seed=42)
model = pipeline.fit(train)

# 7. Evaluar modelo
predictions = model.transform(test)
predictions.select("año", "mes", "label", "probability", "prediction").show(10, truncate=False)

  Creating hotel features...


2025-08-26 18:20:45,701 INFO XGBoost-PySpark: _fit Running xgboost-3.0.4 on 1 workers with
	booster params: {'objective': 'binary:logistic', 'device': 'cpu', 'max_depth': 5, 'scale_pos_weight': 0.9506172839506173, 'num_round': 50, 'eta': 0.1, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2025-08-26 18:20:52,976 INFO XGBoost-PySpark: _fit Finished xgboost training!


+----+---+-----+-----------------------------------------+----------+
|año |mes|label|probability                              |prediction|
+----+---+-----+-----------------------------------------+----------+
|2012|3  |0    |[0.015503287315368652,0.9844967126846313]|1.0       |
|2012|7  |0    |[0.9854826927185059,0.01451731938868761] |0.0       |
|2012|9  |0    |[0.9854826927185059,0.01451731938868761] |0.0       |
|2013|2  |1    |[0.015503287315368652,0.9844967126846313]|1.0       |
|2013|8  |0    |[0.9854826927185059,0.01451731938868761] |0.0       |
|2013|12 |1    |[0.015503287315368652,0.9844967126846313]|1.0       |
|2014|6  |0    |[0.9854826927185059,0.01451731938868761] |0.0       |
|2014|12 |1    |[0.015503287315368652,0.9844967126846313]|1.0       |
|2015|10 |0    |[0.9854826927185059,0.01451731938868761] |0.0       |
|2015|11 |1    |[0.015503287315368652,0.9844967126846313]|1.0       |
+----+---+-----+-----------------------------------------+----------+
only showing top 10 

In [8]:
from pyspark.sql import Row
from pyspark.sql.functions import col

# Supongamos que queremos predecir para Barcelona, 20/09/2025
ciudad = "Barcelona"
fecha = "2025-09-20"
year, month, day = map(int, fecha.split("-"))

# Generar features para esa fecha
# En tu caso, solo agregados históricos por mes y año
# Aquí podemos usar medias históricas del mes de septiembre
df_month_avg = df_hotel_features.filter(col("mes") == month).agg(
    avg("hotel_viajeros").alias("hotel_viajeros"),
    avg("hotel_pernoctaciones").alias("hotel_pernoctaciones"),
    avg("hotel_estancia_media").alias("hotel_estancia_media"),
    avg("avg_ocupacion").alias("avg_ocupacion"),
    avg("hotel_availability_score").alias("hotel_availability_score")
).collect()[0]

# Crear fila para predicción
future_row = Row(
    año=year,
    mes=month,
    hotel_viajeros=df_month_avg["hotel_viajeros"],
    hotel_pernoctaciones=df_month_avg["hotel_pernoctaciones"],
    hotel_estancia_media=df_month_avg["hotel_estancia_media"],
    avg_ocupacion=df_month_avg["avg_ocupacion"],
    hotel_availability_score=df_month_avg["hotel_availability_score"]
)

# Convertir a DataFrame Spark
future_df = spark.createDataFrame([future_row])

# Aplicar pipeline entrenado
prediction = model.transform(future_df)

# Mostrar resultado
prediction.select(
    "año", "mes", "prediction", "probability"
).show(truncate=False)


+----+---+----------+----------------------------------------+
|año |mes|prediction|probability                             |
+----+---+----------+----------------------------------------+
|2025|9  |0.0       |[0.9854826927185059,0.01451731938868761]|
+----+---+----------+----------------------------------------+

