In [0]:
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml import Pipeline
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [0]:
fact_bike_pred = (spark.table("hive_metastore.tfm.fact_bike_ml_glm"))

## 1. TRATAMIENTO VALORES NULOS

In [0]:
null_counts = fact_bike_pred.select([
    F.count(F.when(F.col(c).isNull(), 1)).alias(c)
    for c in fact_bike_pred.columns
])

null_counts.show()

+----+-----+-------------+----+----+----+----+----+----+------+----------+--------------+----------+-----------+-----+
|date|total|avg_distancia|awnd|prcp|snow|snwd|tmax|tmin|trange|is_holiday|is_pre_holiday|is_weekend|day_of_week|month|
+----+-----+-------------+----+----+----+----+----+----+------+----------+--------------+----------+-----------+-----+
|   0|    0|          711| 843| 843| 843| 843| 843| 843|   843|         0|             0|         0|          0|    0|
+----+-----+-------------+----+----+----+----+----+----+------+----------+--------------+----------+-----------+-----+



Se eliminarán los 843 registros en los que las columnas categorizadas (bins) presentan valores nulos. Estos casos corresponden a viajes en los que no se dispone de información sobre la estación de origen en la tabla de bicicletas, lo que impide realizar correctamente el join con la estación meteorológica más cercana. En consecuencia, al no conocerse el punto de inicio del trayecto, no es posible asociar de forma fiable las condiciones climáticas correspondientes, por lo que dichos registros se excluyen del análisis.

In [0]:
fact_bike_pred = (
    fact_bike_pred
    .where(F.col('awnd').isNotNull())
)

In [0]:
display(fact_bike_pred.limit(10))

date,total,avg_distancia,awnd,prcp,snow,snwd,tmax,tmin,trange,is_holiday,is_pre_holiday,is_weekend,day_of_week,month
2024-05-29,611,2.901460557216589,10.79869824,5.588,0.0,0.0,26.666666666666668,16.666666666666668,10.0,0,0,0,4,5
2024-09-07,129173,2.400920106199063,1.5,6.1,0.0,0.0,24.4,15.0,9.4,0,0,1,7,9
2024-10-14,120368,2.47135172710864,2.7,0.0,0.0,0.0,18.9,8.9,9.999999999999998,1,0,0,2,10
2024-12-16,4919,1.7996580877721484,15.48188928,27.686,0.0,0.0,10.555555555555555,3.888888888888889,6.666666666666666,0,0,0,2,12
2024-04-19,105049,2.23592020906279,2.3,0.0,0.0,0.0,15.6,7.8,7.8,0,0,0,6,4
2024-10-26,29,2.536577600458364,25.55638272,0.0,0.0,0.0,20.55555555555556,11.11111111111111,9.444444444444446,0,0,1,7,10
2024-05-14,12087,2.2575695644097378,12.247107840000002,0.508,0.0,0.0,22.77777777777778,12.77777777777778,10.0,0,0,0,3,5
2024-08-08,9771,2.101038882969652,17.638410240000002,6.858,0.0,0.0,21.11111111111111,18.333333333333336,2.777777777777775,0,0,0,5,8
2024-10-30,166407,2.5470180419825974,1.5,0.0,0.0,0.0,24.4,14.4,9.999999999999998,0,0,0,4,10
2024-02-23,6081,2.021960371660887,10.07449344,1.27,0.0,0.0,7.777777777777779,4.444444444444445,3.333333333333334,0,0,0,6,2


In [0]:
null_counts = fact_bike_pred.select([
    F.count(F.when(F.col(c).isNull(), 1)).alias(c)
    for c in fact_bike_pred.columns
])

null_counts.show()

+----+-----+-------------+----+----+----+----+----+----+------+----------+--------------+----------+-----------+-----+
|date|total|avg_distancia|awnd|prcp|snow|snwd|tmax|tmin|trange|is_holiday|is_pre_holiday|is_weekend|day_of_week|month|
+----+-----+-------------+----+----+----+----+----+----+------+----------+--------------+----------+-----------+-----+
|   0|    0|            0|   0|   0|   0|   0|   0|   0|     0|         0|             0|         0|          0|    0|
+----+-----+-------------+----+----+----+----+----+----+------+----------+--------------+----------+-----------+-----+



In [0]:
fact_bike_pred.count()

5997

## 2. SELECCIONAR QUE COLUMNAS SERÁN UTILIZADAS EN EL MODELO

In [0]:
fact_bike_pred = (
    fact_bike_pred
    .select(
        'date',
        'total',
        'awnd',
        'prcp',
        'snow',
        'tmax',
        'is_weekend',
        'is_holiday'
    )
)

## 3. ENTRENAMIENTO MODELO GLM POISSON

Vector Assembler

In [0]:
features = [
    "prcp",
    "tmax",
    "awnd",
    "snow",
    "is_weekend",
    "is_holiday"
]

assembler = VectorAssembler(
    inputCols=features,
    outputCol="features"
)

Modelo GLM Poisson

In [0]:
glm = GeneralizedLinearRegression(
    family="poisson",
    link="log",
    labelCol="total",
    featuresCol="features",
    maxIter=50
)

Pipeline

In [0]:
pipeline = Pipeline(stages=[
    assembler,
    glm
])

Training Test

In [0]:
train_df, test_df = fact_bike_pred.randomSplit([0.8, 0.2], seed=42)

Entrenar Modelo

In [0]:
glm_model = pipeline.fit(train_df)

Metricas

In [0]:
summary = glm_model.stages[-1].summary

summary.deviance
summary.aic

54616362.57581789

In [0]:
summary.deviance / summary.residualDegreeOfFreedom

11279.367820944439

Coeficientes

In [0]:
import pandas as pd

coef = glm_model.stages[-1].coefficients.toArray()
intercept = glm_model.stages[-1].intercept

coef_df = pd.DataFrame({
    "feature": features,
    "beta": coef
})

coef_df

Unnamed: 0,feature,beta
0,prcp,-0.009581
1,tmax,0.023768
2,awnd,-0.246384
3,snow,-0.006663
4,is_weekend,-0.068999
5,is_holiday,-0.160758


In [0]:
import numpy as np

coef_df["impact_pct"] = (np.exp(coef_df["beta"]) - 1) * 100
coef_df

Unnamed: 0,feature,beta,impact_pct
0,prcp,-0.009581,-0.95352
1,tmax,0.023768,2.405266
2,awnd,-0.246384,-21.837763
3,snow,-0.006663,-0.664059
4,is_weekend,-0.068999,-6.667271
5,is_holiday,-0.160758,-14.850219


## 4. ENTRENAMIENTO MODELO NEGATIVE BINOMIAL

In [0]:
df = fact_bike_pred.select(
    "total",
    "prcp",
    "tmax",
    "awnd",
    "snow",
    "is_weekend",
    "is_holiday"
).toPandas()

Ajustar modelo

In [0]:
formula = """
total ~ prcp + tmax + awnd + snow + is_weekend + is_holiday
"""

nb_model = smf.glm(
    formula=formula,
    data=df,
    family=sm.families.NegativeBinomial()
).fit()

print(nb_model.summary())



                 Generalized Linear Model Regression Results                  
Dep. Variable:                  total   No. Observations:                 5997
Model:                            GLM   Df Residuals:                     5990
Model Family:        NegativeBinomial   Df Model:                            6
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -61714.
Date:                Thu, 29 Jan 2026   Deviance:                       17557.
Time:                        12:54:19   Pearson chi2:                 1.60e+04
No. Iterations:                   100   Pseudo R-squ. (CS):             0.8262
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     10.8860      0.037    296.602      0.0

Coeficientes

In [0]:
coef_nb = pd.DataFrame({
    "feature": nb_model.params.index,
    "beta": nb_model.params.values
})

coef_nb["impact_pct"] = (np.exp(coef_nb["beta"]) - 1) * 100
coef_nb

Unnamed: 0,feature,beta,impact_pct
0,Intercept,10.886,5342217.0
1,prcp,-0.003083,-0.3078204
2,tmax,0.010739,1.079679
3,awnd,-0.149561,-13.8914
4,snow,-0.007384,-0.7356668
5,is_weekend,-0.062895,-6.095791
6,is_holiday,-0.25106,-22.20244
