In [0]:
from pyspark.sql import functions as F

In [0]:
# Load the four metastore tables this notebook reads and mark each one for caching.
# NOTE(review): `fact_bike_ml_glm` and `dim_bike` are read again and reassigned by the
# "1. JOIN DIM BIKE" cell; only `weather_station_glm` and `dim_calendar` are reused
# under these names further down.
fact_bike_ml_glm = spark.table("hive_metastore.tfm.fact_bike").cache()
dim_bike = spark.table("hive_metastore.tfm.dim_bike").cache()

# Aliased here so that the weather join cell can select columns as 'weather.<column>'.
weather_station_glm = (
    spark.table('hive_metastore.tfm.fact_weather_enriched_glm')
    .alias('weather')
    .cache()
)

dim_calendar = spark.table("hive_metastore.tfm.dim_calendar")
# Kept as the cell's last expression, so the notebook echoes the calendar schema.
dim_calendar.cache()

DataFrame[date: date, is_holiday: int, is_pre_holiday: int, is_weekend: int, day_of_week: int, month: int]

1. JOIN DIM BIKE

In [0]:
# Look up the weather station of every fact_bike row through its start station.
fact = spark.table('hive_metastore.tfm.fact_bike')
dim_bike = spark.table('hive_metastore.tfm.dim_bike').alias('dim')

# Left join: fact rows whose start station is missing from dim_bike are kept and
# simply get a null weather_station_id.
same_start_station = F.col('fact.bike_start_station_id') == F.col('dim.bike_station_id')
fact_with_station = fact.alias('fact').join(dim_bike, on=same_start_station, how='left')

# Keep only the fact columns carried forward, plus the looked-up weather station id.
fact_bike_ml_glm = fact_with_station.select(
    'fact.started_date',
    'fact.bike_start_station_id',
    'fact.bike_end_station_id',
    'fact.total',
    'fact.distancia',
    F.col('dim.weather_station_id').alias('weather_station_id'),
)

2. Crear columna concatenada para el JOIN con la tabla de weather

In [0]:
# Build the key that the weather join cell matches on:
# '<weather_station_id>/<started_date>'.
# NOTE(review): Spark's concat_ws skips null inputs, so rows whose weather_station_id
# is null (no dim_bike match in the previous left join) get a key holding only the
# date, with no '/' separator — confirm this is intended.
weather_key = F.concat_ws('/', 'weather_station_id', 'started_date')

fact_bike_ml_glm = fact_bike_ml_glm.withColumn('weather_station_date', weather_key)

3. Agrupar para reducir el número de filas

In [0]:
# Collapse the data to one row per (started_date, weather_station_date) pair:
# `total` is summed and `distancia` is averaged over the rows of each group.
# NOTE(review): the grouping includes the station key, so the result can hold several
# rows for the same date; the average is also not weighted by `total` — confirm both
# are what the downstream model expects.
group_keys = ['started_date', 'weather_station_date']

fact_bike_ml_glm = fact_bike_ml_glm.groupBy(*group_keys).agg(
    F.sum('total').alias('total'),
    F.avg('distancia').alias('avg_distancia'),
)

In [0]:
# Row count after the aggregation; count() is an action, so it runs the plan built so far.
fact_bike_ml_glm.count()

7017

4. JOIN WEATHER

In [0]:
# Attach the weather columns to every aggregated (date, station) row.
fact = fact_bike_ml_glm.alias('fact')
weather = weather_station_glm.alias('weather')

# Rows whose started_date falls before this year are dropped from the result.
MIN_YEAR = 2020
# Columns taken from the weather table, in output order.
WEATHER_COLUMNS = ['awnd', 'prcp', 'snow', 'snwd', 'tmax', 'tmin', 'trange']

fact_bike_ml_glm = (
    fact
    # Join against the locally aliased frame so the 'weather.<column>' references
    # below do not depend on an alias applied in a different cell.
    # Left join: rows without a matching key keep null weather columns.
    .join(weather, on='weather_station_date', how='left')
    .select(
        F.col('fact.started_date'),
        F.col('fact.total'),
        F.col('fact.avg_distancia'),
        *[F.col(f'weather.{name}').alias(name) for name in WEATHER_COLUMNS],
    )
    .where(F.year('started_date') >= MIN_YEAR)
)

5. JOIN HOLIDAY

In [0]:
# Add the calendar flags (holiday, weekend, day of week, month) for each row's date.
fact = fact_bike_ml_glm.alias('fact')
calendar = dim_calendar.alias('calendar')

same_day = F.col('fact.started_date') == F.col('calendar.date')

# Columns carried over unchanged from each side, in output order.
fact_columns = [
    'total', 'avg_distancia',
    'awnd', 'prcp', 'snow', 'snwd', 'tmax', 'tmin', 'trange',
]
calendar_columns = ['is_holiday', 'is_pre_holiday', 'is_weekend', 'day_of_week', 'month']

# Left join: rows whose date is missing from dim_calendar keep null calendar flags.
fact_bike_ml_glm = fact.join(calendar, on=same_day, how='left').select(
    # The fact-side date is exposed as `date` in the final table.
    F.col('fact.started_date').alias('date'),
    *(F.col(f'fact.{name}') for name in fact_columns),
    *(F.col(f'calendar.{name}') for name in calendar_columns),
)

In [0]:
# Preview ten rows of the final table to check the joined columns before writing.
display(fact_bike_ml_glm.limit(10))

date,total,avg_distancia,awnd,prcp,snow,snwd,tmax,tmin,trange,is_holiday,is_pre_holiday,is_weekend,day_of_week,month
2024-05-29,611,2.901460557216589,10.79869824,5.588,0.0,0.0,26.666666666666668,16.666666666666668,10.0,0,0,0,4,5
2024-09-07,129173,2.400920106199063,1.5,6.1,0.0,0.0,24.4,15.0,9.4,0,0,1,7,9
2024-10-14,120368,2.47135172710864,2.7,0.0,0.0,0.0,18.9,8.9,9.999999999999998,1,0,0,2,10
2024-12-16,4919,1.7996580877721484,15.48188928,27.686,0.0,0.0,10.555555555555555,3.888888888888889,6.666666666666666,0,0,0,2,12
2024-04-19,105049,2.23592020906279,2.3,0.0,0.0,0.0,15.6,7.8,7.8,0,0,0,6,4
2024-12-09,46,,,,,,,,,0,0,0,2,12
2024-10-26,29,2.536577600458364,25.55638272,0.0,0.0,0.0,20.55555555555556,11.11111111111111,9.444444444444446,0,0,1,7,10
2024-12-19,30,,,,,,,,,0,0,0,5,12
2024-07-29,87,,,,,,,,,0,0,0,2,7
2024-05-14,12087,2.2575695644097378,12.247107840000002,0.508,0.0,0.0,22.77777777777778,12.77777777777778,10.0,0,0,0,3,5


In [0]:
# Persist the final dataset as a Delta table, replacing any previous contents.
# mode("overwrite") performs the replacement and overwriteSchema lets the table schema
# change with it; the former .option("overwrite", "true") is not a documented Delta
# writer option and has been dropped.
# NOTE(review): unlike the reads, the target name carries no catalog prefix, so it is
# resolved against the session's current catalog — confirm that is hive_metastore.
(
    fact_bike_ml_glm.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("tfm.fact_bike_ml_glm")
)