In [0]:
from pyspark.sql import functions as F
from pyspark.sql.functions import to_date


# BICICLETAS

**TABLA fact_bike**

**Objetivo: Crear una tabla de hechos con los datos de bicicletas**

1) Crear tabla fact_bike

In [0]:
# Keep the trip date, both station ids, the start/end coordinates and the
# `total` column. Station names are not carried here; they are meant to come
# from dim_station through the start/end station ids.
# The coordinates are selected because a later cell uses them to compute the
# distance between stations.
bike_columns = [
    "started_date",
    "start_station_id",
    "end_station_id",
    "start_lat",
    "start_lng",
    "end_lat",
    "end_lng",
    "total",
]

fact_bike = (
    spark.table("hive_metastore.tfm.datos_brutos_bike")
    .select(*bike_columns)
    # Replace started_date with its to_date() conversion.
    .withColumn("started_date", to_date("started_date"))
)

display(fact_bike.limit(10))

started_date,start_station_id,end_station_id,start_lat,start_lng,end_lat,end_lng,total
2019-01-19,7311.02,7540.02,40.782939,-73.978652,40.794566,-73.936254,1
2019-02-10,7738.04,SYS035,40.809495,-73.947765,40.72866,-74.01198,1
2019-03-29,5523.02,7858.02,40.722293,-73.991475,40.817166,-73.914737,1
2019-04-02,5746.14,SYS014,40.729538,-73.984267,40.716444,-73.982331,1
2019-04-08,5297.02,7311.02,40.715338,-74.016584,40.782939,-73.978652,1
2019-04-12,6912.01,SYS035,40.766741,-73.979069,40.72866,-74.01198,1
2019-04-18,4461.04,SYS035,40.689407,-73.968855,40.72866,-74.01198,1
2019-04-18,7756.1,7971.07,40.811432,-73.951878,40.82281,-73.937413,1
2019-04-18,4617.01,SYS035,40.692371,-73.937054,40.72866,-74.01198,1
2019-04-21,4936.01,SYS035,40.702551,-73.989402,40.72866,-74.01198,1


1.1) Calcular la distancia (fórmula de haversine, en km) entre las estaciones de inicio y fin

In [0]:
# Haversine (great-circle) distance between the start and end coordinates.
# NOTE: this cell reassigns fact_bike and drops the coordinate columns, so it
# cannot be re-run on its own without re-running the cell that builds fact_bike.
EARTH_RADIUS_KM = 6371  # Earth radius in km, so `distancia` is in kilometres

start_lat_rad = F.radians(F.col("start_lat"))
end_lat_rad = F.radians(F.col("end_lat"))
start_lng_rad = F.radians(F.col("start_lng"))
end_lng_rad = F.radians(F.col("end_lng"))

# sin^2 of half the latitude / longitude differences.
sin_sq_half_dlat = F.pow(F.sin((end_lat_rad - start_lat_rad) / 2), 2)
sin_sq_half_dlng = F.pow(F.sin((end_lng_rad - start_lng_rad) / 2), 2)

# The "a" term of the haversine formula.
haversine_a = (
    sin_sq_half_dlat
    + F.cos(start_lat_rad) * F.cos(end_lat_rad) * sin_sq_half_dlng
)

fact_bike = (
    fact_bike
    .withColumn("distancia", 2 * EARTH_RADIUS_KM * F.asin(F.sqrt(haversine_a)))
    # The coordinates are only needed for the distance; remove them afterwards.
    .drop("start_lat", "end_lat", "start_lng", "end_lng")
)

In [0]:
# Preview 10 rows; no ordering is applied, so which rows appear is not fixed.
fact_bike_preview = fact_bike.limit(10)
display(fact_bike_preview)

started_date,start_station_id,end_station_id,total,distancia
2022-10-25,5636.11,5190.07,1,1.7163172839369876
2022-10-25,4028.04,3905.15,1,0.7168435285935274
2022-10-25,6459.04,5190.07,1,4.384248964768999
2022-10-25,8353.02,8316.02,1,0.175753197253727
2022-10-25,7602.05,6779.04,1,4.816698458138459
2022-10-25,6206.08,6611.05,1,1.9143069730841211
2022-10-25,5805.05,5922.08,3,0.5363583383621947
2022-10-25,5653.12,5351.03,1,1.2791972975368635
2022-10-25,6364.07,6890.06,2,1.7722705929554523
2022-10-25,5270.08,6602.03,1,4.727696652746611


In [0]:
# Last step before saving: prefix the station id columns with "bike_" so the
# tables are easier to understand.
station_id_renames = {
    "start_station_id": "bike_start_station_id",
    "end_station_id": "bike_end_station_id",
}

for old_name, new_name in station_id_renames.items():
    fact_bike = fact_bike.withColumnRenamed(old_name, new_name)

In [0]:
# Preview 10 rows to check the renamed station id columns.
fact_bike_preview = fact_bike.limit(10)
display(fact_bike_preview)

started_date,bike_start_station_id,bike_end_station_id,total,distancia
2022-10-25,5636.11,5190.07,1,1.7163172839369876
2022-10-25,4028.04,3905.15,1,0.7168435285935274
2022-10-25,6459.04,5190.07,1,4.384248964768999
2022-10-25,8353.02,8316.02,1,0.175753197253727
2022-10-25,7602.05,6779.04,1,4.816698458138459
2022-10-25,6206.08,6611.05,1,1.9143069730841211
2022-10-25,5805.05,5922.08,3,0.5363583383621947
2022-10-25,5653.12,5351.03,1,1.2791972975368635
2022-10-25,6364.07,6890.06,2,1.7722705929554523
2022-10-25,5270.08,6602.03,1,4.727696652746611


2) Crear tablas agregadas (por fecha y estación de inicio, y solo por fecha) para facilitar futuros cálculos

In [0]:
# Aggregate per (date, start station): sum and mean of `total` and `distancia`.
# NOTE(review): the distance aggregates are computed per input row and are not
# weighted by `total` — confirm that this is the intended metric.
station_metrics = [
    F.sum("total").alias("sum_total"),
    F.avg("total").alias("avg_total"),
    F.sum("distancia").alias("sum_distancia"),
    F.avg("distancia").alias("avg_distancia"),
]

fact_bike_agg_station = (
    fact_bike
    .withColumnRenamed("started_date", "date")
    .groupBy("date", "bike_start_station_id")
    .agg(*station_metrics)
)

In [0]:
# Number of (date, start station) groups in the aggregate.
station_agg_rows = fact_bike_agg_station.count()
station_agg_rows

2876674

In [0]:
# Aggregate per date only: sum and mean of `total` and `distancia`.
# NOTE(review): the distance aggregates are computed per input row and are not
# weighted by `total` — confirm that this is the intended metric.
daily_metrics = [
    F.sum("total").alias("sum_total"),
    F.avg("total").alias("avg_total"),
    F.sum("distancia").alias("sum_distancia"),
    F.avg("distancia").alias("avg_distancia"),
]

fact_bike_agg = (
    fact_bike
    .withColumnRenamed("started_date", "date")
    .groupBy("date")
    .agg(*daily_metrics)
)

In [0]:
# Number of distinct dates in the daily aggregate.
daily_agg_rows = fact_bike_agg.count()
daily_agg_rows

1993

3) Guardar las tablas fact_bike_agg_station, fact_bike_agg y fact_bike en el catálogo

In [0]:
# Save the per-station aggregate as a Delta table, replacing any existing
# data and schema.
# NOTE(review): the target name has no catalog prefix, while the source table
# is read from `hive_metastore` — confirm both resolve to the same catalog.
(
    fact_bike_agg_station.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("tfm.fact_bike_agg_station")
)

In [0]:
# Save the daily aggregate as a Delta table, replacing any existing data and
# schema.
# NOTE(review): the target name has no catalog prefix, while the source table
# is read from `hive_metastore` — confirm both resolve to the same catalog.
(
    fact_bike_agg.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("tfm.fact_bike_agg")
)

In [0]:
# Save the fact table as a Delta table, replacing any existing data and schema.
# NOTE(review): the target name has no catalog prefix, while the source table
# is read from `hive_metastore` — confirm both resolve to the same catalog.
(
    fact_bike.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("tfm.fact_bike")
)