In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

# BICICLETAS

**TABLA dim_station**

**Objetivo: Crear una tabla de dimensión con pares únicos de station_id y station_name**

1) Crear tabla

Para crear la tabla dim_bike tendré en cuenta solo las estaciones de entrada y salida que tuvieron movimientos en el año 2024

In [0]:
def _unique_stations(side):
    """Return the distinct (station_id, station_name, lat, lng) rows for one trip end.

    `side` is the column prefix in the raw trips table ("start" or "end").
    Coordinates are rounded to 4 decimals and rows without a station id are
    discarded. A filter on the year of `started_date` (2024) used to follow
    the null filter but is currently disabled.
    """
    return (
        spark.table("hive_metastore.tfm.datos_brutos_bike")
        .select(
            F.col(f"{side}_station_id").alias("station_id"),
            F.col(f"{side}_station_name").alias("station_name"),
            F.round(F.col(f"{side}_lat"), 4).alias("lat"),
            F.round(F.col(f"{side}_lng"), 4).alias("lng"),
        )
        .where(F.col("station_id").isNotNull())
        .distinct()
    )


# Unique id / name rows for the stations where trips start.
dim_station_start = _unique_stations("start")

# Unique id / name rows for the stations where trips end.
dim_station_end = _unique_stations("end")

# Stack both sets and remove the rows they have in common.
dim_bike = dim_station_start.union(dim_station_end).distinct()

display(dim_bike)

station_id,station_name,lat,lng
6310.06,27 St & Hunter St,40.7485,-73.9413
3457.03,Prospect Park SW & Vanderbilt St,40.6548,-73.9732
7992.04,Bradhurst Ave & W 148 St,40.8251,-73.9416
6289.06,Broadway & W 29 St,40.7461,-73.9885
5785.10,Kent St & McGuinness Blvd,40.7312,-73.9516
6593.15,6 Ave & W 45 St,40.7571,-73.982
6737.03,36 Ave & 10 St,40.7614,-73.9411
7606.01,W 106 St & Central Park West,40.7983,-73.9607
6474.12,E 41 St & Madison Ave (SW corner),40.7521,-73.9801
5024.10,Water St & Fletcher St,40.7062,-74.0058


In [0]:
# Mark dim_bike for caching so the cells below reuse the computed rows
# instead of re-reading the raw trips table on every action.
dim_bike = dim_bike.cache()

2) Limpieza de datos

In [0]:
# Keep exactly one row per station_id.
#
# A station_id may appear with more than one (station_name, lat, lng)
# combination, so rows are ranked inside each station_id partition and only
# the first one is kept. The ordering uses the remaining columns as an explicit
# tie-break: ordering by the partition key alone leaves the ranking arbitrary,
# so the surviving row could differ from one run to the next. Nulls are sorted
# last so that a row with a populated name/coordinates wins when one exists.
window = (
    Window
    .partitionBy("station_id")
    .orderBy(
        F.col("station_name").asc_nulls_last(),
        F.col("lat").asc_nulls_last(),
        F.col("lng").asc_nulls_last(),
    )
)

dim_bike = (
    dim_bike
    .withColumn("rn", row_number().over(window))
    .filter("rn = 1")
    .drop("rn")
)

In [0]:
# Preview the first 10 rows after keeping one row per station_id.
display(dim_bike.limit(10))

station_id,station_name,lat,lng
2821.05,7 Ave & 62 St,40.6356,-74.013
2832.03,4 Ave & Shore Road Dr,40.6369,-74.0223
2898.01,Cortelyou Rd & Stratford Rd,40.6397,-73.9681
2951.05,55 St & 7 Ave,40.6398,-74.0089
2961.05,59 St & 5 Ave,40.6404,-74.0156
2984.04,Cortelyou Rd & Argyle Rd,40.6403,-73.966
3019.02,65 St & 2 Ave,40.6405,-74.0257
3070.04,E 16 St & Cortelyou Rd,40.6417,-73.9636
3113.06,Rugby Rd & Beverley Rd,40.6444,-73.9659
3132.09,Beverley Rd & Nostrand Ave,40.6451,-73.9488


In [0]:
# Sanity check: list every station_id that still occurs in more than one row
# of dim_bike (i.e. the same id paired with different names/coordinates).
# An empty result means station_id is unique.

ids_with_counts = dim_bike.groupBy("station_id").agg(F.count("*").alias("count"))

df_ids_repetidos = (
    ids_with_counts
    .where(F.col("count") > 1)
    .orderBy(F.desc("count"))
    .select("station_id")
)

# Pull the full rows for the offending ids so they can be inspected together.
df = dim_bike.join(df_ids_repetidos, on='station_id', how='inner').orderBy("station_id")

display(df)

station_id,station_name,lat,lng


3) Añadiendo datos de la estación meteorológica más cercana.

In [0]:
# Weather-station dimension table, read from the metastore.
station_weather = spark.table("hive_metastore.tfm.dim_weather")

In [0]:
# Show the weather stations (id, name, latitude, longitude).
display(station_weather)

weather_station_id,name,latitude,longitude
USW00014734,"NEWARK LIBERTY INTERNATIONAL AIRPORT, NJ US",40.68275,-74.16927
USW00094728,"NY CITY CENTRAL PARK, NY US",40.77898,-73.96925
USW00014732,"LAGUARDIA AIRPORT, NY US",40.77945,-73.88027
USW00094789,"JFK INTERNATIONAL AIRPORT, NY US",40.63915,-73.7639


In [0]:
# Fetch the latitude and longitude of the four weather stations.
#
# All coordinates are read with a single query instead of one Spark action per
# value, and a station id that is absent from the table raises a descriptive
# error instead of failing on `None[...]` when `.first()` finds no row.

# Short label -> weather_station_id (EWR = Newark, CP = Central Park,
# LGA = LaGuardia, JFK = JFK airport, following the names in dim_weather).
weather_station_ids = {
    "ewr": "USW00014734",
    "cp": "USW00094728",
    "lga": "USW00014732",
    "jfk": "USW00094789",
}

weather_rows = (
    station_weather
    .select("weather_station_id", "latitude", "longitude")
    .where(F.col("weather_station_id").isin(list(weather_station_ids.values())))
    .collect()
)

# weather_station_id -> (latitude, longitude)
weather_coords = {}
for weather_row in weather_rows:
    # setdefault keeps the first row seen for an id, as `.first()` did.
    weather_coords.setdefault(
        weather_row["weather_station_id"],
        (weather_row["latitude"], weather_row["longitude"]),
    )

missing_weather_ids = [
    station_id
    for station_id in weather_station_ids.values()
    if station_id not in weather_coords
]
if missing_weather_ids:
    raise ValueError(
        f"Weather stations not found in dim_weather: {missing_weather_ids}"
    )

# EWR
latitude_ewr, longitude_ewr = weather_coords[weather_station_ids["ewr"]]

# Central Park
latitude_cp, longitude_cp = weather_coords[weather_station_ids["cp"]]

# LGA
latitude_lga, longitude_lga = weather_coords[weather_station_ids["lga"]]

# JFK
latitude_jfk, longitude_jfk = weather_coords[weather_station_ids["jfk"]]

print(latitude_ewr)
print(longitude_ewr)

print(latitude_cp)
print(longitude_cp)

print(latitude_lga)
print(longitude_lga)

print(latitude_jfk)
print(longitude_jfk)

40.68275
-74.16927
40.77898
-73.96925
40.77945
-73.88027
40.63915
-73.7639


In [0]:
# Distance from every bike station to each of the four weather stations.

def _haversine_to(ref_lat, ref_lng):
    """Column expression: haversine distance from (lat, lng) to a fixed point.

    `ref_lat` / `ref_lng` are the fixed point's coordinates in degrees; the
    row's own coordinates come from the `lat` and `lng` columns. The sphere
    radius is 6371 (presumably Earth's mean radius in km, so the result
    would be in kilometres).
    """
    ref_lat_rad = F.radians(F.lit(ref_lat))
    ref_lng_rad = F.radians(F.lit(ref_lng))
    row_lat_rad = F.radians(F.col("lat"))
    row_lng_rad = F.radians(F.col("lng"))

    half_dlat = (ref_lat_rad - row_lat_rad) / 2
    half_dlng = (ref_lng_rad - row_lng_rad) / 2

    haversine_term = (
        F.sin(half_dlat) ** 2
        + F.cos(row_lat_rad) * F.cos(ref_lat_rad) * F.sin(half_dlng) ** 2
    )
    return 2 * 6371 * F.asin(F.sqrt(haversine_term))


# (output column, reference latitude, reference longitude), in output order.
weather_reference_points = [
    ("distancia_ewr", latitude_ewr, longitude_ewr),
    ("distancia_cp", latitude_cp, longitude_cp),
    ("distancia_lga", latitude_lga, longitude_lga),
    ("distancia_jfk", latitude_jfk, longitude_jfk),
]

for distance_column, ref_latitude, ref_longitude in weather_reference_points:
    dim_bike = dim_bike.withColumn(
        distance_column,
        _haversine_to(ref_latitude, ref_longitude),
    )

In [0]:
# Show dim_bike with the four distance columns added.
display(dim_bike)

station_id,station_name,lat,lng,distancia_ewr,distancia_cp,distancia_lga,distancia_jfk
2821.05,7 Ave & 62 St,40.6356,-74.013,14.186104479269218,16.364071553645843,19.519807158672624,21.02273138063469
2832.03,4 Ave & Shore Road Dr,40.6369,-74.0223,13.404536655128927,16.41920573300649,19.86382996902443,21.804980335857547
2898.01,Cortelyou Rd & Stratford Rd,40.6397,-73.9681,17.63091929904556,15.487532721518113,17.21281689556299,17.229950307600493
2951.05,55 St & 7 Ave,40.6398,-74.0089,14.345466520388769,15.832853999242484,18.93884236167196,20.672543857753933
2961.05,59 St & 5 Ave,40.6404,-74.0156,13.79086423853924,15.896924461643495,19.21394346769289,21.23810384852844
2984.04,Cortelyou Rd & Argyle Rd,40.6403,-73.966,17.783565855493595,15.422945412647977,17.076946841624558,17.05305235711229
3019.02,65 St & 2 Ave,40.6405,-74.0257,12.989364965148289,16.11664036173001,19.72251653812012,22.090347922154304
3070.04,E 16 St & Cortelyou Rd,40.6417,-73.9636,17.938405267038373,15.272266228020268,16.85067730571698,16.852277295324363
3113.06,Rugby Rd & Beverley Rd,40.6444,-73.9659,17.675700901850604,14.96727680108155,16.661241301529444,17.053606472814614
3132.09,Beverley Rd & Nostrand Ave,40.6451,-73.9488,19.061258040604063,14.98622850422162,16.016794499490974,15.61475249608116


In [0]:
# Pick, for each bike station, the weather station with the smallest distance.
#
# Each distance is wrapped in a (dist, nombre) struct; sorting the array of
# structs orders them by `dist` first, so element 0 is the closest one and
# its `nombre` field is the name of the winning distance column.

distance_columns = ["distancia_ewr", "distancia_cp", "distancia_lga", "distancia_jfk"]

distance_candidates = F.array(
    *[
        F.struct(
            F.col(distance_column).alias("dist"),
            F.lit(distance_column).alias("nombre"),
        )
        for distance_column in distance_columns
    ]
)

dim_bike = dim_bike.withColumn(
    "estacion_mas_cercana",
    F.array_sort(distance_candidates)[0]["nombre"],
)

In [0]:
# Show dim_bike with the name of the closest distance column per station.
display(dim_bike)

station_id,station_name,lat,lng,distancia_ewr,distancia_cp,distancia_lga,distancia_jfk,estacion_mas_cercana
2821.05,7 Ave & 62 St,40.6356,-74.013,14.186104479269218,16.364071553645843,19.519807158672624,21.02273138063469,distancia_ewr
2832.03,4 Ave & Shore Road Dr,40.6369,-74.0223,13.404536655128927,16.41920573300649,19.86382996902443,21.804980335857547,distancia_ewr
2898.01,Cortelyou Rd & Stratford Rd,40.6397,-73.9681,17.63091929904556,15.487532721518113,17.21281689556299,17.229950307600493,distancia_cp
2951.05,55 St & 7 Ave,40.6398,-74.0089,14.345466520388769,15.832853999242484,18.93884236167196,20.672543857753933,distancia_ewr
2961.05,59 St & 5 Ave,40.6404,-74.0156,13.79086423853924,15.896924461643495,19.21394346769289,21.23810384852844,distancia_ewr
2984.04,Cortelyou Rd & Argyle Rd,40.6403,-73.966,17.783565855493595,15.422945412647977,17.076946841624558,17.05305235711229,distancia_cp
3019.02,65 St & 2 Ave,40.6405,-74.0257,12.989364965148289,16.11664036173001,19.72251653812012,22.090347922154304,distancia_ewr
3070.04,E 16 St & Cortelyou Rd,40.6417,-73.9636,17.938405267038373,15.272266228020268,16.85067730571698,16.852277295324363,distancia_cp
3113.06,Rugby Rd & Beverley Rd,40.6444,-73.9659,17.675700901850604,14.96727680108155,16.661241301529444,17.053606472814614,distancia_cp
3132.09,Beverley Rd & Nostrand Ave,40.6451,-73.9488,19.061258040604063,14.98622850422162,16.016794499490974,15.61475249608116,distancia_cp


In [0]:
# Translate the winning distance column into the weather station id, then
# drop the helper columns that are no longer needed.

# Distance-column label -> weather_station_id, in the order they are tested.
weather_id_by_label = [
    ('distancia_ewr', 'USW00014734'),
    ('distancia_cp', 'USW00094728'),
    ('distancia_lga', 'USW00014732'),
    ('distancia_jfk', 'USW00094789'),
]

# Build the chained when(...).when(...) expression; labels that match none of
# the branches yield null because there is no otherwise().
weather_id_expr = None
for nearest_label, nearest_weather_id in weather_id_by_label:
    is_nearest = F.col('estacion_mas_cercana') == nearest_label
    if weather_id_expr is None:
        weather_id_expr = F.when(is_nearest, nearest_weather_id)
    else:
        weather_id_expr = weather_id_expr.when(is_nearest, nearest_weather_id)

helper_columns = [
    "distancia_ewr",
    "distancia_cp",
    "distancia_lga",
    "distancia_jfk",
    'estacion_mas_cercana',
]

dim_bike = dim_bike.withColumn('weather_station_id', weather_id_expr).drop(*helper_columns)

In [0]:
# Show dim_bike with weather_station_id attached and the helper columns removed.
display(dim_bike)

station_id,station_name,lat,lng,weather_station_id
2821.05,7 Ave & 62 St,40.6356,-74.013,USW00014734
2832.03,4 Ave & Shore Road Dr,40.6369,-74.0223,USW00014734
2898.01,Cortelyou Rd & Stratford Rd,40.6397,-73.9681,USW00094728
2951.05,55 St & 7 Ave,40.6398,-74.0089,USW00014734
2961.05,59 St & 5 Ave,40.6404,-74.0156,USW00014734
2984.04,Cortelyou Rd & Argyle Rd,40.6403,-73.966,USW00094728
3019.02,65 St & 2 Ave,40.6405,-74.0257,USW00014734
3070.04,E 16 St & Cortelyou Rd,40.6417,-73.9636,USW00094728
3113.06,Rugby Rd & Beverley Rd,40.6444,-73.9659,USW00094728
3132.09,Beverley Rd & Nostrand Ave,40.6451,-73.9488,USW00094728


4) Guardar tabla dim_bike en el catálogo

In [0]:
# Last step before saving: give the station columns a bike_ prefix so the
# tables are easier to understand.

column_renames = {
    'station_id': 'bike_station_id',
    'station_name': 'bike_station_name',
}

for current_name, final_name in column_renames.items():
    dim_bike = dim_bike.withColumnRenamed(current_name, final_name)

In [0]:
# Preview the first 10 rows of the final table with the renamed columns.
display(dim_bike.limit(10))

bike_station_id,bike_station_name,lat,lng,weather_station_id
2821.05,7 Ave & 62 St,40.6356,-74.013,USW00014734
2832.03,4 Ave & Shore Road Dr,40.6369,-74.0223,USW00014734
2898.01,Cortelyou Rd & Stratford Rd,40.6397,-73.9681,USW00094728
2951.05,55 St & 7 Ave,40.6398,-74.0089,USW00014734
2961.05,59 St & 5 Ave,40.6404,-74.0156,USW00014734
2984.04,Cortelyou Rd & Argyle Rd,40.6403,-73.966,USW00094728
3019.02,65 St & 2 Ave,40.6405,-74.0257,USW00014734
3070.04,E 16 St & Cortelyou Rd,40.6417,-73.9636,USW00094728
3113.06,Rugby Rd & Beverley Rd,40.6444,-73.9659,USW00094728
3132.09,Beverley Rd & Nostrand Ave,40.6451,-73.9488,USW00094728


In [0]:
# Persist dim_bike as a Delta table, replacing any existing data and schema.
# NOTE(review): the target is "tfm.dim_bike" without the "hive_metastore."
# prefix used when reading, so it resolves against the session's current
# catalog. Confirm that is the intended location.
(
    dim_bike.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("tfm.dim_bike")
)