# GENERAR TABLAS

Objetivo: Generar tablas de bicicletas, metro y autobús que serán utilizadas para el análisis de datos y la creación de visualizaciones mediante herramientas de Business Intelligence.

In [0]:
from pyspark.sql.functions import col, regexp_replace
from pyspark.sql.types import StringType

## 1. BICICLETAS

In [0]:
%sql
-- Purpose: build an integrated table for later analysis in Power BI by
-- combining the bike-share figures with weather variables, so that
-- visualisations can relate demand to meteorological conditions.
-- Only readings from weather station 'USW00094789' are attached; because this
-- is a LEFT JOIN, dates without a reading from that station keep their bike
-- figures and get NULL weather columns.
CREATE OR REPLACE TABLE tfm.fact_bike_bi AS
SELECT
  bike.date,
  bike.sum_total,
  bike.sum_distancia,
  bike.avg_distancia,
  wx.awnd,
  wx.prcp,
  wx.snow,
  wx.snwd,
  wx.tmax,
  wx.tmin
FROM tfm.fact_bike_agg AS bike
LEFT JOIN tfm.fact_weather AS wx
  ON bike.date = wx.date
 AND wx.weather_station_id = 'USW00094789'

num_affected_rows,num_inserted_rows


In [0]:
%sql
SELECT * FROM tfm.fact_bike_bi

date,sum_total,sum_distancia,avg_distancia,awnd,prcp,snow,snwd,tmax,tmin
2022-11-29,75411,122792.26255568162,1.939815525120956,7.19376768,0.0,0.0,0.0,6.666666666666667,1.1111111111111112
2022-12-25,9164,12983.138525696617,1.5406596090775622,32.39609472,0.0,0.0,0.0,-2.2222222222222223,-10.0
2023-02-25,43325,60596.382398655456,1.819383366320046,11.15275392,0.0,0.0,0.0,0.5555555555555556,-5.555555555555555
2022-11-21,59696,94497.57800638564,1.8629389454191352,23.04580608,0.0,0.0,0.0,5.555555555555555,-2.2222222222222223
2023-02-08,78636,117912.1478393792,2.0875318291795764,18.36261504,0.254,0.0,0.0,12.222222222222223,1.1111111111111112
2022-12-09,76825,121288.41245497172,1.902981242233145,21.9675456,0.0,0.0,0.0,8.333333333333334,1.1111111111111112
2023-03-12,56572,84228.1593344498,2.018020972122521,15.48188928,0.0,0.0,0.0,7.777777777777779,0.0
2022-12-02,74951,120554.06586169692,1.898339750597542,10.79869824,0.0,0.0,0.0,7.222222222222222,-0.5555555555555556
2023-03-24,77837,108598.52767823495,1.9643398331958928,16.206094080000003,0.254,0.0,0.0,13.88888888888889,5.0
2022-12-20,58639,93079.16733163856,1.8278772894159412,21.2433408,0.0,0.0,0.0,4.444444444444445,-1.1111111111111112


In [0]:
%sql
-- Purpose: build a table that carries the bike station names so that user
-- demand can be analysed over time per station.
-- The station name is looked up from tfm.dim_bike by the trip's start station
-- id; rows whose id has no match in the dimension are kept with a NULL name.
CREATE OR REPLACE TABLE tfm.fact_bike_station_bi AS
SELECT
  agg.date,
  station.bike_station_name,
  sum_total,
  avg_distancia
FROM tfm.fact_bike_agg_station AS agg
LEFT JOIN tfm.dim_bike AS station
  ON agg.bike_start_station_id = station.bike_station_id

num_affected_rows,num_inserted_rows


In [0]:
%sql
SELECT * FROM tfm.fact_bike_station_bi

date,bike_station_name,sum_total,avg_distancia
2022-10-25,Allen St & Hester St,164,1.9311639753025904
2022-10-25,South St & Pike St,57,1.7215976445414916
2022-10-25,W 35 St & 8 Ave,114,2.239810372428175
2022-10-25,Clinton St & Cherry St,90,1.8475446349989069
2022-10-25,Valentine Ave & E 183 St,4,0.8103440021515345
2022-10-26,Vesey St & Church St,291,2.585222518945589
2022-10-26,30 St & 4 Ave,14,2.514696595302394
2022-10-26,Grand Concourse & E 196 St,19,2.4003992840148496
2022-10-26,Bushwick Ave & Harman St,31,2.3455489978637707
2022-10-26,Ryer Ave & E 182 St,4,3.127257206391871


## 2. METRO

In [0]:
%sql
-- Purpose: build a table that carries the metro station names (and borough)
-- so that user demand can be analysed over time.
-- As observed earlier in the modelling section, the weather variables show no
-- significant influence on metro demand, so they are not included here.
-- The source column `day` is exposed as `date`.
CREATE OR REPLACE TABLE tfm.fact_metro_bi AS
SELECT
  ridership.day AS date,
  station.metro_station_name,
  station.borough,
  ridership.total_ridership
FROM tfm.fact_metro AS ridership
LEFT JOIN tfm.dim_metro AS station
  ON ridership.metro_station_id = station.metro_station_id

num_affected_rows,num_inserted_rows


In [0]:
%sql
SELECT * FROM tfm.fact_metro_bi

date,metro_station_name,borough,total_ridership
2020-01-01,"Astoria-Ditmars Blvd (N,W)",Queens,5629.0
2020-01-01,"49 St (N,R,W)",Manhattan,15103.0
2020-01-01,"Hewes St (M,J)",Brooklyn,1738.0
2020-01-01,"Marcy Av (M,J,Z)",Brooklyn,7373.0
2020-01-01,"Bowery (J,Z)",Manhattan,3218.0
2020-01-01,"Broad St (J,Z)",Manhattan,2079.0
2020-01-01,Middle Village-Metropolitan Av (M),Queens,850.0
2020-01-01,Fresh Pond Rd (M),Queens,2065.0
2020-01-01,Forest Av (M),Queens,1572.0
2020-01-01,Seneca Av (M),Queens,1559.0


## 3. AUTOBÚS

In [0]:
%sql

-- Purpose: expose tfm.fact_bus to the BI layer as an unchanged copy
-- (all columns, all rows).
CREATE OR REPLACE TABLE tfm.fact_bus_bi AS
SELECT *
FROM tfm.fact_bus

num_affected_rows,num_inserted_rows


In [0]:
%sql
SELECT * FROM tfm.fact_bus_bi

day,bus_route,total_ridership
2020-01-01,B1,6252
2020-01-01,B100,751
2020-01-01,B103,3934
2020-01-01,B11,4123
2020-01-01,B12,3641
2020-01-01,B13,1563
2020-01-01,B14,1452
2020-01-01,B15,7402
2020-01-01,B16,2148
2020-01-01,B17,2886


## 4. CALENDARIO

In [0]:
%sql

-- Purpose: recreate the calendar table used during modelling so that it can
-- now be used in Power BI.
CREATE OR REPLACE TABLE tfm.calendar_bi AS
SELECT
  cal.date,
  cal.is_holiday,
  cal.is_pre_holiday,
  cal.is_weekend
FROM tfm.dim_calendar AS cal

num_affected_rows,num_inserted_rows


In [0]:
%sql
SELECT * FROM tfm.calendar_bi

date,is_holiday,is_pre_holiday,is_weekend
2020-01-01,1,0,0
2020-01-02,0,0,0
2020-01-03,0,0,0
2020-01-04,0,0,1
2020-01-05,0,0,1
2020-01-06,0,0,0
2020-01-07,0,0,0
2020-01-08,0,0,0
2020-01-09,0,0,0
2020-01-10,0,0,0


## 5. GENERANDO ARCHIVOS PARA DOWNLOAD

**5.1 Generar archivos**

In [0]:
# Write tfm.fact_bike_bi to FileStore as Parquet, replacing any previous export.
(
    spark.table("tfm.fact_bike_bi")
    .write
    .mode("overwrite")
    .parquet("dbfs:/FileStore/fact_bike_bi_parquet")
)

In [0]:
#fact_bike_bi
# Export tfm.fact_bike_bi as a single ';'-delimited CSV in which fractional
# numbers use a decimal comma instead of a decimal point.

from pyspark.sql.functions import col, regexp_replace
from pyspark.sql.types import StringType

df = spark.table("tfm.fact_bike_bi")

# Columns whose values are rendered with a decimal point. Spark reports
# decimal columns as "decimal(p,s)", so they are matched by prefix; an exact
# comparison against "decimal" would never succeed. Computed once up front
# rather than rebuilding the dtype lookup for every column.
fractional_cols = {
    name
    for name, dtype in df.dtypes
    if dtype in ("double", "float") or dtype.startswith("decimal")
}

# Fractional columns are cast to string so the '.' can be swapped for ',';
# every other column is passed through untouched.
df_ptbr = df.select([
    regexp_replace(col(c).cast(StringType()), "\\.", ",").alias(c)
    if c in fractional_cols
    else col(c)
    for c in df.columns
])

# coalesce(1) yields a single part file; ';' is used as the delimiter so the
# decimal commas do not collide with the field separator.
(
    df_ptbr.coalesce(1)
    .write
    .mode("overwrite")
    .option("header", "true")
    .option("delimiter", ";")
    .csv("dbfs:/FileStore/fact_bike_bi_csv")
)

In [0]:
#fact_bike_station_bi
# Export tfm.fact_bike_station_bi as a single ';'-delimited CSV in which
# fractional numbers use a decimal comma instead of a decimal point.

from pyspark.sql.functions import col, regexp_replace
from pyspark.sql.types import StringType

df = spark.table("tfm.fact_bike_station_bi")

# Columns whose values are rendered with a decimal point. Spark reports
# decimal columns as "decimal(p,s)", so they are matched by prefix; an exact
# comparison against "decimal" would never succeed. Computed once up front
# rather than rebuilding the dtype lookup for every column.
fractional_cols = {
    name
    for name, dtype in df.dtypes
    if dtype in ("double", "float") or dtype.startswith("decimal")
}

# Fractional columns are cast to string so the '.' can be swapped for ',';
# every other column is passed through untouched.
df_ptbr = df.select([
    regexp_replace(col(c).cast(StringType()), "\\.", ",").alias(c)
    if c in fractional_cols
    else col(c)
    for c in df.columns
])

# coalesce(1) yields a single part file; ';' is used as the delimiter so the
# decimal commas do not collide with the field separator.
(
    df_ptbr.coalesce(1)
    .write
    .mode("overwrite")
    .option("header", "true")
    .option("delimiter", ";")
    .csv("/FileStore/fact_bike_station_bi_csv")
)



In [0]:
#fact_metro_bi
# Export tfm.fact_metro_bi as a single ';'-delimited CSV in which fractional
# numbers use a decimal comma instead of a decimal point.

from pyspark.sql.functions import col, regexp_replace
from pyspark.sql.types import StringType

df = spark.table("tfm.fact_metro_bi")

# Columns whose values are rendered with a decimal point. Spark reports
# decimal columns as "decimal(p,s)", so they are matched by prefix; an exact
# comparison against "decimal" would never succeed. Computed once up front
# rather than rebuilding the dtype lookup for every column.
fractional_cols = {
    name
    for name, dtype in df.dtypes
    if dtype in ("double", "float") or dtype.startswith("decimal")
}

# Fractional columns are cast to string so the '.' can be swapped for ',';
# every other column is passed through untouched.
df_ptbr = df.select([
    regexp_replace(col(c).cast(StringType()), "\\.", ",").alias(c)
    if c in fractional_cols
    else col(c)
    for c in df.columns
])

# coalesce(1) yields a single part file; ';' is used as the delimiter so the
# decimal commas do not collide with the field separator.
(
    df_ptbr.coalesce(1)
    .write
    .mode("overwrite")
    .option("header", "true")
    .option("delimiter", ";")
    .csv("/FileStore/fact_metro_bi_csv")
)

In [0]:
#fact_bus_bi
# Export tfm.fact_bus_bi as a single ';'-delimited CSV in which fractional
# numbers use a decimal comma instead of a decimal point.

from pyspark.sql.functions import col, regexp_replace
from pyspark.sql.types import StringType

df = spark.table("tfm.fact_bus_bi")

# Columns whose values are rendered with a decimal point. Spark reports
# decimal columns as "decimal(p,s)", so they are matched by prefix; an exact
# comparison against "decimal" would never succeed. Computed once up front
# rather than rebuilding the dtype lookup for every column.
fractional_cols = {
    name
    for name, dtype in df.dtypes
    if dtype in ("double", "float") or dtype.startswith("decimal")
}

# Fractional columns are cast to string so the '.' can be swapped for ',';
# every other column is passed through untouched.
df_ptbr = df.select([
    regexp_replace(col(c).cast(StringType()), "\\.", ",").alias(c)
    if c in fractional_cols
    else col(c)
    for c in df.columns
])

# coalesce(1) yields a single part file; ';' is used as the delimiter so the
# decimal commas do not collide with the field separator.
(
    df_ptbr.coalesce(1)
    .write
    .mode("overwrite")
    .option("header", "true")
    .option("delimiter", ";")
    .csv("/FileStore/fact_bus_bi_csv")
)


In [0]:
#calendar_bi
# Export tfm.calendar_bi as a single ';'-delimited CSV in which fractional
# numbers use a decimal comma instead of a decimal point.

from pyspark.sql.functions import col, regexp_replace
from pyspark.sql.types import StringType

df = spark.table("tfm.calendar_bi")

# Columns whose values are rendered with a decimal point. Spark reports
# decimal columns as "decimal(p,s)", so they are matched by prefix; an exact
# comparison against "decimal" would never succeed. Computed once up front
# rather than rebuilding the dtype lookup for every column.
fractional_cols = {
    name
    for name, dtype in df.dtypes
    if dtype in ("double", "float") or dtype.startswith("decimal")
}

# Fractional columns are cast to string so the '.' can be swapped for ',';
# every other column is passed through untouched.
df_ptbr = df.select([
    regexp_replace(col(c).cast(StringType()), "\\.", ",").alias(c)
    if c in fractional_cols
    else col(c)
    for c in df.columns
])

# coalesce(1) yields a single part file; ';' is used as the delimiter so the
# decimal commas do not collide with the field separator.
(
    df_ptbr.coalesce(1)
    .write
    .mode("overwrite")
    .option("header", "true")
    .option("delimiter", ";")
    .csv("/FileStore/calendar_bi_csv")
)

**5.2 Renombrar csv**

1) fact_bike_bi

In [0]:
dbutils.fs.ls("/FileStore/fact_bike_bi_csv")

[FileInfo(path='dbfs:/FileStore/fact_bike_bi_csv/_committed_1492264521345489118', name='_committed_1492264521345489118', size=212, modificationTime=1769167375000),
 FileInfo(path='dbfs:/FileStore/fact_bike_bi_csv/_committed_1896611525517452445', name='_committed_1896611525517452445', size=113, modificationTime=1769169635000),
 FileInfo(path='dbfs:/FileStore/fact_bike_bi_csv/_committed_2435150842944057280', name='_committed_2435150842944057280', size=201, modificationTime=1769167378000),
 FileInfo(path='dbfs:/FileStore/fact_bike_bi_csv/_committed_2612321440962858018', name='_committed_2612321440962858018', size=201, modificationTime=1769167383000),
 FileInfo(path='dbfs:/FileStore/fact_bike_bi_csv/_committed_3075721120462998088', name='_committed_3075721120462998088', size=201, modificationTime=1769167377000),
 FileInfo(path='dbfs:/FileStore/fact_bike_bi_csv/_committed_7975576087702896406', name='_committed_7975576087702896406', size=113, modificationTime=1769262028000),
 FileInfo(path='

In [0]:
# Move the CSV part file out of its output folder to a stable, downloadable
# name. The part file name embeds a per-write id (the "tid-..." segment), so it
# is looked up here instead of being hard-coded for one specific run.
csv_parts = [
    f
    for f in dbutils.fs.ls("dbfs:/FileStore/fact_bike_bi_csv")
    if f.name.startswith("part-") and f.name.endswith(".csv")
]
if not csv_parts:
    raise FileNotFoundError(
        "No CSV part file found in dbfs:/FileStore/fact_bike_bi_csv; "
        "re-run the export cell first."
    )

# If more than one part file is present, take the most recently modified one.
latest_part = max(csv_parts, key=lambda f: f.modificationTime)

dbutils.fs.mv(latest_part.path, "dbfs:/FileStore/fact_bike_bi.csv")

True

Ruta para descargarse la tabla fact_bike_bi:

https://adb-312340046462958.18.azuredatabricks.net/files/fact_bike_bi.csv


2) fact_bike_station_bi

In [0]:
dbutils.fs.ls("/FileStore/fact_bike_station_bi_csv")

[FileInfo(path='dbfs:/FileStore/fact_bike_station_bi_csv/_SUCCESS', name='_SUCCESS', size=0, modificationTime=1769170272000),
 FileInfo(path='dbfs:/FileStore/fact_bike_station_bi_csv/_committed_4995226002666872585', name='_committed_4995226002666872585', size=113, modificationTime=1769167515000),
 FileInfo(path='dbfs:/FileStore/fact_bike_station_bi_csv/_committed_5893947591247601326', name='_committed_5893947591247601326', size=123, modificationTime=1769170272000),
 FileInfo(path='dbfs:/FileStore/fact_bike_station_bi_csv/_committed_vacuum1405265905625558248', name='_committed_vacuum1405265905625558248', size=96, modificationTime=1769170272000),
 FileInfo(path='dbfs:/FileStore/fact_bike_station_bi_csv/_started_5893947591247601326', name='_started_5893947591247601326', size=0, modificationTime=1769170260000),
 FileInfo(path='dbfs:/FileStore/fact_bike_station_bi_csv/part-00000-tid-5893947591247601326-7b1e16eb-ea47-4a48-a161-2beb5eab4585-316-1-c000.csv', name='part-00000-tid-58939475912476

In [0]:
# Move the CSV part file out of its output folder to a stable, downloadable
# name. The part file name embeds a per-write id (the "tid-..." segment), so it
# is looked up here instead of being hard-coded for one specific run.
csv_parts = [
    f
    for f in dbutils.fs.ls("dbfs:/FileStore/fact_bike_station_bi_csv")
    if f.name.startswith("part-") and f.name.endswith(".csv")
]
if not csv_parts:
    raise FileNotFoundError(
        "No CSV part file found in dbfs:/FileStore/fact_bike_station_bi_csv; "
        "re-run the export cell first."
    )

# If more than one part file is present, take the most recently modified one.
latest_part = max(csv_parts, key=lambda f: f.modificationTime)

dbutils.fs.mv(latest_part.path, "dbfs:/FileStore/fact_bike_station_bi.csv")

True

Ruta para descargarse la tabla fact_bike_station_bi:
    
https://adb-312340046462958.18.azuredatabricks.net/files/fact_bike_station_bi.csv

3) fact_metro_bi

In [0]:
dbutils.fs.ls("/FileStore/fact_metro_bi_csv")

[FileInfo(path='dbfs:/FileStore/fact_metro_bi_csv/_SUCCESS', name='_SUCCESS', size=0, modificationTime=1769170356000),
 FileInfo(path='dbfs:/FileStore/fact_metro_bi_csv/_committed_2614356725398274160', name='_committed_2614356725398274160', size=123, modificationTime=1769170356000),
 FileInfo(path='dbfs:/FileStore/fact_metro_bi_csv/_committed_7569864101295097704', name='_committed_7569864101295097704', size=113, modificationTime=1769167518000),
 FileInfo(path='dbfs:/FileStore/fact_metro_bi_csv/_committed_vacuum1189360264552556752', name='_committed_vacuum1189360264552556752', size=96, modificationTime=1769170356000),
 FileInfo(path='dbfs:/FileStore/fact_metro_bi_csv/_started_2614356725398274160', name='_started_2614356725398274160', size=0, modificationTime=1769170353000),
 FileInfo(path='dbfs:/FileStore/fact_metro_bi_csv/part-00000-tid-2614356725398274160-dcb63fb2-9899-4345-b57c-60415db22df4-317-1-c000.csv', name='part-00000-tid-2614356725398274160-dcb63fb2-9899-4345-b57c-60415db22df4

In [0]:
# Move the CSV part file out of its output folder to a stable, downloadable
# name. The part file name embeds a per-write id (the "tid-..." segment), so it
# is looked up here instead of being hard-coded for one specific run.
csv_parts = [
    f
    for f in dbutils.fs.ls("dbfs:/FileStore/fact_metro_bi_csv")
    if f.name.startswith("part-") and f.name.endswith(".csv")
]
if not csv_parts:
    raise FileNotFoundError(
        "No CSV part file found in dbfs:/FileStore/fact_metro_bi_csv; "
        "re-run the export cell first."
    )

# If more than one part file is present, take the most recently modified one.
latest_part = max(csv_parts, key=lambda f: f.modificationTime)

dbutils.fs.mv(latest_part.path, "dbfs:/FileStore/fact_metro_bi.csv")

True

Ruta para descargarse la tabla fact_metro_bi:

https://adb-312340046462958.18.azuredatabricks.net/files/fact_metro_bi.csv

4) fact_bus_bi

In [0]:
dbutils.fs.ls("/FileStore/fact_bus_bi_csv")

[FileInfo(path='dbfs:/FileStore/fact_bus_bi_csv/_SUCCESS', name='_SUCCESS', size=0, modificationTime=1769170395000),
 FileInfo(path='dbfs:/FileStore/fact_bus_bi_csv/_committed_352511024607769114', name='_committed_352511024607769114', size=122, modificationTime=1769170395000),
 FileInfo(path='dbfs:/FileStore/fact_bus_bi_csv/_committed_6331237353924234624', name='_committed_6331237353924234624', size=113, modificationTime=1769167520000),
 FileInfo(path='dbfs:/FileStore/fact_bus_bi_csv/_committed_vacuum8291210836450602433', name='_committed_vacuum8291210836450602433', size=96, modificationTime=1769170396000),
 FileInfo(path='dbfs:/FileStore/fact_bus_bi_csv/_started_352511024607769114', name='_started_352511024607769114', size=0, modificationTime=1769170394000),
 FileInfo(path='dbfs:/FileStore/fact_bus_bi_csv/part-00000-tid-352511024607769114-fe71658d-5c11-4d36-9789-dfab0768a611-318-1-c000.csv', name='part-00000-tid-352511024607769114-fe71658d-5c11-4d36-9789-dfab0768a611-318-1-c000.csv', 

In [0]:
# Move the CSV part file out of its output folder to a stable, downloadable
# name. The part file name embeds a per-write id (the "tid-..." segment), so it
# is looked up here instead of being hard-coded for one specific run.
csv_parts = [
    f
    for f in dbutils.fs.ls("dbfs:/FileStore/fact_bus_bi_csv")
    if f.name.startswith("part-") and f.name.endswith(".csv")
]
if not csv_parts:
    raise FileNotFoundError(
        "No CSV part file found in dbfs:/FileStore/fact_bus_bi_csv; "
        "re-run the export cell first."
    )

# If more than one part file is present, take the most recently modified one.
latest_part = max(csv_parts, key=lambda f: f.modificationTime)

dbutils.fs.mv(latest_part.path, "dbfs:/FileStore/fact_bus_bi.csv")

True

Ruta para descargarse la tabla fact_bus_bi:
    
https://adb-312340046462958.18.azuredatabricks.net/files/fact_bus_bi.csv

5) calendar_bi

In [0]:
dbutils.fs.ls("/FileStore/calendar_bi_csv")

[FileInfo(path='dbfs:/FileStore/calendar_bi_csv/_SUCCESS', name='_SUCCESS', size=0, modificationTime=1769170429000),
 FileInfo(path='dbfs:/FileStore/calendar_bi_csv/_committed_1729968780118233917', name='_committed_1729968780118233917', size=113, modificationTime=1769167521000),
 FileInfo(path='dbfs:/FileStore/calendar_bi_csv/_committed_7250859941875114159', name='_committed_7250859941875114159', size=123, modificationTime=1769170429000),
 FileInfo(path='dbfs:/FileStore/calendar_bi_csv/_committed_vacuum6333415130086096811', name='_committed_vacuum6333415130086096811', size=96, modificationTime=1769170430000),
 FileInfo(path='dbfs:/FileStore/calendar_bi_csv/_started_7250859941875114159', name='_started_7250859941875114159', size=0, modificationTime=1769170429000),
 FileInfo(path='dbfs:/FileStore/calendar_bi_csv/part-00000-tid-7250859941875114159-b6cbc3d4-2b31-4ead-b7d1-76c251d4c6c6-319-1-c000.csv', name='part-00000-tid-7250859941875114159-b6cbc3d4-2b31-4ead-b7d1-76c251d4c6c6-319-1-c000.

In [0]:
# Move the CSV part file out of its output folder to a stable, downloadable
# name. The part file name embeds a per-write id (the "tid-..." segment), so it
# is looked up here instead of being hard-coded for one specific run.
csv_parts = [
    f
    for f in dbutils.fs.ls("dbfs:/FileStore/calendar_bi_csv")
    if f.name.startswith("part-") and f.name.endswith(".csv")
]
if not csv_parts:
    raise FileNotFoundError(
        "No CSV part file found in dbfs:/FileStore/calendar_bi_csv; "
        "re-run the export cell first."
    )

# If more than one part file is present, take the most recently modified one.
latest_part = max(csv_parts, key=lambda f: f.modificationTime)

dbutils.fs.mv(latest_part.path, "dbfs:/FileStore/calendar_bi.csv")

True

Ruta para descargarse la tabla calendar_bi:

https://adb-312340046462958.18.azuredatabricks.net/files/calendar_bi.csv