# Extract data from DBFS into dataframes

In [None]:
from pyspark.sql.types import *

### Trips dataset

In [None]:

# Explicit schema for the trips CSV. The file is read with header disabled,
# so the column names below are the ones the DataFrame ends up with.
# Note: nullable=False on trip_id does not survive the CSV read --
# printSchema() on the result reports nullable = true for every column.
schema_trips = (
    StructType()
    .add("trip_id", StringType(), False)
    .add("rideable_type", StringType(), True)
    .add("started_at", TimestampType(), True)
    .add("ended_at", TimestampType(), True)
    # Station ids are strings: values such as "KA1503000012" are not numeric.
    .add("start_station_id", StringType(), True)
    .add("end_station_id", StringType(), True)
    .add("member_id", IntegerType(), True)
)

# Load the raw trips file from DBFS using the schema above (no inference).
df_trips = (
    spark.read.format("csv")
    .options(inferSchema="false", header="false", sep=",")
    .schema(schema_trips)
    .load("/FileStore/divvy/trips.csv")
)

# Preview the first five rows.
df_trips.show(5)

+----------------+-------------+-------------------+-------------------+----------------+--------------+---------+
|         trip_id|rideable_type|         started_at|           ended_at|start_station_id|end_station_id|member_id|
+----------------+-------------+-------------------+-------------------+----------------+--------------+---------+
|89E7AA6C29227EFF| classic_bike|2021-02-12 16:14:56|2021-02-12 16:21:43|             525|           660|    71934|
|0FEFDE2603568365| classic_bike|2021-02-14 17:52:38|2021-02-14 18:12:09|             525|         16806|    47854|
|E6159D746B2DBB91|electric_bike|2021-02-09 19:10:18|2021-02-09 19:19:10|    KA1503000012|  TA1305000029|    70870|
|B32D3199F1C2E75B| classic_bike|2021-02-02 17:49:41|2021-02-02 17:54:06|             637|  TA1305000034|    58974|
|83E463F23575F4BF|electric_bike|2021-02-23 15:07:23|2021-02-23 15:22:37|           13216|  TA1309000055|    39608|
+----------------+-------------+-------------------+-------------------+----------------+--------------+---------+

In [None]:
# Print each column's name, type and nullability for the loaded trips DataFrame.
df_trips.printSchema()

root
 |-- trip_id: string (nullable = true)
 |-- rideable_type: string (nullable = true)
 |-- started_at: timestamp (nullable = true)
 |-- ended_at: timestamp (nullable = true)
 |-- start_station_id: string (nullable = true)
 |-- end_station_id: string (nullable = true)
 |-- member_id: integer (nullable = true)



### Payment dataset

In [None]:
# Explicit schema for the payments CSV. The file is read with header
# disabled, so the column names come from this schema.
# Note: nullable=False on payment_id does not survive the CSV read --
# printSchema() on the result reports nullable = true for every column.
schema_payment = (
    StructType()
    .add("payment_id", IntegerType(), False)
    .add("date", TimestampType(), True)
    # NOTE(review): amount looks like a monetary value; FloatType may lose
    # precision. Consider DecimalType -- confirm against downstream consumers.
    .add("amount", FloatType(), True)
    .add("account_number", IntegerType(), True)
)

# Load the raw payments file from DBFS using the schema above (no inference).
df_payment = (
    spark.read.format("csv")
    .options(inferSchema="false", header="false", sep=",")
    .schema(schema_payment)
    .load("/FileStore/divvy/payments.csv")
)

# Preview the first five rows.
df_payment.show(5)

+----------+-------------------+------+--------------+
|payment_id|               date|amount|account_number|
+----------+-------------------+------+--------------+
|         1|2019-05-01 00:00:00|   9.0|          1000|
|         2|2019-06-01 00:00:00|   9.0|          1000|
|         3|2019-07-01 00:00:00|   9.0|          1000|
|         4|2019-08-01 00:00:00|   9.0|          1000|
|         5|2019-09-01 00:00:00|   9.0|          1000|
+----------+-------------------+------+--------------+
only showing top 5 rows



In [None]:
# Print each column's name, type and nullability for the loaded payments DataFrame.
df_payment.printSchema()

root
 |-- payment_id: integer (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- amount: float (nullable = true)
 |-- account_number: integer (nullable = true)



### Rider dataset

In [None]:
# Explicit schema for the riders CSV. The file is read with header disabled,
# so the column names come from this schema.
# Note: nullable=False on rider_id does not survive the CSV read --
# printSchema() on the result reports nullable = true for every column.
schema_rider = (
    StructType()
    .add("rider_id", IntegerType(), False)
    .add("first", StringType(), True)
    .add("last", StringType(), True)
    .add("address", StringType(), True)
    # NOTE(review): the previewed birthdays all carry a 00:00:00 time part;
    # DateType may be a better fit -- confirm before changing.
    .add("birthday", TimestampType(), True)
    .add("start_date", TimestampType(), True)
    .add("end_date", TimestampType(), True)
    .add("member", BooleanType(), True)
)

# Load the raw riders file from DBFS using the schema above (no inference).
df_rider = (
    spark.read.format("csv")
    .options(inferSchema="false", header="false", sep=",")
    .schema(schema_rider)
    .load("/FileStore/divvy/riders.csv")
)

# Preview the first five rows.
df_rider.show(5)

+--------+--------+---------+--------------------+-------------------+-------------------+-------------------+------+
|rider_id|   first|     last|             address|           birthday|         start_date|           end_date|member|
+--------+--------+---------+--------------------+-------------------+-------------------+-------------------+------+
|    1000|   Diana|    Clark| 1200 Alyssa Squares|1989-02-13 00:00:00|2019-04-23 00:00:00|               NULL|  true|
|    1001|Jennifer|    Smith|     397 Diana Ferry|1976-08-10 00:00:00|2019-11-01 00:00:00|2020-09-01 00:00:00|  true|
|    1002|   Karen|    Smith|644 Brittany Row ...|1998-08-10 00:00:00|2022-02-04 00:00:00|               NULL|  true|
|    1003|   Bryan|  Roberts|996 Dickerson Tur...|1999-03-29 00:00:00|2019-08-26 00:00:00|               NULL| false|
|    1004|   Jesse|Middleton|7009 Nathan Expre...|1969-04-11 00:00:00|2019-09-14 00:00:00|               NULL|  true|
+--------+--------+---------+--------------------+-------------------+-------------------+-------------------+------+

In [None]:
# Print each column's name, type and nullability for the loaded riders DataFrame.
df_rider.printSchema()

root
 |-- rider_id: integer (nullable = true)
 |-- first: string (nullable = true)
 |-- last: string (nullable = true)
 |-- address: string (nullable = true)
 |-- birthday: timestamp (nullable = true)
 |-- start_date: timestamp (nullable = true)
 |-- end_date: timestamp (nullable = true)
 |-- member: boolean (nullable = true)



### Station dataset

In [None]:
# Explicit schema for the stations CSV. The file is read with header
# disabled, so the column names come from this schema.
# Note: nullable=False on station_id does not survive the CSV read --
# printSchema() on the result reports nullable = true for every column.
schema_station = (
    StructType()
    # Station ids are strings: values such as "KA1503000012" are not numeric.
    .add("station_id", StringType(), False)
    .add("name", StringType(), True)
    # NOTE(review): FloatType is single precision, which may truncate
    # coordinates; consider DoubleType -- confirm against the source file.
    .add("latitude", FloatType(), True)
    .add("longitude", FloatType(), True)
)

# Load the raw stations file from DBFS using the schema above (no inference).
df_station = (
    spark.read.format("csv")
    .options(inferSchema="false", header="false", sep=",")
    .schema(schema_station)
    .load("/FileStore/divvy/stations.csv")
)

# Preview the first five rows.
df_station.show(5)

+------------+--------------------+---------+----------+
|  station_id|                name| latitude| longitude|
+------------+--------------------+---------+----------+
|         525|Glenwood Ave & To...|  42.0127| -87.66606|
|KA1503000012|  Clark St & Lake St|41.885796|  -87.6311|
|         637|Wood St & Chicago...|41.895634|-87.672066|
|       13216|  State St & 33rd St|41.834732|-87.625824|
|       18003|Fairbanks St & Su...| 41.89581|-87.620255|
+------------+--------------------+---------+----------+
only showing top 5 rows



In [None]:
# Print each column's name, type and nullability for the loaded stations DataFrame.
df_station.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)



# Ingest dataframes into Delta

In [None]:
# Write the trips DataFrame in Delta format to /delta/bronze_trips,
# using overwrite mode so a re-run replaces what is already there.
(
    df_trips.write
    .format("delta")
    .mode("overwrite")
    .save("/delta/bronze_trips")
)

In [None]:
# Write the payments DataFrame in Delta format to /delta/bronze_payments,
# using overwrite mode so a re-run replaces what is already there.
(
    df_payment.write
    .format("delta")
    .mode("overwrite")
    .save("/delta/bronze_payments")
)

In [None]:
# Write the riders DataFrame in Delta format to /delta/bronze_riders,
# using overwrite mode so a re-run replaces what is already there.
(
    df_rider.write
    .format("delta")
    .mode("overwrite")
    .save("/delta/bronze_riders")
)

In [None]:
# Write the stations DataFrame in Delta format to /delta/bronze_stations,
# using overwrite mode so a re-run replaces what is already there.
(
    df_station.write
    .format("delta")
    .mode("overwrite")
    .save("/delta/bronze_stations")
)