# Task 3.1. EXTRACT data from those raw CSV files in Bronze data store

Predefined parameters:

In [0]:
# Row cap passed to limit() whenever a table is previewed with display() below.
nrows_display = 20

## 1. Payment table

- (Fact) Payment:
    + `payment_id` : surrogate primary key of the payment, int format.
    + `date` : date of the payment, date format of `yyyy-mm-dd`.
    + `amount` : payment amount, float format.
    + `rider_id` : foreign key referencing the ID of the rider, int format (loaded below under the column name `rider`).

In [0]:
# Read the header-less payments CSV with schema inference disabled, so every
# column is loaded as a string, then assign the column names positionally.
# NOTE(review): the last column is named "rider" here, while the table
# description above calls it `rider_id` — confirm which name downstream expects.
payment_delta = (
    spark.read
    .format("csv")
    .options(inferSchema="false", header="false", sep=",")
    .load("dbfs:/FileStore/tables/payments.csv")
    .toDF("payment_id", "date", "amount", "rider")
)

Once the data is loaded, review the schema.

In [0]:
# Print the column names and types of the loaded payment DataFrame.
payment_delta.printSchema()

root
 |-- payment_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- rider: string (nullable = true)



Also, preview the first few rows to verify the data.

In [0]:
# Preview at most nrows_display rows of the payment data.
display(payment_delta.limit(nrows_display))

payment_id,date,amount,rider
1,2019-05-01,9.0,1000
2,2019-06-01,9.0,1000
3,2019-07-01,9.0,1000
4,2019-08-01,9.0,1000
5,2019-09-01,9.0,1000
6,2019-10-01,9.0,1000
7,2019-11-01,9.0,1000
8,2019-12-01,9.0,1000
9,2020-01-01,9.0,1000
10,2020-02-01,9.0,1000


Finally, write the data to its Delta file location.

In [0]:
# Persist the payment data in Delta format, replacing any existing data
# (and its schema) at the target path.
(
    payment_delta.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("/delta/payments")
)

Then do the same for the other 4 tables.

## 2. Trip table

In [0]:
# Read the header-less trips CSV with schema inference disabled, so every
# column is loaded as a string, then assign the column names positionally.
trip_delta = (
    spark.read
    .format("csv")
    .options(inferSchema="false", header="false", sep=",")
    .load("dbfs:/FileStore/tables/trips.csv")
    .toDF(
        "trip_id", "rideable_type",
        "started_at", "ended_at",
        "start_station_id", "end_station_id",
        "rider_id",
    )
)

In [0]:
# Print the column names and types of the loaded trip DataFrame.
trip_delta.printSchema()

root
 |-- trip_id: string (nullable = true)
 |-- rideable_type: string (nullable = true)
 |-- started_at: string (nullable = true)
 |-- ended_at: string (nullable = true)
 |-- start_station_id: string (nullable = true)
 |-- end_station_id: string (nullable = true)
 |-- rider_id: string (nullable = true)



In [0]:
# Preview at most nrows_display rows of the trip data.
display(trip_delta.limit(nrows_display))

trip_id,rideable_type,started_at,ended_at,start_station_id,end_station_id,rider_id
89E7AA6C29227EFF,classic_bike,2021-02-12 16:14:56,2021-02-12 16:21:43,525,660,71934
0FEFDE2603568365,classic_bike,2021-02-14 17:52:38,2021-02-14 18:12:09,525,16806,47854
E6159D746B2DBB91,electric_bike,2021-02-09 19:10:18,2021-02-09 19:19:10,KA1503000012,TA1305000029,70870
B32D3199F1C2E75B,classic_bike,2021-02-02 17:49:41,2021-02-02 17:54:06,637,TA1305000034,58974
83E463F23575F4BF,electric_bike,2021-02-23 15:07:23,2021-02-23 15:22:37,13216,TA1309000055,39608
BDAA7E3494E8D545,electric_bike,2021-02-24 15:43:33,2021-02-24 15:49:05,18003,KP1705001026,36267
A772742351171257,classic_bike,2021-02-01 17:47:42,2021-02-01 17:48:33,KP1705001026,KP1705001026,50104
295476889D9B79F8,classic_bike,2021-02-11 18:33:53,2021-02-11 18:35:09,18003,18003,19618
362087194BA4CC9A,classic_bike,2021-02-27 15:13:39,2021-02-27 15:36:36,KP1705001026,KP1705001026,16732
21630F715038CCB0,classic_bike,2021-02-20 08:59:42,2021-02-20 09:17:04,KP1705001026,KP1705001026,57068


In [0]:
# Persist the trip data in Delta format, replacing any existing data
# (and its schema) at the target path.
(
    trip_delta.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("/delta/trips")
)

## 3. Rider table

In [0]:
# Read the header-less riders CSV with schema inference disabled, so every
# column is loaded as a string, then assign the column names positionally.
rider_delta = (
    spark.read
    .format("csv")
    .options(inferSchema="false", header="false", sep=",")
    .load("dbfs:/FileStore/tables/riders.csv")
    .toDF(
        "rider_id", "first", "last", "address",
        "birthday", "account_start_date", "account_end_date",
        "is_member",
    )
)

In [0]:
# Print the column names and types of the loaded rider DataFrame.
rider_delta.printSchema()

root
 |-- rider_id: string (nullable = true)
 |-- first: string (nullable = true)
 |-- last: string (nullable = true)
 |-- address: string (nullable = true)
 |-- birthday: string (nullable = true)
 |-- account_start_date: string (nullable = true)
 |-- account_end_date: string (nullable = true)
 |-- is_member: string (nullable = true)



In [0]:
# Preview at most nrows_display rows of the rider data.
display(rider_delta.limit(nrows_display))

rider_id,first,last,address,birthday,account_start_date,account_end_date,is_member
1000,Diana,Clark,1200 Alyssa Squares,1989-02-13,2019-04-23,,True
1001,Jennifer,Smith,397 Diana Ferry,1976-08-10,2019-11-01,2020-09-01,True
1002,Karen,Smith,644 Brittany Row Apt. 097,1998-08-10,2022-02-04,,True
1003,Bryan,Roberts,996 Dickerson Turnpike,1999-03-29,2019-08-26,,False
1004,Jesse,Middleton,7009 Nathan Expressway,1969-04-11,2019-09-14,,True
1005,Christine,Rodriguez,224 Washington Mills Apt. 467,1974-08-27,2020-03-24,,False
1006,Alicia,Taylor,1137 Angela Locks,2004-01-30,2020-11-27,2021-12-01,True
1007,Benjamin,Fernandez,979 Phillips Ways,1988-01-11,2016-12-11,,False
1008,John,Crawford,7691 Evans Court,1987-02-21,2021-03-28,2021-07-01,True
1009,Victoria,Ritter,9922 Jim Crest Apt. 319,1981-02-07,2020-06-12,2021-11-01,True


In [0]:
# Persist the rider data in Delta format, replacing any existing data
# (and its schema) at the target path.
(
    rider_delta.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("/delta/riders")
)

## 4. Station table

In [0]:
# Read the header-less stations CSV with schema inference disabled, so every
# column is loaded as a string, then assign the column names positionally.
station_delta = (
    spark.read
    .format("csv")
    .options(inferSchema="false", header="false", sep=",")
    .load("dbfs:/FileStore/tables/stations.csv")
    .toDF("station_id", "name", "latitude", "longitude")
)

In [0]:
# Print the column names and types of the loaded station DataFrame.
station_delta.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)



In [0]:
# Preview at most nrows_display rows of the station data.
display(station_delta.limit(nrows_display))

station_id,name,latitude,longitude
525,Glenwood Ave & Touhy Ave,42.012701,-87.66605799999999
KA1503000012,Clark St & Lake St,41.88579466666667,-87.63110066666668
637,Wood St & Chicago Ave,41.895634,-87.672069
13216,State St & 33rd St,41.8347335,-87.6258275
18003,Fairbanks St & Superior St,41.89580766666667,-87.62025316666669
KP1705001026,LaSalle Dr & Huron St,41.894877,-87.632326
13253,Lincoln Ave & Waveland Ave,41.948797,-87.675278
KA1503000044,Rush St & Hubbard St,41.890173,-87.62618499999999
KA1504000140,Winchester Ave & Elston Ave,41.92403733333333,-87.67641483333334
TA1305000032,Clinton St & Madison St,41.882242,-87.64106600000001


In [0]:
# Persist the station data in Delta format, replacing any existing data
# (and its schema) at the target path.
(
    station_delta.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("/delta/stations")
)

## 5. Date table

In [0]:
# Read the header-less dates CSV with schema inference disabled, so every
# column is loaded as a string, then assign the column names positionally.
date_delta = (
    spark.read
    .format("csv")
    .options(inferSchema="false", header="false", sep=",")
    .load("dbfs:/FileStore/tables/dates.csv")
    .toDF(
        "date_id", "day", "month", "quarter", "year",
        "day_of_week", "day_of_year",
    )
)

In [0]:
# Print the column names and types of the loaded date DataFrame.
date_delta.printSchema()

root
 |-- date_id: string (nullable = true)
 |-- day: string (nullable = true)
 |-- month: string (nullable = true)
 |-- quarter: string (nullable = true)
 |-- year: string (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- day_of_year: string (nullable = true)



In [0]:
# Preview at most nrows_display rows of the date data.
display(date_delta.limit(nrows_display))

date_id,day,month,quarter,year,day_of_week,day_of_year
2013-01-01,1,1,1,2013,3,1
2013-01-02,2,1,1,2013,4,2
2013-01-03,3,1,1,2013,5,3
2013-01-04,4,1,1,2013,6,4
2013-01-05,5,1,1,2013,7,5
2013-01-06,6,1,1,2013,1,6
2013-01-07,7,1,1,2013,2,7
2013-01-08,8,1,1,2013,3,8
2013-01-09,9,1,1,2013,4,9
2013-01-10,10,1,1,2013,5,10


In [0]:
# Persist the date data in Delta format, replacing any existing data
# (and its schema) at the target path.
(
    date_delta.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("/delta/dates")
)