## Read Clicks Kinesis streams and produce the Bronze and Silver medallion tables

In [0]:
%sql
-- Make sparsha_aws_cdl the current database for this notebook session.
USE sparsha_aws_cdl;

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Schema of the JSON document carried by each click record.
# All fields are strings except clickTimestamp, which is parsed as a timestamp.
clicks_schema = StructType([
    StructField("timestamp", StringType()),
    StructField("clickTimestamp", TimestampType()),
    StructField("txId", StringType()),
    StructField("userSessionId", StringType()),
    StructField("teamId", StringType()),
    StructField("userId", StringType()),
    StructField("adId", StringType()),
    StructField("adCategory", StringType()),
])

# Streaming source: the "sparsha-aws-cdl-clicks" Kinesis stream in us-east-1,
# starting from the latest position.
clicks = (
    spark.readStream
    .format("kinesis")
    .options(
        streamName="sparsha-aws-cdl-clicks",
        initialPosition="latest",
        region="us-east-1",
    )
    .load()
)

In [0]:
# Keep the record payload as an unparsed JSON string (jsonData) together with the
# Kinesis arrival timestamp, and derive the arrival date from that timestamp.
clicks_raw = (
    clicks
    .select(
        col("data").cast("string").alias("jsonData"),
        col("approximateArrivalTimestamp"),
    )
    .withColumn("approximateArrivalDate", to_date(col("approximateArrivalTimestamp")))
)
display(clicks_raw)

jsonData,approximateArrivalTimestamp,approximateArrivalDate
"{""timestamp"":""2016-06-11 01:15:12"",""clickTimestamp"":""2022-06-19 16:18:22.491"",""txId"":""29026"",""userSessionId"":""26897"",""teamId"":""90"",""userId"":""1972"",""adId"":""8"",""adCategory"":""hardware""}",2022-06-19T16:18:22.594+0000,2022-06-19
"{""timestamp"":""2016-06-14 19:30:42"",""clickTimestamp"":""2022-06-19 16:18:22.517"",""txId"":""36706"",""userSessionId"":""36348"",""teamId"":""120"",""userId"":""1210"",""adId"":""20"",""adCategory"":""clothing""}",2022-06-19T16:18:22.620+0000,2022-06-19
"{""timestamp"":""2016-06-11 05:10:08"",""clickTimestamp"":""2022-06-19 16:18:22.679"",""txId"":""29248"",""userSessionId"":""26654"",""teamId"":""57"",""userId"":""2221"",""adId"":""24"",""adCategory"":""clothing""}",2022-06-19T16:18:22.785+0000,2022-06-19
"{""timestamp"":""2016-06-03 20:52:22"",""clickTimestamp"":""2022-06-19 16:18:22.709"",""txId"":""17074"",""userSessionId"":""16113"",""teamId"":""72"",""userId"":""455"",""adId"":""10"",""adCategory"":""fashion""}",2022-06-19T16:18:22.812+0000,2022-06-19
"{""timestamp"":""2016-06-13 04:23:52"",""clickTimestamp"":""2022-06-19 16:18:22.85"",""txId"":""32874"",""userSessionId"":""26472"",""teamId"":""20"",""userId"":""1806"",""adId"":""7"",""adCategory"":""hardware""}",2022-06-19T16:18:22.953+0000,2022-06-19
"{""timestamp"":""2016-06-15 23:11:11"",""clickTimestamp"":""2022-06-19 16:18:22.876"",""txId"":""38827"",""userSessionId"":""34565"",""teamId"":""66"",""userId"":""426"",""adId"":""17"",""adCategory"":""games""}",2022-06-19T16:18:22.978+0000,2022-06-19
"{""timestamp"":""2016-06-15 22:34:40"",""clickTimestamp"":""2022-06-19 16:18:23.015"",""txId"":""38756"",""userSessionId"":""33803"",""teamId"":""156"",""userId"":""294"",""adId"":""7"",""adCategory"":""hardware""}",2022-06-19T16:18:23.118+0000,2022-06-19
"{""timestamp"":""2016-06-06 16:48:01"",""clickTimestamp"":""2022-06-19 16:18:23.042"",""txId"":""21715"",""userSessionId"":""20893"",""teamId"":""75"",""userId"":""1525"",""adId"":""22"",""adCategory"":""computers""}",2022-06-19T16:18:23.145+0000,2022-06-19
"{""timestamp"":""2016-05-30 03:14:47"",""clickTimestamp"":""2022-06-19 16:18:23.187"",""txId"":""10567"",""userSessionId"":""9785"",""teamId"":""32"",""userId"":""2336"",""adId"":""18"",""adCategory"":""games""}",2022-06-19T16:18:23.290+0000,2022-06-19
"{""timestamp"":""2016-06-10 08:39:00"",""clickTimestamp"":""2022-06-19 16:18:23.22"",""txId"":""28035"",""userSessionId"":""24349"",""teamId"":""129"",""userId"":""2340"",""adId"":""14"",""adCategory"":""fashion""}",2022-06-19T16:18:23.323+0000,2022-06-19


## Write Click data from Kinesis to Bronze delta table

Write to Bronze Table

In [0]:
# Append the raw click stream to the Bronze Delta path every 30 seconds,
# partitioned by arrival date; progress is tracked in the checkpoint location.
(
    clicks_raw.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", "s3://sparsha-aws-cdl/checkpoint-clicks-bronze")
    .partitionBy("approximateArrivalDate")
    .trigger(processingTime="30 seconds")
    .start("s3://sparsha-aws-cdl/clicks-bronze")
)

Create Metastore Reference - Bronze Data

In [0]:
%sql
-- (Re)register the Bronze Delta path in the metastore as sparsha_aws_cdl.clicks_bronze.
DROP TABLE IF EXISTS sparsha_aws_cdl.clicks_bronze;

CREATE TABLE sparsha_aws_cdl.clicks_bronze
USING DELTA
LOCATION "s3://sparsha-aws-cdl/clicks-bronze"

Safely query the Clicks Bronze Data

In [0]:
%sql
-- Peek at up to ten Bronze rows.
SELECT *
FROM sparsha_aws_cdl.clicks_bronze
LIMIT 10;

jsonData,approximateArrivalTimestamp,approximateArrivalDate
"{""timestamp"":""2016-06-10 04:43:19"",""clickTimestamp"":""2022-06-19 16:26:15.026"",""txId"":""27773"",""userSessionId"":""26692"",""teamId"":""63"",""userId"":""212"",""adId"":""4"",""adCategory"":""games""}",2022-06-19T16:26:15.128+0000,2022-06-19
"{""timestamp"":""2016-06-09 00:09:21"",""clickTimestamp"":""2022-06-19 16:26:15.052"",""txId"":""25122"",""userSessionId"":""23264"",""teamId"":""111"",""userId"":""1016"",""adId"":""20"",""adCategory"":""clothing""}",2022-06-19T16:26:15.155+0000,2022-06-19
"{""timestamp"":""2016-06-03 12:40:36"",""clickTimestamp"":""2022-06-19 16:26:15.195"",""txId"":""16648"",""userSessionId"":""16232"",""teamId"":""99"",""userId"":""1"",""adId"":""9"",""adCategory"":""sports""}",2022-06-19T16:26:15.297+0000,2022-06-19
"{""timestamp"":""2016-06-09 04:44:14"",""clickTimestamp"":""2022-06-19 16:26:15.219"",""txId"":""25363"",""userSessionId"":""20798"",""teamId"":""63"",""userId"":""462"",""adId"":""14"",""adCategory"":""fashion""}",2022-06-19T16:26:15.321+0000,2022-06-19
"{""timestamp"":""2016-06-06 05:00:02"",""clickTimestamp"":""2022-06-19 16:26:14.9"",""txId"":""20404"",""userSessionId"":""15880"",""teamId"":""22"",""userId"":""76"",""adId"":""24"",""adCategory"":""clothing""}",2022-06-19T16:26:15.007+0000,2022-06-19
"{""timestamp"":""2016-06-07 23:20:40"",""clickTimestamp"":""2022-06-19 16:26:15.011"",""txId"":""23567"",""userSessionId"":""20757"",""teamId"":""54"",""userId"":""178"",""adId"":""15"",""adCategory"":""sports""}",2022-06-19T16:26:15.114+0000,2022-06-19
"{""timestamp"":""2016-06-13 17:53:58"",""clickTimestamp"":""2022-06-19 16:26:15.046"",""txId"":""33925"",""userSessionId"":""28998"",""teamId"":""119"",""userId"":""2356"",""adId"":""21"",""adCategory"":""movies""}",2022-06-19T16:26:15.149+0000,2022-06-19
"{""timestamp"":""2016-06-04 04:01:26"",""clickTimestamp"":""2022-06-19 16:26:15.059"",""txId"":""17524"",""userSessionId"":""16055"",""teamId"":""64"",""userId"":""1567"",""adId"":""14"",""adCategory"":""fashion""}",2022-06-19T16:26:15.161+0000,2022-06-19
"{""timestamp"":""2016-06-05 02:16:15"",""clickTimestamp"":""2022-06-19 16:26:15.064"",""txId"":""18787"",""userSessionId"":""15994"",""teamId"":""53"",""userId"":""670"",""adId"":""25"",""adCategory"":""computers""}",2022-06-19T16:26:15.167+0000,2022-06-19
"{""timestamp"":""2016-06-04 21:25:38"",""clickTimestamp"":""2022-06-19 16:26:15.07"",""txId"":""18493"",""userSessionId"":""15809"",""teamId"":""8"",""userId"":""1889"",""adId"":""14"",""adCategory"":""fashion""}",2022-06-19T16:26:15.173+0000,2022-06-19


In [0]:
%sql
-- Row count of the Bronze table at the time the cell runs.
SELECT COUNT(*)
FROM sparsha_aws_cdl.clicks_bronze

count(1)
7851


Read the Bronze Delta table as a stream and parse the JSON payload using the clicks schema

In [0]:
# Stream the Bronze Delta path, parse jsonData with clicks_schema into a struct,
# then flatten the struct's fields alongside the two arrival columns.
bronze_data = (
    spark.readStream
    .format("delta")
    .load("s3://sparsha-aws-cdl/clicks-bronze")
    .select(
        from_json(col("jsonData"), clicks_schema).alias("fields"),
        col("approximateArrivalDate"),
        col("approximateArrivalTimestamp"),
    )
    .select("fields.*", "approximateArrivalDate", "approximateArrivalTimestamp")
)

Perform some example cleaning: lowercase the ad category and derive a click date column

In [0]:
# Example cleaning: add a lowercased copy of adCategory (categoryCleaned) and a
# date column derived from clickTimestamp (clickDate).
bronze_data_cleaned = (
    bronze_data
    .withColumn("categoryCleaned", lower(col("adCategory")))
    .withColumn("clickDate", to_date(col("clickTimestamp")))
)

In [0]:
# Render the parsed and cleaned stream in the notebook for inspection.
display(bronze_data_cleaned)

timestamp,clickTimestamp,txId,userSessionId,teamId,userId,adId,adCategory,approximateArrivalDate,approximateArrivalTimestamp,categoryCleaned,clickDate
2016-06-04 04:01:26,2022-06-19T16:26:54.503+0000,17524,16055,64,1567,14,fashion,2022-06-19,2022-06-19T16:26:54.605+0000,fashion,2022-06-19
2016-06-12 17:02:59,2022-06-19T16:26:54.527+0000,32072,28435,71,1451,6,movies,2022-06-19,2022-06-19T16:26:54.630+0000,movies,2022-06-19
2016-06-08 02:55:30,2022-06-19T16:26:54.665+0000,23755,23624,25,1333,8,hardware,2022-06-19,2022-06-19T16:26:54.768+0000,hardware,2022-06-19
2016-06-09 12:07:46,2022-06-19T16:26:54.690+0000,25906,20837,66,426,4,games,2022-06-19,2022-06-19T16:26:54.792+0000,games,2022-06-19
2016-06-01 15:01:05,2022-06-19T16:26:54.830+0000,13753,12392,22,76,23,fashion,2022-06-19,2022-06-19T16:26:54.933+0000,fashion,2022-06-19
2016-06-15 05:38:03,2022-06-19T16:26:54.859+0000,37492,36884,123,1634,29,automotive,2022-06-19,2022-06-19T16:26:54.962+0000,automotive,2022-06-19
2016-06-04 04:01:26,2022-06-19T16:26:54.457+0000,17524,16055,64,1567,14,fashion,2022-06-19,2022-06-19T16:26:54.591+0000,fashion,2022-06-19
2016-06-10 22:33:06,2022-06-19T16:26:54.521+0000,28866,28616,128,1111,17,games,2022-06-19,2022-06-19T16:26:54.624+0000,games,2022-06-19
2016-06-04 04:43:57,2022-06-19T16:26:54.533+0000,17591,17044,61,409,6,movies,2022-06-19,2022-06-19T16:26:54.636+0000,movies,2022-06-19
2016-06-16 07:14:52,2022-06-19T16:26:54.539+0000,39478,34163,2,624,21,movies,2022-06-19,2022-06-19T16:26:54.642+0000,movies,2022-06-19


## Write Click data from Bronze to Silver delta table

Write parsed + cleaned Clicks data to silver

In [0]:
# Append the parsed, cleaned stream to the Silver Delta path every 30 seconds,
# partitioned by click date; progress is tracked in the checkpoint location.
(
    bronze_data_cleaned.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", "s3://sparsha-aws-cdl/checkpoint-clicks-silver")
    .partitionBy("clickDate")
    .trigger(processingTime="30 seconds")
    .start("s3://sparsha-aws-cdl/clicks-silver")
)

Create metastore reference - Silver Data

In [0]:
%sql
-- (Re)register the Silver Delta path in the metastore as sparsha_aws_cdl.clicks_silver.
DROP TABLE IF EXISTS sparsha_aws_cdl.clicks_silver;

CREATE TABLE sparsha_aws_cdl.clicks_silver
USING DELTA
LOCATION "s3://sparsha-aws-cdl/clicks-silver"

In [0]:
%sql
-- Peek at up to ten Silver rows.
SELECT *
FROM sparsha_aws_cdl.clicks_silver
LIMIT 10;

timestamp,clickTimestamp,txId,userSessionId,teamId,userId,adId,adCategory,approximateArrivalDate,approximateArrivalTimestamp,categoryCleaned,clickDate
2016-06-04 04:01:26,2022-06-19T16:26:54.503+0000,17524,16055,64,1567,14,fashion,2022-06-19,2022-06-19T16:26:54.605+0000,fashion,2022-06-19
2016-06-12 17:02:59,2022-06-19T16:26:54.527+0000,32072,28435,71,1451,6,movies,2022-06-19,2022-06-19T16:26:54.630+0000,movies,2022-06-19
2016-06-08 02:55:30,2022-06-19T16:26:54.665+0000,23755,23624,25,1333,8,hardware,2022-06-19,2022-06-19T16:26:54.768+0000,hardware,2022-06-19
2016-06-09 12:07:46,2022-06-19T16:26:54.690+0000,25906,20837,66,426,4,games,2022-06-19,2022-06-19T16:26:54.792+0000,games,2022-06-19
2016-06-01 15:01:05,2022-06-19T16:26:54.830+0000,13753,12392,22,76,23,fashion,2022-06-19,2022-06-19T16:26:54.933+0000,fashion,2022-06-19
2016-06-15 05:38:03,2022-06-19T16:26:54.859+0000,37492,36884,123,1634,29,automotive,2022-06-19,2022-06-19T16:26:54.962+0000,automotive,2022-06-19
2016-06-04 04:01:26,2022-06-19T16:26:54.457+0000,17524,16055,64,1567,14,fashion,2022-06-19,2022-06-19T16:26:54.591+0000,fashion,2022-06-19
2016-06-10 22:33:06,2022-06-19T16:26:54.521+0000,28866,28616,128,1111,17,games,2022-06-19,2022-06-19T16:26:54.624+0000,games,2022-06-19
2016-06-04 04:43:57,2022-06-19T16:26:54.533+0000,17591,17044,61,409,6,movies,2022-06-19,2022-06-19T16:26:54.636+0000,movies,2022-06-19
2016-06-16 07:14:52,2022-06-19T16:26:54.539+0000,39478,34163,2,624,21,movies,2022-06-19,2022-06-19T16:26:54.642+0000,movies,2022-06-19


In [0]:
%sql
-- Row count of the Silver table at the time the cell runs.
SELECT COUNT(*)
FROM sparsha_aws_cdl.clicks_silver;

count(1)
40325


Read point-in-time Clicks_Silver into a DataFrame

In [0]:
# Batch (non-streaming) read of the Silver Delta path as it stands when the cell runs.
clicks_silver = spark.read.load("s3://sparsha-aws-cdl/clicks-silver/", format="delta")

In [0]:
# Number of rows in the Silver snapshot loaded into clicks_silver.
clicks_silver.count()

In [0]:
# Collect five rows from the Silver snapshot to the driver and render them.
display(clicks_silver.take(5))

timestamp,clickTimestamp,txId,userSessionId,teamId,userId,adId,adCategory,approximateArrivalDate,approximateArrivalTimestamp,categoryCleaned,clickDate
2016-06-04 04:01:26,2022-06-19T16:26:54.503+0000,17524,16055,64,1567,14,fashion,2022-06-19,2022-06-19T16:26:54.605+0000,fashion,2022-06-19
2016-06-12 17:02:59,2022-06-19T16:26:54.527+0000,32072,28435,71,1451,6,movies,2022-06-19,2022-06-19T16:26:54.630+0000,movies,2022-06-19
2016-06-08 02:55:30,2022-06-19T16:26:54.665+0000,23755,23624,25,1333,8,hardware,2022-06-19,2022-06-19T16:26:54.768+0000,hardware,2022-06-19
2016-06-09 12:07:46,2022-06-19T16:26:54.690+0000,25906,20837,66,426,4,games,2022-06-19,2022-06-19T16:26:54.792+0000,games,2022-06-19
2016-06-01 15:01:05,2022-06-19T16:26:54.830+0000,13753,12392,22,76,23,fashion,2022-06-19,2022-06-19T16:26:54.933+0000,fashion,2022-06-19


## Implementing a Reliable Data Lake with Databricks Delta and the AWS Ecosystem

Time to run an AWS Glue Crawler and populate the uscitiesupdated_csv table

In [0]:
%sql
-- Gold table: a four-column Delta table stored at an explicit S3 location.
-- IF NOT EXISTS makes the cell safe to re-run.
CREATE TABLE IF NOT EXISTS clicks_with_profile_GOLD (
  timestamp      STRING,
  clickTimestamp TIMESTAMP,
  txId           STRING,
  adCategory     STRING
)
USING DELTA
LOCATION 's3://sparsha-aws-cdl/clicks_with_profile_GOLD';

In [0]:
%sql
-- Rebuild the Gold table from the current contents of clicks_silver.
-- INSERT OVERWRITE replaces the table contents, so re-running this cell
-- (e.g. Restart & Run All) does not append a second copy of every Silver row,
-- which a plain INSERT INTO of the full Silver table would do.
INSERT OVERWRITE clicks_with_profile_GOLD
SELECT
 timestamp,
 clickTimestamp,
 txId,
 adCategory
FROM clicks_silver;

num_affected_rows,num_inserted_rows
40325,40325


In [0]:
%sql
-- Row count of the Gold table.
SELECT count(*)
FROM clicks_with_profile_GOLD

count(1)
40325


In [0]:
%sql
-- Sample up to ten Gold rows whose ad category is exactly 'games'.
SELECT *
FROM clicks_with_profile_GOLD
WHERE adCategory = 'games'
LIMIT 10;

timestamp,clickTimestamp,txId,adCategory
2016-06-09 12:07:46,2022-06-19T16:26:54.690+0000,25906,games
2016-06-10 22:33:06,2022-06-19T16:26:54.521+0000,28866,games
2016-06-08 15:18:38,2022-06-19T16:26:54.684+0000,24557,games
2016-06-07 16:22:33,2022-06-19T16:26:55.042+0000,23117,games
2016-06-03 21:57:05,2022-06-19T16:26:55.527+0000,17148,games
2016-05-30 16:28:17,2022-06-19T16:26:55.795+0000,11186,games
2016-06-02 12:20:57,2022-06-19T16:26:56.019+0000,14841,games
2016-06-08 15:18:38,2022-06-19T16:26:56.750+0000,24557,games
2016-06-04 19:44:05,2022-06-19T16:26:56.923+0000,18413,games
2016-06-03 13:11:33,2022-06-19T16:26:57.229+0000,16684,games


## Amazon Athena to Delta Lake integration

Generate manifests of a Delta table using Databricks Runtime

In [0]:
%sql
-- Generate a symlink-format manifest for the Gold Delta table, addressed by its S3 path.
-- The external table created later in this notebook points at the resulting
-- _symlink_format_manifest/ directory.
GENERATE symlink_format_manifest FOR TABLE delta.`s3://sparsha-aws-cdl/clicks_with_profile_GOLD`

Configure Amazon Athena to read the generated manifests

In [0]:
%sql
-- Remove any previous definition of the Athena-facing table so it can be recreated.
DROP TABLE IF EXISTS sparsha_aws_cdl.clicks_with_profile_GOLD_athena;

In [0]:
%sql
-- External table over the symlink manifest generated for the Gold Delta table.
-- The column list must match the Delta table's schema, so clickTimestamp is declared
-- TIMESTAMP (as in clicks_with_profile_GOLD) rather than STRING.
-- The name is database-qualified so it refers to the same table as the preceding
-- DROP TABLE IF EXISTS, regardless of the session's current database.
CREATE EXTERNAL TABLE sparsha_aws_cdl.clicks_with_profile_GOLD_athena
(timestamp STRING,
 clickTimestamp TIMESTAMP,
 txId STRING,
 adCategory STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 's3://sparsha-aws-cdl/clicks_with_profile_GOLD/_symlink_format_manifest/'

In [0]:
%sql
-- Enable the Delta table property delta.compatibility.symlinkFormatManifest.enabled on the
-- Gold table so the symlink manifest is kept up to date on subsequent writes.
ALTER TABLE delta.`s3://sparsha-aws-cdl/clicks_with_profile_GOLD/` SET TBLPROPERTIES(delta.compatibility.symlinkFormatManifest.enabled=true)

## Push the Gold data to Redshift

In [0]:
# Redshift connection settings, all read from the "oetrta" secret scope.
username = dbutils.secrets.get(scope="oetrta", key="oetrta-redshift-username")
password = dbutils.secrets.get(scope="oetrta", key="redshift-password")
redshift_endpoint = dbutils.secrets.get(scope="oetrta", key="oetrta-redshift-endpoint")
tempdir = dbutils.secrets.get(scope="oetrta", key="redshift-temp-dir")
iam_role = dbutils.secrets.get(scope="oetrta", key="redshift-iam-role")
redshift_database = dbutils.secrets.get(scope="oetrta", key="redshift-database")

In [0]:
# Load the Gold Delta table from the location this notebook writes it to.
# The previous path pointed at a different bucket (aws-cdl-devdays); the CSV-only
# options header/inferSchema were dropped because Delta tables carry their own schema.
clicks_df = spark.read.format("delta").load("s3://sparsha-aws-cdl/clicks_with_profile_GOLD")

In [0]:
# Build the Redshift JDBC URL from the values fetched from the secret scope.
# The database segment uses redshift_database (previously fetched but unused)
# instead of a hard-coded "dev" — NOTE(review): confirm the secret holds the intended database.
jdbcUrl = "jdbc:redshift://{}/{}?user={}&password={}".format(
    redshift_endpoint, redshift_database, username, password
)
# The full URL embeds the user name and password, so only the credential-free part is printed.
print("jdbc:redshift://{}/{}".format(redshift_endpoint, redshift_database))

In [0]:
# Name of the Redshift table that the Gold data is written to and read back from.
table = "clicks_with_profile_GOLD_redshift"

In [0]:
# Write the Gold DataFrame to Redshift through the Databricks Redshift connector,
# passing the temp directory and IAM role fetched from the secret scope.
# No save mode is set, so Spark's default (fail if the target table exists) applies.
clicks_df.write \
  .format("com.databricks.spark.redshift") \
  .options(url=jdbcUrl, dbtable=table, tempdir=tempdir, aws_iam_role=iam_role) \
  .save()

In [0]:
# Read the table back from Redshift with the same connector settings to confirm the write.
read_df = (
    spark.read
    .format("com.databricks.spark.redshift")
    .options(url=jdbcUrl, dbtable=table, tempdir=tempdir, aws_iam_role=iam_role)
    .load()
)

display(read_df)