## AWS Cloud Data Lake Demo Overview

Components and steps we will follow in today's demo:
<br />
<br />
1. AWS Glue as our Central Metastore
2. We will launch one Kinesis stream, i.e., the **user click stream**
3. Join an already existing user Profile Delta table registered in our Glue metastore 
4. We will execute a crawler job to pull an S3 dataset into our AWS Glue metastore
5. The pipeline follows the Data Lake medallion approach
6. We will demonstrate the Full DML support of Delta Lake while curating the Data Lake.  
7. The curated GOLD dataset will be available to Athena and pushed to Redshift for later consumption
8. Finally, a QuickSight dashboard

<img src="https://drive.google.com/uc?id=1fZL3IhbB86bh5c6z54TvwytjEBTNa7Wo&authuser=recohut.data.001%40gmail.com&usp=drive_fs" height="800" width="1000"/>

## Clean Up and Ingest

In [0]:
%sql 
-- Create the demo database if it does not exist yet, with its storage location
-- set to the root of the sparsha-aws-cdl S3 bucket.
CREATE DATABASE IF NOT EXISTS sparsha_aws_cdl
LOCATION "s3://sparsha-aws-cdl/";

In [0]:
%sql
-- Show the metadata of the demo database to confirm it was created as expected.
DESCRIBE DATABASE sparsha_aws_cdl;

database_description_item,database_description_value
Namespace Name,sparsha_aws_cdl
Comment,
Location,s3://sparsha-aws-cdl/
Owner,root


In [0]:
%sql
-- Make the demo database the current one so unqualified table names resolve to it.
USE sparsha_aws_cdl;

In [0]:
%sql
-- List the tables currently registered in the current database.
SHOW TABLES;

database,tableName,isTemporary


In [0]:
%sql
-- Clean up: drop every table the demo uses so a re-run starts from scratch.
-- IF EXISTS keeps each statement from failing when the table is not there.
DROP TABLE IF EXISTS clicks_bronze;
DROP TABLE IF EXISTS clicks_silver;
DROP TABLE IF EXISTS clicks_with_profile;
DROP TABLE IF EXISTS clicks_with_profile_GOLD;
DROP TABLE IF EXISTS clicks_with_profile_GOLD_athena;

In [0]:
# List the entries directly under /databricks-datasets and render them as a table.
display(dbutils.fs.ls("/databricks-datasets"))

path,name,size,modificationTime
dbfs:/databricks-datasets/COVID/,COVID/,0,1655648398637
dbfs:/databricks-datasets/README.md,README.md,976,1532502324000
dbfs:/databricks-datasets/Rdatasets/,Rdatasets/,0,1655648398637
dbfs:/databricks-datasets/SPARK_README.md,SPARK_README.md,3359,1455505834000
dbfs:/databricks-datasets/adult/,adult/,0,1655648398637
dbfs:/databricks-datasets/airlines/,airlines/,0,1655648398637
dbfs:/databricks-datasets/amazon/,amazon/,0,1655648398637
dbfs:/databricks-datasets/asa/,asa/,0,1655648398637
dbfs:/databricks-datasets/atlas_higgs/,atlas_higgs/,0,1655648398637
dbfs:/databricks-datasets/bikeSharing/,bikeSharing/,0,1655648398637


In [0]:
%fs ls "s3://sparsha-aws-cdl"

In [0]:
# sparsha-aws-cdl S3 bucket working folders
# Delete each working folder (table folders and checkpoint folders, going by
# their names) so the demo starts from a clean bucket. The second argument
# (True) is passed to every rm call -- presumably the "recurse" flag; confirm
# against the dbutils.fs.rm documentation.
dbutils.fs.rm("s3://sparsha-aws-cdl/clicks-bronze", True)
dbutils.fs.rm("s3://sparsha-aws-cdl/clicks-silver", True)
dbutils.fs.rm("s3://sparsha-aws-cdl/clicks-with-profile", True)
dbutils.fs.rm("s3://sparsha-aws-cdl/clicks_with_profile_GOLD", True)
dbutils.fs.rm("s3://sparsha-aws-cdl/checkpoint-clicks-bronze", True)
dbutils.fs.rm("s3://sparsha-aws-cdl/checkpoint-clicks-silver", True)

In [0]:
import pandas as pd

# Download the ad-clicks sample CSV and convert every column to strings in one
# chained expression; the result is bound to `df` for the cells that follow.
df = pd.read_csv(
    "https://github.com/RecoHut-Datasets/ad-clicks/raw/main/ad-clicks.csv"
).astype('str')

# Last expression of the cell: preview the first rows.
df.head()

Unnamed: 0,timestamp,txId,userSessionId,teamId,userId,adId,adCategory
8641,2016-06-09 13:10:47,25973,20699,36,1953,0,sports
4371,2016-06-04 04:01:26,17524,16055,64,1567,14,fashion
12937,2016-06-13 15:17:22,33770,26385,2,1337,25,computers
14088,2016-06-14 13:33:43,36216,34479,57,2221,23,fashion
11539,2016-06-12 09:35:22,31432,26905,93,417,0,sports


In [0]:
# Convert the pandas DataFrame of click events into a Spark DataFrame.
# NOTE: `df` is rebound from the pandas frame to the Spark frame, which the
# following cells rely on; re-running this cell alone would therefore pass a
# Spark DataFrame back into createDataFrame.
df = spark.createDataFrame(df)

# DataFrame.show() prints the rows and returns None, so display(df.show(5))
# handed None to display(). Render a 5-row DataFrame instead.
display(df.limit(5))

In [0]:
# Register the DataFrame as the temporary view "clicks" so SQL cells and
# spark.sql(...) calls can query it by name.
df.createOrReplaceTempView("clicks")
# Print the column names and types of the DataFrame.
df.printSchema()

In [0]:
# Query the "clicks" temp view through Spark SQL and render the first 10 rows.
display(spark.sql("SELECT * FROM clicks LIMIT 10"))

timestamp,txId,userSessionId,teamId,userId,adId,adCategory
2016-06-09 13:10:47,25973,20699,36,1953,0,sports
2016-06-04 04:01:26,17524,16055,64,1567,14,fashion
2016-06-13 15:17:22,33770,26385,2,1337,25,computers
2016-06-14 13:33:43,36216,34479,57,2221,23,fashion
2016-06-12 09:35:22,31432,26905,93,417,0,sports
2016-05-28 16:50:52,8654,7541,11,1260,25,computers
2016-06-14 21:43:04,36906,34541,64,1157,21,movies
2016-06-10 16:40:49,28492,26681,61,1085,6,movies
2016-06-08 22:26:59,24980,20697,36,1001,12,computers
2016-06-08 06:07:48,23917,20977,90,1639,23,fashion


## Kinesis Stream Producer

<img src="https://drive.google.com/uc?id=1fZlcO7umZW5xtgcrkbGGSnKfX5IQeJ_E&authuser=recohut.data.001%40gmail.com&usp=drive_fs" height="600" width="800"/>

In [0]:
%scala
/** Returns the Kinesis service endpoint URL used by this notebook (us-east-1). */
def get_kinesis_endpoint(): String = "https://kinesis.us-east-1.amazonaws.com"

In [0]:
%scala
/**
 * Resolves the directory entry for `fullPath` and hands it to
 * `get_partition_helper`.
 *
 * @param fullPath path to a folder (the original note says it should be any
 *                 DBFS mounted location); one trailing "/" is tolerated
 * @return whatever `get_partition_helper` returns for the listing entry whose
 *         name is the last path segment followed by "/"
 */
def get_partition_info(fullPath: String): Seq[com.databricks.backend.daemon.dbutils.FileInfo] = {
  // Drop a single trailing "/" so the last path segment can be isolated.
  val trimmedPath = fullPath.stripSuffix("/")
  // Last folder name of the path.
  val leafFolder = trimmedPath.split('/').last
  // Everything before the last folder, i.e. the parent location to list.
  val parentPrefix = trimmedPath.dropRight(leafFolder.length)
  // Listings name directories with a trailing "/", so match on that form.
  val expectedName = leafFolder + "/"
  val matchingEntries = dbutils.fs.ls(parentPrefix).filter(entry => entry.name == expectedName)
  get_partition_helper(matchingEntries)
}

/**
 * Walks down the directory tree below `paths`, one level per call, until a
 * level is reached whose children include at least one entry with ".parquet"
 * in its name, and returns that level.
 *
 * Entries whose name starts with "_" are excluded from the returned level and
 * from the listed children.
 *
 * Changes from the earlier version:
 *  - every path is listed once per level instead of twice;
 *  - the recursion stops with an empty result when the children contain no
 *    ".parquet" entry and no directory to descend into. Previously that case
 *    (including an empty `paths`) recursed until the stack overflowed.
 *
 * @param paths entries of the level to inspect
 * @return the entries of the first level whose children contain a ".parquet"
 *         entry (without "_"-prefixed names), or an empty sequence when no
 *         such level exists
 */
def get_partition_helper(paths: Seq[com.databricks.backend.daemon.dbutils.FileInfo]): Seq[com.databricks.backend.daemon.dbutils.FileInfo] = {
  val debug = false
  val visiblePaths = paths.filter(x => !x.name.startsWith("_"))
  // Children of the current level, listed once and reused for both the
  // parquet check and the recursive descent.
  val children = paths.flatMap(y => dbutils.fs.ls(y.path).filter(x => !x.name.startsWith("_")))
  if (debug) {
    visiblePaths.foreach(x => println(x.path))
  }
  if (children.exists(x => x.name contains ".parquet")) {
    // The current level directly holds parquet files: this is the answer.
    visiblePaths
  } else if (children.exists(x => x.name.endsWith("/"))) {
    // No parquet yet, but there are sub-directories (listing names of
    // directories end with "/"), so look one level deeper.
    get_partition_helper(children)
  } else {
    // Nothing left to descend into and no parquet found anywhere.
    Seq.empty[com.databricks.backend.daemon.dbutils.FileInfo]
  }
}

In [0]:
%scala
import com.amazonaws.services.kinesis.model.PutRecordRequest
import com.amazonaws.services.kinesis.AmazonKinesisClient
import com.amazonaws.auth.{DefaultAWSCredentialsProviderChain, BasicAWSCredentials}
import java.nio.ByteBuffer
import scala.util.Random
import com.google.gson.Gson
import org.joda.time
import org.joda.time.format._
import java.sql.Timestamp;
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._

In [0]:
%scala
// === Configurations for Kinesis streams ===

// Name of the Kinesis stream the producer cell puts records onto.
val kinesisStreamName = "sparsha-aws-cdl-clicks"
// Endpoint URL handed to the Kinesis client.
val kinesisEndpointUrl = get_kinesis_endpoint() // e.g. "https://kinesis.us-east-1.amazonaws.com"

// === Configurations of amount of data to produce ===
// Number of records put in each round of the producer loop.
val recordsPerSecond = 10
// Only appears in the producer's log messages in the code visible here.
val wordsPerRecord = 10
// Number of rounds the producer loop runs.
val numSecondsToSend = 7200

In [0]:
%scala
// Create the low-level Kinesis Client from the AWS Java SDK.
// To authenticate with explicit keys instead of the IAM role, use:
// val kinesisClient = new AmazonKinesisClient(new BasicAWSCredentials(aws_kinesis_keys(0), aws_kinesis_keys(1)))
val kinesisClient = new AmazonKinesisClient() //using the IAM_Role

kinesisClient.setEndpoint(kinesisEndpointUrl)

println(s"Putting records onto stream $kinesisStreamName and endpoint $kinesisEndpointUrl at a rate of" +
  s" $recordsPerSecond records per second with $wordsPerRecord words per record for $numSecondsToSend seconds")

// Pool of source rows the generator samples from: the whole `clicks` view,
// collected to the driver.
val impressions = spark.sql("select * from clicks").collect()
// Fail early with a clear message; Random.nextInt(0) would otherwise throw an
// opaque IllegalArgumentException on the first generated record.
require(impressions.nonEmpty, "The `clicks` view is empty; there are no rows to generate records from")

// JSON payload of one click event. `timestamp` is the source row's first
// column; `clickTimestamp` is the time the record was generated.
case class Record(timestamp: String, clickTimestamp: String, txId: String, userSessionId: String, teamId: String, userId: String, adId: String, adCategory: String)

/**
 * Builds one JSON-encoded click record from a randomly chosen row of
 * `impressions`, stamped with a click time of "now minus 100 ms".
 *
 * Columns 0 to 6 of the row are read as strings, in order.
 *
 * @return the record serialized to a JSON string with Gson
 */
def GsonTest() : String = {
    val imp = impressions(Random.nextInt(impressions.length))
    val dt = new Timestamp(System.currentTimeMillis()-100)

    val r = Record(imp.getString(0), dt.toString, imp.getString(1), imp.getString(2), imp.getString(3), imp.getString(4), imp.getString(5), imp.getString(6))
    // Serialize the record; the last expression is the return value.
    new Gson().toJson(r)
}

// Generate and send the data: one round per second, `recordsPerSecond`
// records per round.
for (round <- 1 to numSecondsToSend) {
  for (recordNum <- 1 to recordsPerSecond) {
    val data = GsonTest()
    println(data)
    val partitionKey = s"partitionKey-$recordNum"
    // Encode explicitly as UTF-8 so the bytes on the stream do not depend on
    // the JVM's platform default charset.
    val putRecordRequest = new PutRecordRequest().withStreamName(kinesisStreamName)
        .withPartitionKey(partitionKey)
        .withData(ByteBuffer.wrap(data.getBytes(java.nio.charset.StandardCharsets.UTF_8)))
    kinesisClient.putRecord(putRecordRequest)
    println(putRecordRequest)
  }
  // Pause one full second between rounds so the pacing matches
  // `recordsPerSecond` / `numSecondsToSend` (this used to sleep only 100 ms).
  Thread.sleep(1000)
  println(s"Sent $recordsPerSecond records with $wordsPerRecord words each")
}

// Reaching this line means every putRecord call returned without throwing,
// so the total is rounds x records per round.
println(s"\nTotal number of records sent: ${numSecondsToSend.toLong * recordsPerSecond}")