<a href="https://colab.research.google.com/github/lucprosa/dataeng-basic-course/blob/main/spark_streaming/examples/example_4_using_dataproc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting up PySpark

In [None]:
%pip install pyspark



## Context
- Message events are coming from platform message broker (kafka, pubsub, kinesis...)
- You need to process the data according to the requirements

## Challenge 1 (Streaming)
Step 1:
- Change writeStream to partition data by date column
- Change location to /content/lake/bronze/messages/data
- Add checkpoint (/content/lake/bronze/messages/checkpoint)
- Delete /content/lake/bronze/messages and reprocess data

Step 2:
- Implement new stream job to read from messages
- Identify corrupted data and write into another location as PARQUET
  - logic: event_status is null, empty or equal to "NONE"
  - location: /content/lake/bronze/messages_corrupted

------------------

## Challenge 2 (Streaming)
- Business reporting
- Aggregate events by event_status & date

### Technical requirements
- Implement writeStreaming job to write output as PARQUET
  - location: /content/lake/gold/events_daily
  - Partition data by date
  - Write into gold layer

-------------------

## Challenge 3 (Reporting / Batching)
- Implement reporting to identify anomalies



In [None]:
timestamp
id
message_type (OPEN, RECEIVED, SENT, CREATED)
message_id
user

In [1]:
%pip install faker

Collecting faker
  Downloading Faker-33.1.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-33.1.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-33.1.0


In [57]:
!ls content/output/messages/date=2024-11-30 | wc

    151     151   10268


In [95]:
!rm -rf content/output/

In [None]:
50 users
100 messages

# Producer

In [77]:
import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from faker import Faker
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('Test streaming').getOrCreate()
sc = spark.sparkContext

def enrich_data(df):
  fake = Faker()
  messages = [fake.uuid4() for _ in range(99)]
  new_columns = {
      'event_type': F.lit(fake.random_element(elements=('OPEN', 'RECEIVED', 'SENT', 'CREATED', 'CLICKED', '', 'NONE'))),
      'message_id': F.lit(fake.random_element(elements=messages)),
      'channel': F.lit(fake.random_element(elements=('CHAT', 'EMAIL', 'SMS', 'PUSH', 'OTHER'))),
      'country_id': F.lit(fake.random_int(min=2000, max=2015)),
      'user_id': F.lit(fake.random_int(min=1000, max=1050)),
  }
  df = df.withColumns(new_columns)
  return df

def insert_messages(df: DataFrame, batch_id):
  enrich = enrich_data(df)
  enrich.write.mode("append").partitionBy("date").format("parquet").save("content/lake/bronze/messages")

# read stream
df_stream = spark.readStream.format("rate").option("rowsPerSecond", 1).load()

df_transformed = df_stream.withColumn("date", F.to_date(F.col("timestamp")))

# write stream
query = (df_transformed.writeStream
.outputMode('append')
.trigger(processingTime='1 seconds')
.foreachBatch(insert_messages)
.start()
)

query.awaitTermination(60)


False

In [78]:
query.stop()

In [87]:
df = spark.read.format("parquet").load("content/lake/bronze/messages/*")
df.where("value = 59").show()

+--------------------+-----+----------+--------------------+-------+----------+-------+
|           timestamp|value|event_type|          message_id|channel|country_id|user_id|
+--------------------+-----+----------+--------------------+-------+----------+-------+
|2024-11-30 01:12:...|   59|      NONE|55a4fa9e-b3f0-43e...|  EMAIL|      2008|   1012|
+--------------------+-----+----------+--------------------+-------+----------+-------+



# Streaming Messages Corrupted

In [89]:
from pyspark.sql.types import *

def insert_messages_corrupted(df: DataFrame, batch_id):
  df.write.mode("append").partitionBy("date").format("parquet").save("content/lake/silver/messages_corrupted")

schema = StructType([StructField('timestamp', TimestampType(), True), StructField('value', LongType(), True), StructField('event_type', StringType(), True), StructField('message_id', StringType(), True), StructField('channel', StringType(), True), StructField('country_id', IntegerType(), True), StructField('user_id', IntegerType(), True), StructField('date', DateType(), True)])
# read stream
df_stream = spark.readStream.format("parquet").schema(schema).load("content/output/messages/*")

df_corrupted = df_stream.filter(F.col('event_type').isin('NONE', '') | F.col('event_type').isNull())

# write stream
query = (df_corrupted.writeStream
.outputMode('append')
.trigger(processingTime='5 seconds')
.foreachBatch(insert_messages_corrupted)
.start()
)

query.awaitTermination(20)

In [90]:
query.stop()

In [94]:
df = spark.read.format("parquet").load("content/output/messages_corrupted")
df.show()

+--------------------+-----+----------+--------------------+-------+----------+-------+----+
|           timestamp|value|event_type|          message_id|channel|country_id|user_id|date|
+--------------------+-----+----------+--------------------+-------+----------+-------+----+
|2024-11-30 01:12:...|   59|      NONE|55a4fa9e-b3f0-43e...|  EMAIL|      2008|   1012|NULL|
|2024-11-30 01:11:...|   10|      NONE|8a0b611a-ab58-4e5...|  EMAIL|      2010|   1025|NULL|
|2024-11-30 01:11:...|   12|      NONE|915e337d-a631-442...|  OTHER|      2004|   1024|NULL|
|2024-11-30 01:12:...|   37|      NONE|a3437da1-d163-47c...|  OTHER|      2010|   1048|NULL|
|2024-11-30 01:11:...|    1|      NONE|0bb83df9-fc91-4e4...|   PUSH|      2004|   1037|NULL|
|2024-11-30 01:11:...|    7|      NONE|424d4cc4-992f-451...|   PUSH|      2011|   1009|NULL|
|2024-11-30 01:11:...|   26|      NONE|1addf7a6-cc13-4b1...|   PUSH|      2008|   1004|NULL|
|2024-11-30 01:11:...|   31|      NONE|88adfa8c-2060-458...|   PUSH|  