<a href="https://colab.research.google.com/github/lucprosa/dataeng-basic-course/blob/main/spark_streaming/examples/example_4_using_dataproc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting up PySpark

In [None]:
%pip install pyspark



## Context
- Message events are coming from platform message broker (kafka, pubsub, kinesis...)
- You need to process the data according to the requirements

## Challenge 1 (Streaming)
Step 1:
- Change writeStream (cell "Producer") to partition data by "date" column
  - "date" column must be created from "timestamp"
- Change parquet location to "/content/lake/bronze/messages/data"
- Add checkpoint (/content/lake/bronze/messages/checkpoint)
- Delete /content/lake/bronze/messages and reprocess data
  - For reprocessing, run the streaming for at least 1 minute

Step 2:
- Implement new stream job to read from messages (parquet)
- Identify corrupted data and write into another location as PARQUET
  - logic: event_status is null, empty or equal to "NONE"
  - location: /content/lake/bronze/messages_corrupted/data
  - checkpoint: /content/lake/bronze/messages_corrupted/checkpoint
  - use StructSchema
  - Set trigger interval to 10 seconds
- For reprocessing, run the streaming for at least 60 seconds

------------------

## Challenge 2 (Streaming)
- Business reporting
- Aggregate events by event_status & date

### Technical requirements
- Implement writeStreaming job to write output as PARQUET
  - location: /content/lake/gold/events_daily
  - Partition data by date
  - Write into gold layer

-------------------

## Challenge 3 (Reporting / Batching)
- Implement reporting to identify anomalies



In [None]:
timestamp
id
message_type (OPEN, RECEIVED, SENT, CREATED)
message_id
user

In [1]:
%pip install faker

Collecting faker
  Downloading Faker-33.1.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-33.1.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-33.1.0


In [None]:
!ls content/output/messages/date=2024-11-30 | wc

    151     151   10268


In [None]:
!rm -rf content/output/

In [None]:
50 users
100 messages

# Producer

In [2]:
import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from faker import Faker
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('Test streaming').getOrCreate()
sc = spark.sparkContext

def enrich_data(df):
  fake = Faker()
  messages = [fake.uuid4() for _ in range(99)]
  new_columns = {
      'event_type': F.lit(fake.random_element(elements=('OPEN', 'RECEIVED', 'SENT', 'CREATED', 'CLICKED', '', 'NONE'))),
      'message_id': F.lit(fake.random_element(elements=messages)),
      'channel': F.lit(fake.random_element(elements=('CHAT', 'EMAIL', 'SMS', 'PUSH', 'OTHER'))),
      'country_id': F.lit(fake.random_int(min=2000, max=2015)),
      'user_id': F.lit(fake.random_int(min=1000, max=1050)),
  }
  df = df.withColumns(new_columns)
  return df

def insert_messages(df: DataFrame, batch_id):
  enrich = enrich_data(df)
  enrich.write.mode("append").partitionBy("date").format("parquet").save("content/lake/bronze/messages")

# read stream
df_stream = spark.readStream.format("rate").option("rowsPerSecond", 1).load()

df_transformed = df_stream.withColumn("date", F.to_date(F.col("timestamp")))

# write stream
query = (df_transformed.writeStream
.outputMode('append')
.trigger(processingTime='1 seconds')
.foreachBatch(insert_messages)
.start()
)

query.awaitTermination(60)


False

In [4]:
query.stop()

ERROR:py4j.clientserver:There was an exception while executing the Python Proxy on the Python Side.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 617, in _call_proxy
    return_value = getattr(self.pool[obj_id], method)(*params)
  File "/usr/local/lib/python3.10/dist-packages/pyspark/sql/utils.py", line 120, in call
    raise e
  File "/usr/local/lib/python3.10/dist-packages/pyspark/sql/utils.py", line 117, in call
    self.func(DataFrame(jdf, wrapped_session_jdf), batch_id)
  File "<ipython-input-2-7e88871c2524>", line 24, in insert_messages
    enrich.write.mode("append").partitionBy("date").format("parquet").save("content/lake/bronze/messages")
  File "/usr/local/lib/python3.10/dist-packages/pyspark/sql/readwriter.py", line 1463, in save
    self._jwrite.save(path)
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1322, in __call__
    return_value = get_return_value(
  File "/usr/local/lib/py

In [5]:
df = spark.read.format("parquet").load("content/lake/bronze/messages/*")
df.show()

+--------------------+-----+----------+--------------------+-------+----------+-------+
|           timestamp|value|event_type|          message_id|channel|country_id|user_id|
+--------------------+-----+----------+--------------------+-------+----------+-------+
|2024-12-02 23:06:...|    0|          |fb3079f9-336d-439...|  OTHER|      2001|   1044|
|2024-12-02 23:06:...|    2|          |fb3079f9-336d-439...|  OTHER|      2001|   1044|
|2024-12-02 23:06:...|    4|          |fb3079f9-336d-439...|  OTHER|      2001|   1044|
|2024-12-02 23:06:...|    1|          |fb3079f9-336d-439...|  OTHER|      2001|   1044|
|2024-12-02 23:06:...|    3|          |fb3079f9-336d-439...|  OTHER|      2001|   1044|
|2024-12-02 23:06:...|    5|          |24436b0e-d45c-4ef...|    SMS|      2006|   1050|
|2024-12-02 23:06:...|    7|          |24436b0e-d45c-4ef...|    SMS|      2006|   1050|
|2024-12-02 23:08:...|  105|  RECEIVED|1328c8b4-1232-4f8...|  OTHER|      2000|   1042|
|2024-12-02 23:09:...|  140|  RE

# Streaming Messages Corrupted

In [8]:
from pyspark.sql.types import *

def insert_messages_corrupted(df: DataFrame, batch_id):
  df.write.mode("append").partitionBy("date").format("parquet").save("content/lake/silver/messages_corrupted")

schema = StructType([StructField('timestamp', TimestampType(), True), StructField('value', LongType(), True), StructField('event_type', StringType(), True), StructField('message_id', StringType(), True), StructField('channel', StringType(), True), StructField('country_id', IntegerType(), True), StructField('user_id', IntegerType(), True), StructField('date', DateType(), True)])
# read stream
df_stream = spark.readStream.format("parquet").schema(schema).load("content/lake/bronze/messages/*")

df_corrupted = df_stream.filter(F.col('event_type').isin('NONE', '') | F.col('event_type').isNull())

# write stream
query = (df_corrupted.writeStream
.outputMode('append')
.trigger(processingTime='5 seconds')
.foreachBatch(insert_messages_corrupted)
.start()
)

query.awaitTermination(20)

False

In [9]:
query.stop()

In [12]:
df = spark.read.format("parquet").load("content/lake/silver/messages_corrupted")
df.show()

+--------------------+-----+----------+--------------------+-------+----------+-------+----+
|           timestamp|value|event_type|          message_id|channel|country_id|user_id|date|
+--------------------+-----+----------+--------------------+-------+----------+-------+----+
|2024-12-02 23:07:...|   24|          |0b009667-bcf3-427...|  EMAIL|      2009|   1037|NULL|
|2024-12-02 23:07:...|   36|          |e3a66a94-2ff1-4cc...|  OTHER|      2006|   1013|NULL|
|2024-12-02 23:06:...|   10|          |67ca029c-ae87-43f...|  OTHER|      2011|   1033|NULL|
|2024-12-02 23:07:...|   41|          |fc6a6c21-25a4-4b4...|  EMAIL|      2014|   1041|NULL|
|2024-12-02 23:08:...|  130|          |cca8ab48-df16-45b...|  OTHER|      2001|   1000|NULL|
|2024-12-02 23:06:...|   13|          |7e16a974-29ee-404...|   CHAT|      2013|   1049|NULL|
|2024-12-02 23:07:...|   28|          |391c327c-0303-430...|   CHAT|      2015|   1020|NULL|
|2024-12-02 23:07:...|   49|          |d180b858-19fa-44a...|   PUSH|  