<a href="https://colab.research.google.com/github/lucprosa/dataeng-basic-course/blob/main/spark_streaming/examples/example_4_using_dataproc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting up PySpark

In [None]:
%pip install pyspark



## Simulate producer:
- extract data from API
- store data as json in the lake
- run task async

In [None]:
import requests
from pyspark.sql.types import *
import json
import datetime
import asyncio

landing_path=f"gs://{bucket_name}/datalake/landing/{table_path}"

async def ingest_from_api(url: str, table: str, schema: StructType = None):
  response = requests.get(url)
  timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
  if response.status_code == 200:
    data = response.json()
    with open(f"{landing_path}_{int(timestamp)}.json", "w") as f:
        json.dump(data, f)

async def producer(loop: int, interval_time: int):
  for i in range(loop):
    await ingest_from_api("https://api.carrismetropolitana.pt/vehicles", "vehicles")
    await ingest_from_api("https://api.carrismetropolitana.pt/lines", "lines")
    await asyncio.sleep(interval_time)

async def main():
  asyncio.create_task(producer(10, 30))

await main()

- Read from /content/landing as streaming
- store data in memory (for testing)
- store data in the bronze layer

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

spark = SparkSession.builder.appName('Test streaming').getOrCreate()
sc = spark.sparkContext

def insert_vehicles(df, batch_id, path):
  df.write.format("parquet").mode("append").save(path)

vehicle_schema = StructType([StructField('bearing', IntegerType(), True),
                             StructField('block_id', StringType(), True),
                             StructField('current_status', StringType(), True),
                             StructField('id', StringType(), True),
                             StructField('lat', FloatType(), True),
                             StructField('line_id', StringType(), True),
                             StructField('lon', FloatType(), True),
                             StructField('pattern_id', StringType(), True),
                             StructField('route_id', StringType(), True),
                             StructField('schedule_relationship', StringType(), True),
                             StructField('shift_id', StringType(), True),
                             StructField('speed', FloatType(), True),
                             StructField('stop_id', StringType(), True),
                             StructField('timestamp', TimestampType(), True),
                             StructField('trip_id', StringType(), True)])

# define paths
bucket_name="edit-data-eng-dev"
table_path="vehicles"
landing_path=f"gs://{bucket_name}/datalake/landing/{table_path}"
bronze_path=f"gs://{bucket_name}/datalake/bronze/{table_path}"

stream = spark.readStream.format("parquet").schema(schema).load(landing_path)

query = (stream
          .writeStream
          .outputMode("append")
          .foreachBatch(insert_vehicles(bronze_path))
          .option("checkpointLocation", "/content/bronze/checkpoint")
          .trigger(processingTime='20 seconds')
          .start()
          .awaitTermination(60)

In [1]:
!rm -rf /content/content

In [2]:
import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from faker import Faker

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('Test streaming').getOrCreate()
sc = spark.sparkContext

def insert_into_table(df, batch_id):
  fake = Faker()
  new_columns = {
      'name': F.lit(fake.name()),
      'address': F.lit(fake.address()),
      'email': F.lit(fake.email()),
      'dob': F.lit(fake.date_of_birth()),
      'phone': F.lit(fake.phone_number())
  }
  df = df.withColumns(new_columns)
  df.write.mode("append").format("parquet").save("content/output/events")

# read stream
df_stream = spark.readStream.format("rate").option("rowsPerSecond", 1).load()

# write stream
query = (df_stream.writeStream
.outputMode('append')
.trigger(processingTime='1 seconds')
.foreachBatch(insert_into_table)
.start()
.awaitTermination(20)
)



ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
RuntimeError: reentrant call inside <_io.BufferedReader name=41>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/sock

Py4JError: An error occurred while calling o35.awaitTermination

In [12]:
query

False

In [9]:
df = spark.read.format("parquet").load("content/output/events")

df.show()

+--------------------+-----+-----------------+--------------------+--------------------+----------+--------------------+
|           timestamp|value|             name|             address|               email|       dob|               phone|
+--------------------+-----+-----------------+--------------------+--------------------+----------+--------------------+
|2024-11-29 20:16:...|   29|     Tanya Thomas|66189 Gonzalez Pa...|edwardsjenna@exam...|1989-11-19| (822)723-3944x10441|
|2024-11-29 20:16:...|    0|       Gary Payne|151 Antonio Summi...|kathywalls@exampl...|2023-07-14|  781.561.1601x13391|
|2024-11-29 20:16:...|    2|       Gary Payne|151 Antonio Summi...|kathywalls@exampl...|2023-07-14|  781.561.1601x13391|
|2024-11-29 20:16:...|   16|     Ann Cummings|53919 William Cou...|michellepatrick@e...|1950-11-21|     +1-334-420-3370|
|2024-11-29 20:16:...|    4|   Michelle Myers|1526 Carter Traff...|sanderson@example...|1926-11-24|  (774)770-8398x0088|
|2024-11-29 20:16:...|   20|Jess

In [None]:
timestamp
id
message_type (OPEN, RECEIVED, SENT, CREATED)
message_id
user

In [None]:
query.stop

In [10]:
import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from faker import Faker
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('Test streaming').getOrCreate()
sc = spark.sparkContext

def insert_into_table(df, batch_id):
  fake = Faker()
  new_columns = {
      'event_type': F.lit(fake.random_element(elements=('OPEN', 'RECEIVED', 'SENT', 'CREATED', 'CLICKED'))),
      'event_id': F.lit(fake.uuid4()),
      'country': F.lit(fake.country()),
      'user_id': F.lit(fake.random_int(min=1000, max=1050)),
  }
  df = df.withColumns(new_columns)
  df.write.mode("append").format("parquet").save("content/output/messages")

# read stream
df_stream = spark.readStream.format("rate").option("rowsPerSecond", 1).load()

# write stream
query = (df_stream.writeStream
.outputMode('append')
.trigger(processingTime='1 seconds')
.foreachBatch(insert_into_table)
.start()
)



In [21]:
query.stop()

In [20]:
df = spark.read.format("parquet").load("content/output/messages")
df.show()

+--------------------+-----+----------+--------------------+--------------------+-------+
|           timestamp|value|event_type|            event_id|             country|user_id|
+--------------------+-----+----------+--------------------+--------------------+-------+
|2024-11-29 20:22:...|   53|  RECEIVED|668615df-35aa-404...|United States Vir...|   1022|
|2024-11-29 20:22:...|   29|  RECEIVED|1cee98fd-7f3e-410...|French Southern T...|   1010|
|2024-11-29 20:22:...|   36|  RECEIVED|885b1c38-e17c-471...|Northern Mariana ...|   1008|
|2024-11-29 20:22:...|   43|      OPEN|66e23e9f-ccc5-407...|French Southern T...|   1035|
|2024-11-29 20:22:...|   39|  RECEIVED|7fa12a91-3a66-488...|Libyan Arab Jamah...|   1010|
|2024-11-29 20:22:...|   32|   CREATED|4007c051-5ebb-489...|British Virgin Is...|   1040|
|2024-11-29 20:22:...|   22|   CLICKED|67fb1601-b897-439...|Libyan Arab Jamah...|   1004|
|2024-11-29 20:22:...|   28|   CLICKED|8946a4e6-36a1-475...|Saint Kitts and N...|   1010|
|2024-11-2