<a href="https://colab.research.google.com/github/lucprosa/dataeng-basic-course/blob/main/spark_streaming/examples/example_4_using_dataproc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting up PySpark

In [None]:
# Install PySpark into the environment of the running notebook kernel.
%pip install pyspark



## Simulate producer:
- extract data from API
- store data as json in the lake
- run task async

In [None]:
import requests
from pyspark.sql.types import *
import json
import datetime
import asyncio

# Data-lake location the producer writes to. bucket_name and table_path are
# defined here (with the same values the streaming cell uses) so this cell
# does not raise NameError when it is executed before that cell.
bucket_name = "edit-data-eng-dev"
table_path = "vehicles"
landing_path = f"gs://{bucket_name}/datalake/landing/{table_path}"

async def ingest_from_api(url: str, table: str, schema: StructType = None):
  """Fetch a JSON payload from `url` and dump it to a timestamped landing file.

  Args:
    url: HTTP endpoint queried with a GET request.
    table: Logical table name of the payload. NOTE(review): not used yet --
      the file name is built from the module-level `landing_path` only, so
      every endpoint lands under the same prefix; confirm whether the table
      name should be part of the path.
    schema: Accepted for interface compatibility; not used by this function.

  Returns:
    None. Nothing is written when the response status is not 200.
  """
  # requests is synchronous: run it in the default executor so the event loop
  # is not blocked while waiting, and bound the wait with a timeout so a
  # stalled connection cannot hang the producer forever.
  event_loop = asyncio.get_running_loop()
  response = await event_loop.run_in_executor(
      None, lambda: requests.get(url, timeout=30))
  timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
  if response.status_code != 200:
    return
  data = response.json()
  # NOTE(review): built-in open() treats this string as a local file path; it
  # does not talk to Cloud Storage. Confirm how the file is expected to reach
  # the bucket (e.g. a GCS client library or a mounted path).
  with open(f"{landing_path}_{timestamp}.json", "w") as f:
    json.dump(data, f)

async def producer(loop: int, interval_time: int):
  """Poll the Carris Metropolitana API `loop` times, pausing between rounds.

  Each round ingests the vehicles endpoint, then the lines endpoint, and then
  sleeps for `interval_time` seconds (the sleep also follows the last round).
  """
  endpoints = (
      ("https://api.carrismetropolitana.pt/vehicles", "vehicles"),
      ("https://api.carrismetropolitana.pt/lines", "lines"),
  )
  for _ in range(loop):
    # Endpoints are fetched one after the other, in the order listed above.
    for endpoint_url, table_name in endpoints:
      await ingest_from_api(endpoint_url, table_name)
    await asyncio.sleep(interval_time)

async def main():
  asyncio.create_task(producer(10, 30))

await main()

- Read the landing zone of the data lake (`landing_path`) as a stream
- store data in memory (for testing)
- store data in the bronze layer

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Reuse the active Spark session if there is one; otherwise start a new one
# under the application name below.
spark = (
    SparkSession.builder
    .appName('Test streaming')
    .getOrCreate()
)
# SparkContext that belongs to the session.
sc = spark.sparkContext

def insert_vehicles(df, batch_id, path):
  """Append the rows of `df` to the parquet dataset stored at `path`.

  Args:
    df: DataFrame to write.
    batch_id: Batch identifier supplied by the caller; not used here.
    path: Destination directory of the parquet dataset.
  """
  batch_writer = df.write.format("parquet").mode("append")
  batch_writer.save(path)

# Schema of one vehicle record. Fields are added in the order they should
# appear in the schema, and every field is nullable.
vehicle_schema = (
    StructType()
    .add('bearing', IntegerType(), True)
    .add('block_id', StringType(), True)
    .add('current_status', StringType(), True)
    .add('id', StringType(), True)
    .add('lat', FloatType(), True)
    .add('line_id', StringType(), True)
    .add('lon', FloatType(), True)
    .add('pattern_id', StringType(), True)
    .add('route_id', StringType(), True)
    .add('schedule_relationship', StringType(), True)
    .add('shift_id', StringType(), True)
    .add('speed', FloatType(), True)
    .add('stop_id', StringType(), True)
    .add('timestamp', TimestampType(), True)
    .add('trip_id', StringType(), True)
)

# Data-lake locations for the vehicles table (gs:// URIs): the landing zone
# is what the stream reads from, the bronze zone is where batches are written.
bucket_name = "edit-data-eng-dev"
table_path = "vehicles"
landing_path = f"gs://{bucket_name}/datalake/landing/{table_path}"
bronze_path = f"gs://{bucket_name}/datalake/bronze/{table_path}"

# Stream the files the producer drops in the landing zone.
# - The producer dumps JSON (json.dump into *.json files), so the source
#   format is "json", not "parquet".
# - The schema defined in this cell is named vehicle_schema.
# - The producer names its files "<landing_path>_<timestamp>.json", i.e. next
#   to the landing directory rather than inside it; the trailing * matches
#   those files (and the directory itself, should files be placed there).
stream = (spark.readStream
          .format("json")
          .schema(vehicle_schema)
          .load(f"{landing_path}*"))

# Every 20 seconds, hand the newly arrived rows to insert_vehicles, which
# appends them to the bronze parquet dataset.
# NOTE(review): the checkpoint path has no gs:// scheme, so it resolves
# against the default filesystem rather than the bucket -- confirm intended.
query = (stream
          .writeStream
          .outputMode("append")
          # foreachBatch calls its function with (batch DataFrame, batch id);
          # the lambda supplies the destination path as the third argument.
          .foreachBatch(lambda batch_df, batch_id: insert_vehicles(batch_df, batch_id, bronze_path))
          .option("checkpointLocation", "/content/bronze/checkpoint")
          .trigger(processingTime='20 seconds')
          .start())

# Block this cell for at most 60 seconds. `query` stays bound to the
# StreamingQuery handle so it can still be inspected or stopped afterwards.
query.awaitTermination(60)

In [99]:
# Shell escape: recursively delete /content/content and everything under it.
# NOTE(review): looks like cleanup of a stray nested directory -- confirm the path before running.
!rm -rf /content/content

In [82]:
import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from faker import Faker

def insert_into_table(df, batch_id):
  """Add fake personal-data columns to `df` and append it to the events dataset.

  Each value is generated once per call and attached as a literal column, so
  all rows of `df` share the same name, address, email, dob and phone.
  `batch_id` is accepted but not used.
  """
  fake_gen = Faker()
  generators = (
      ('name', fake_gen.name),
      ('address', fake_gen.address),
      ('email', fake_gen.email),
      ('dob', fake_gen.date_of_birth),
      ('phone', fake_gen.phone_number),
  )
  # The tuple order fixes both the order of the Faker calls and the order in
  # which the columns are added.
  literal_columns = {column: F.lit(generate()) for column, generate in generators}
  enriched = df.withColumns(literal_columns)
  enriched.write.mode("append").format("parquet").save("content/output/events")

# read stream: the "rate" source generates one row per second
# (columns: timestamp, value)
df_stream = spark.readStream.format("rate").option("rowsPerSecond", 1).load()

# write stream: insert_into_table takes (df, batch_id), which is the
# foreachBatch contract; foreach would call it with a single Row instead.
query = (df_stream.writeStream
.outputMode('append')
.trigger(processingTime='1 seconds')
.foreachBatch(insert_into_table)
.start()
)



In [97]:
# Stop the streaming query. The parentheses matter: a bare `query.stop` only
# evaluates to the bound method and leaves the query running.
query.stop()

In [87]:
# Batch (non-streaming) read of everything appended to the events dataset so far.
df = (
    spark.read
    .format("parquet")
    .load("content/output/events")
)

# Print the first rows as a text table.
df.show()

+--------------------+-----+--------------------+--------------------+--------------------+----------+--------------------+
|           timestamp|value|                name|             address|               email|       dob|               phone|
+--------------------+-----+--------------------+--------------------+--------------------+----------+--------------------+
|2024-11-29 19:55:...| 1465|Mrs. Kimberly Stuart|19491 Makayla Loc...|zacharylawson@exa...|2000-03-11|001-981-340-6489x061|
|2024-11-29 19:56:...| 1486|   Jessica Carpenter|3118 Whitaker Sho...|smithhenry@exampl...|1913-06-16|  442.965.8651x17486|
|2024-11-29 19:55:...| 1419|        Donald Green|88028 Daniel Trai...|jasonmclaughlin@e...|1918-07-21|+1-707-706-2607x1...|
|2024-11-29 19:55:...| 1455| Christopher Ray Jr.|702 Ramirez Villa...|lucascourtney@exa...|1959-04-17|001-875-201-4285x717|
|2024-11-29 19:56:...| 1510|Mr. Xavier Jacobs MD|5873 Chan Plain S...|howelljohn@exampl...|1916-03-14|     +1-216-896-0881|
|2024-11

In [None]:
# Draft field list for the message events (notes, not executable code):
# - timestamp
# - id
# - message_type (OPEN, RECEIVED, SENT, CREATED)
# - message_id
# - user

In [94]:
# Stop the streaming query. The parentheses matter: a bare `query.stop` only
# evaluates to the bound method and leaves the query running.
query.stop()

In [None]:
import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from faker import Faker

def insert_into_table(df, batch_id):
  """Add fake message-event columns to `df` and append it to the messages dataset.

  Args:
    df: DataFrame to enrich and write.
    batch_id: Accepted but not used.

  The values are generated once per call, so every row of `df` gets the same
  event_type, event_id, country and user_id.
  """
  fake = Faker()
  # withColumns expects Column objects as values; F.lit turns each generated
  # Python value into a constant column.
  new_columns = {
      'event_type': F.lit(fake.random_element(elements=('OPEN', 'RECEIVED', 'SENT', 'CREATED', 'CLICKED'))),
      'event_id': F.lit(fake.uuid4()),
      'country': F.lit(fake.country()),
      'user_id': F.lit(fake.random_int(min=1000, max=1050)),
  }
  df = df.withColumns(new_columns)
  df.write.mode("append").format("parquet").save("content/output/messages")

# read stream: the "rate" source generates one row per second
df_stream = spark.readStream.format("rate").option("rowsPerSecond", 1).load()

# write stream: insert_into_table takes (df, batch_id), which is the
# foreachBatch contract; foreach would call it with a single Row instead.
query = (df_stream.writeStream
.outputMode('append')
.trigger(processingTime='1 seconds')
.foreachBatch(insert_into_table)
.start()
)

