<a href="https://colab.research.google.com/github/lucprosa/dataeng-basic-course/blob/main/spark_streaming/examples/example_3_api_json.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting up PySpark

In [1]:
%pip install pyspark



In [4]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local').appName('Test streaming').getOrCreate()
sc = spark.sparkContext

In [13]:
import datetime
datetime.datetime.now().strftime("%Y%m%d%H%M%S")

'20241128002111'

In [72]:
!rm -rf /content/landing
!mkdir -p /content/landing

In [74]:
import requests
from pyspark.sql.types import *
import json
import datetime
import asyncio

async def ingest_from_api(url: str, table: str, schema: StructType = None):
  response = requests.get(url)
  timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
  if response.status_code == 200:
    data = response.json()
    with open(f"/content/landing/{table}_{int(timestamp)}.json", "w") as f:
        json.dump(data, f)

async def producer(loop: int, interval_time: int):
  for i in range(loop):
    await ingest_from_api("https://api.carrismetropolitana.pt/vehicles", "vehicles")
    await ingest_from_api("https://api.carrismetropolitana.pt/lines", "lines")
    await asyncio.sleep(interval_time)

async def main():
  asyncio.create_task(producer(10, 30))

await main()

In [75]:
from pyspark.sql.types import *

vehicle_schema = StructType([StructField('bearing', IntegerType(), True),
                             StructField('block_id', StringType(), True),
                             StructField('current_status', StringType(), True),
                             StructField('id', StringType(), True),
                             StructField('lat', FloatType(), True),
                             StructField('line_id', StringType(), True),
                             StructField('lon', FloatType(), True),
                             StructField('pattern_id', StringType(), True),
                             StructField('route_id', StringType(), True),
                             StructField('schedule_relationship', StringType(), True),
                             StructField('shift_id', StringType(), True),
                             StructField('speed', FloatType(), True),
                             StructField('stop_id', StringType(), True),
                             StructField('timestamp', TimestampType(), True),
                             StructField('trip_id', StringType(), True)])

stream = spark.readStream.format("json").schema(vehicle_schema).load("/content/landing/vehicles*")

dedup = stream.dropDuplicates()

query = (dedup.writeStream.format("memory").option("queryName", "vehicles").start())

IllegalArgumentException: Cannot start query with name vehicles as a query with that name is already active in this SparkSession

In [71]:
query.stop()

In [85]:
spark.sql("select count(1) from vehicles").show()

+--------+
|count(1)|
+--------+
|     181|
+--------+



In [84]:
spark.read.format("json").schema(vehicle_schema).load("/content/landing/vehicles*").count()

184

In [70]:
spark.sql("select * from vehicles").show()

+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+
|bearing|            block_id|current_status|      id|      lat|line_id|      lon|pattern_id|route_id|schedule_relationship|    shift_id|    speed|stop_id|          timestamp|             trip_id|
+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+
|    212|             1080-11| IN_TRANSIT_TO|  42|263| 38.90714|   2303|-9.051232|  2303_0_1|  2303_0|            SCHEDULED|        1206|4.1666665| 180131|2024-11-28 00:46:44|2303_0_1|1|1|2430...|
|    170|       ESC_DU_EU2058|    STOPPED_AT| 43|2121|38.695377|   3708|-9.178453|  3708_0_2|  3708_0|            SCHEDULED|      EU2207| 16.11111| 060019|2024-11-28 00:47:57|3708_0_2_2430_245...|
|    106|      

In [None]:
# question
# aggregate by 5 minute window time
# count vehicles by status (INCOMING, STOPPED, INTRANSIT)

In [None]:
# question 2
# enrich latitude and longitude
# join dataframes