<a href="https://colab.research.google.com/github/sachins301/UTA-Distributed-Computing/blob/main/UTA_Spark_Processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Installing Spark and dependencies

In [2]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz
!tar xf spark-3.5.2-bin-hadoop3.tgz
!pip install -q findspark

In [3]:
import os
# Export the Java and Spark install locations through JAVA_HOME / SPARK_HOME.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.5.2-bin-hadoop3",
})

In [4]:
import findspark
# Called with no arguments, so findspark has to locate the Spark install itself
# (presumably from the SPARK_HOME variable set in the previous cell; confirm).
# The pyspark imports in the next cell come after this call.
findspark.init()


In [5]:
import json
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, ArrayType, BooleanType
from pyspark.sql.functions import col, explode, input_file_name, regexp_extract


In [None]:
# spark = SparkSession.builder.master("local[*]").getOrCreate()
# spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # Property used to format output tables better
# spark

In [6]:
# Extract the JSON dump archive into its own directory
import shutil
shutil.unpack_archive(
    filename='/content/json_dumps.zip',
    extract_dir='/content/json_dumps/',
)

In [7]:
import functools
import time


def timing_decorator(func):
    """Wrap ``func`` so that every call prints how long it took.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func``, prints the elapsed time in seconds, and returns ``func``'s
        result unchanged. If ``func`` raises, the exception propagates and
        nothing is printed.
    """
    @functools.wraps(func)  # keep func's __name__ / __doc__ on the wrapper
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measurement is not affected by
        # system clock adjustments that happen while func is running.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed_time = time.perf_counter() - start_time
        print(f"Function '{func.__name__}' took {elapsed_time:.4f} seconds to execute.")
        return result
    return wrapper

In [None]:
# Decorator disabled together with the function below: a decorator that is not
# followed by a definition is a SyntaxError.
# @timing_decorator
# def read_json():
#   df = spark.read.json("/content/json_dumps/*.json")
#   return df

# df = read_json()
# df.show(5)

Function 'read_json' took 144.7838 seconds to execute.
+--------------------+
|                Siri|
+--------------------+
|{1.3, http://www....|
|{1.3, http://www....|
|{1.3, http://www....|
|{1.3, http://www....|
|{1.3, http://www....|
+--------------------+
only showing top 5 rows



In [None]:
# Decorator disabled together with the function below: a decorator that is not
# followed by a definition is a SyntaxError.
# @timing_decorator
# def read_json():
#   df = spark.read.json("/content/json_dumps/*.json").repartition(100)
#   return df

# df = read_json()
# df.show(5)


Function 'read_json' took 144.8710 seconds to execute.
+--------------------+
|                Siri|
+--------------------+
|{1.3, http://www....|
|{1.3, http://www....|
|{1.3, http://www....|
|{1.3, http://www....|
|{1.3, http://www....|
+--------------------+
only showing top 5 rows



In [None]:
# Decorator disabled together with the function below: a decorator that is not
# followed by a definition is a SyntaxError.
# @timing_decorator
# def read_json():
#   df = spark.read.option("wholeFile", True).json("/content/json_dumps/*.json").repartition(100)
#   return df

# df = read_json()
# df.show(5)

Function 'read_json' took 138.6985 seconds to execute.
+--------------------+
|                Siri|
+--------------------+
|{1.3, http://www....|
|{1.3, http://www....|
|{1.3, http://www....|
|{1.3, http://www....|
|{1.3, http://www....|
+--------------------+
only showing top 5 rows



In [None]:
# NOTE(review): `spark` is only created by the SparkSession builder cell further
# down this notebook; run that cell first, otherwise this cell raises NameError.
@timing_decorator
def read_json():
  """Read every JSON dump, repartition to 100, then coalesce to 10 partitions.

  Uses the notebook-level ``spark`` session.

  Returns:
      The resulting DataFrame.
  """
  # "multiLine" is the JSON reader option for parsing each file as a single
  # document. The earlier "wholeFile" key was renamed to it before Spark 2.2 was
  # released, so it is not a recognised option in the Spark 3.5.2 build this
  # notebook installs and was silently ignored.
  df = spark.read \
            .option("multiLine", True) \
            .json("/content/json_dumps/*.json").repartition(100) \
            .coalesce(10)
  return df

df = read_json()
df.show(5)

In [11]:
# spark.stop()
# Build (or reuse) a session on the local master with explicit memory and
# shuffle-partition settings; the bare `spark` at the end displays it.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.executor.memory", "8g")
    .config("spark.driver.memory", "4g")
    .config("spark.sql.shuffle.partitions", "4")
    .getOrCreate()
)
spark

In [12]:
# Explicit schema for the nested JSON structure (top-level key "Siri").
# Every leaf field is a nullable string.
def _build_schema():
    """Assemble the nested StructType used to read the JSON dumps."""

    def strings(*names):
        # One nullable string field per name, in the order given.
        return [StructField(name, StringType(), True) for name in names]

    def struct(name, fields):
        # A nullable struct-valued field wrapping the given child fields.
        return StructField(name, StructType(fields), True)

    # Fields of one element of the MonitoredVehicleJourney array.
    journey = (
        strings("LineRef", "DirectionRef")
        + [struct("FramedVehicleJourneyRef", strings("DataFrameRef", "DatedVehicleJourneyRef"))]
        + strings("PublishedLineName", "OriginRef", "DestinationRef", "Monitored")
        + [struct("VehicleLocation", strings("Longitude", "Latitude"))]
        + strings("ProgressRate", "CourseOfJourneyRef", "VehicleRef")
        + [
            struct("MonitoredCall", strings("StopPointRef", "VisitNumber", "VehicleAtStop")),
            struct("Extensions", strings("LastGPSFix", "Scheduled", "Bearing", "Speed", "DestinationName")),
        ]
    )

    activity = strings("RecordedAtTime") + [
        StructField("MonitoredVehicleJourney", ArrayType(StructType(journey)), True)
    ]

    delivery = strings("@version", "ResponseTimestamp", "ValidUntil") + [
        struct("VehicleActivity", activity)
    ]

    siri = strings("@version", "@xmlns", "ResponseTimestamp") + [
        struct("VehicleMonitoringDelivery", delivery)
    ]

    return StructType([struct("Siri", siri)])


schema = _build_schema()

In [13]:
@timing_decorator
def read_json():
    """Read the JSON dumps with the explicit ``schema`` and tag rows by file.

    Uses the notebook-level ``spark`` session and ``schema``.

    Returns:
        A DataFrame with the parsed ``Siri`` struct plus a ``file_id`` column
        holding ``input_file_name()``, repartitioned into 100 partitions.
    """
    # "multiLine" is the JSON reader option for parsing each file as a single
    # document. The earlier "wholeFile" key was renamed to it before Spark 2.2
    # was released, so it is not a recognised option in the Spark 3.5.2 build
    # this notebook installs and was silently ignored.
    df = spark.read.option("multiLine", True) \
        .schema(schema) \
        .json("/content/json_dumps/*.json") \
        .withColumn("file_id", input_file_name()) \
        .repartition(100)
    return df

# persist() marks the DataFrame for caching so later cells can reuse it.
df = read_json().persist()
df.show(5)

Function 'read_json' took 89.3516 seconds to execute.
+--------------------+--------------------+
|                Siri|             file_id|
+--------------------+--------------------+
|{1.3, http://www....|file:///content/j...|
|{1.3, http://www....|file:///content/j...|
|{1.3, http://www....|file:///content/j...|
|{1.3, http://www....|file:///content/j...|
|{1.3, http://www....|file:///content/j...|
+--------------------+--------------------+
only showing top 5 rows



In [None]:
# Show the Extensions values of the first 5 rows without truncating the text.
df.select(
    col("Siri.VehicleMonitoringDelivery.VehicleActivity.MonitoredVehicleJourney.Extensions")
).show(n=5, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [14]:
# Lift the envelope fields to top-level columns and produce one row per element
# of the MonitoredVehicleJourney array.
flattened_df = df.select(
    "file_id",
    *[
        col(source_path).alias(column_name)
        for source_path, column_name in [
            ("Siri.@version", "Siri_version"),
            ("Siri.@xmlns", "Siri_xmlns"),
            ("Siri.ResponseTimestamp", "Siri_ResponseTimestamp"),
            ("Siri.VehicleMonitoringDelivery.@version", "VehicleMonitoringDelivery_version"),
            ("Siri.VehicleMonitoringDelivery.ResponseTimestamp", "VehicleMonitoringDelivery_ResponseTimestamp"),
            ("Siri.VehicleMonitoringDelivery.ValidUntil", "VehicleMonitoringDelivery_ValidUntil"),
            ("Siri.VehicleMonitoringDelivery.VehicleActivity.RecordedAtTime", "RecordedAtTime"),
        ]
    ],
    explode("Siri.VehicleMonitoringDelivery.VehicleActivity.MonitoredVehicleJourney").alias("MonitoredVehicleJourney"),
)

In [15]:
# Keep the already-flat envelope columns and promote every field of the
# exploded MonitoredVehicleJourney struct to its own top-level column.
final_df = flattened_df.select(
    "file_id",
    "Siri_version",
    "Siri_xmlns",
    "Siri_ResponseTimestamp",
    "VehicleMonitoringDelivery_version",
    "VehicleMonitoringDelivery_ResponseTimestamp",
    "VehicleMonitoringDelivery_ValidUntil",
    "RecordedAtTime",
    *[
        col(source_path).alias(column_name)
        for source_path, column_name in [
            ("MonitoredVehicleJourney.LineRef", "LineRef"),
            ("MonitoredVehicleJourney.DirectionRef", "DirectionRef"),
            ("MonitoredVehicleJourney.FramedVehicleJourneyRef.DataFrameRef", "DataFrameRef"),
            ("MonitoredVehicleJourney.FramedVehicleJourneyRef.DatedVehicleJourneyRef", "DatedVehicleJourneyRef"),
            ("MonitoredVehicleJourney.PublishedLineName", "PublishedLineName"),
            ("MonitoredVehicleJourney.OriginRef", "OriginRef"),
            ("MonitoredVehicleJourney.DestinationRef", "DestinationRef"),
            ("MonitoredVehicleJourney.Monitored", "Monitored"),
            ("MonitoredVehicleJourney.VehicleLocation.Longitude", "VehicleLocation_Longitude"),
            ("MonitoredVehicleJourney.VehicleLocation.Latitude", "VehicleLocation_Latitude"),
            ("MonitoredVehicleJourney.ProgressRate", "ProgressRate"),
            ("MonitoredVehicleJourney.CourseOfJourneyRef", "CourseOfJourneyRef"),
            ("MonitoredVehicleJourney.VehicleRef", "VehicleRef"),
            ("MonitoredVehicleJourney.MonitoredCall.StopPointRef", "MonitoredCall_StopPointRef"),
            ("MonitoredVehicleJourney.MonitoredCall.VisitNumber", "MonitoredCall_VisitNumber"),
            ("MonitoredVehicleJourney.MonitoredCall.VehicleAtStop", "MonitoredCall_VehicleAtStop"),
            ("MonitoredVehicleJourney.Extensions.LastGPSFix", "Extensions_LastGPSFix"),
            ("MonitoredVehicleJourney.Extensions.Scheduled", "Extensions_Scheduled"),
            ("MonitoredVehicleJourney.Extensions.Bearing", "Extensions_Bearing"),
            ("MonitoredVehicleJourney.Extensions.Speed", "Extensions_Speed"),
            ("MonitoredVehicleJourney.Extensions.DestinationName", "Extensions_DestinationName"),
        ]
    ],
)

In [16]:
# Reduce file_id from the full source path to the bare file name without its
# ".json" suffix. This cell reassigns final_df from itself, so the pattern must
# also match an already-shortened id: the lazy group with an optional suffix does,
# which keeps file_id unchanged on a re-run. The previous lookahead pattern did
# not match a shortened id, and regexp_extract returns "" when nothing matches,
# so running the cell twice blanked the column.
final_df = final_df.withColumn("file_id", regexp_extract("file_id", r"([^/]+?)(?:\.json)?$", 1))
final_df.show(5)

+--------------+------------+--------------------+----------------------+---------------------------------+-------------------------------------------+------------------------------------+--------------------+-------+--------------+--------------------+----------------------+--------------------+---------+--------------+---------+-------------------------+------------------------+------------+------------------+----------+--------------------------+-------------------------+---------------------------+---------------------+--------------------+------------------+-------------------+--------------------------+
|       file_id|Siri_version|          Siri_xmlns|Siri_ResponseTimestamp|VehicleMonitoringDelivery_version|VehicleMonitoringDelivery_ResponseTimestamp|VehicleMonitoringDelivery_ValidUntil|      RecordedAtTime|LineRef|  DirectionRef|        DataFrameRef|DatedVehicleJourneyRef|   PublishedLineName|OriginRef|DestinationRef|Monitored|VehicleLocation_Longitude|VehicleLocation_Latitude|

In [17]:
# Show, untruncated, up to 100 flattened rows whose file_id matches one dump.
final_df.where(col("file_id") == "455-1723569052").show(n=100, truncate=False)

+--------------+------------+---------------------------+---------------------------------+---------------------------------+-------------------------------------------+------------------------------------+---------------------------------+-------+--------------+-------------------------+----------------------+-----------------------+---------+--------------+---------+-------------------------+------------------------+------------+------------------+----------+--------------------------+-------------------------+---------------------------+-----------------------+--------------------+------------------+-------------------+--------------------------+
|file_id       |Siri_version|Siri_xmlns                 |Siri_ResponseTimestamp           |VehicleMonitoringDelivery_version|VehicleMonitoringDelivery_ResponseTimestamp|VehicleMonitoringDelivery_ValidUntil|RecordedAtTime                   |LineRef|DirectionRef  |DataFrameRef             |DatedVehicleJourneyRef|PublishedLineName      |Origin

## DuckDB Installation

In [1]:
pip install duckdb



In [None]:
import duckdb