In [67]:
from pyspark.sql import SparkSession

# Create (or reuse, via getOrCreate) a Spark session in local mode using all
# available cores; the Spark UI port is set to 4042 through spark.ui.port.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("JSON_IN_PYSPARK")
    .config("spark.ui.port", "4042")
    .getOrCreate()
)

# Print the Spark web UI address, then show the session object as the
# cell's rich output.
print(spark.sparkContext.uiWebUrl)
spark

http://ceewater-cpu001.unity.rc.umass.edu:4042


In [61]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, StructType, TimestampType

# Explicit schema for the JSON event payload. Every field is nullable.
# `location` is a nested struct holding the lat/long pair, and `timestamp`
# is kept as a plain string (no timestamp parsing is applied here).
schema = (
    StructType()
    .add("user_id", StringType(), True)
    .add("name", StringType(), True)
    .add("event", StringType(), True)
    .add("amount", DoubleType(), True)
    .add("currency", StringType(), True)
    .add(
        "location",
        StructType()
        .add("lat", DoubleType(), True)
        .add("long", DoubleType(), True),
        True,
    )
    .add("timestamp", StringType(), True)
)

#### Method: from_json()

In [62]:
from pyspark.sql.functions import from_json

# Hard-coded sample event standing in for a message received from Kafka.
# The JSON text is one string, split over adjacent literals for readability.
data = [
    (
        '{"user_id": "user_123", "name": "Alice Johnson", '
        '"event": "transaction", "amount": 250.75, "currency": "USD", '
        '"location": {"lat": 40.7128, "long": -74.0060}, '
        '"timestamp": "2025-03-19T14:30:00"}',
    )
]

# Single-column DataFrame holding the raw JSON text.
df = spark.createDataFrame(data, schema=["json_event"])

# Parse the JSON text into a struct column using the explicit `schema`,
# keeping the raw column alongside it.
df_parsed = df.select(
    "*",
    from_json("json_event", schema).alias("parsed_data"),
)
df_parsed.show(truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------+
|json_event                                                                                                                                                                                       |parsed_data                                                                                 |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------+
|{"user_id": "user_123", "name": "Alice Johnson", "event": "transaction", "amount": 250.75, "currency": "USD", "location": {"lat": 40

In [63]:
# Flatten the parsed struct: pull each field (including the nested
# location.lat / location.long) up to a top-level column.
df_final = df_parsed.select(
    *[
        f"parsed_data.{field_path}"
        for field_path in [
            "user_id",
            "name",
            "event",
            "amount",
            "currency",
            "location.lat",
            "location.long",
            "timestamp",
        ]
    ]
)
df_final.show(truncate=False)

+--------+-------------+-----------+------+--------+-------+-------+-------------------+
|user_id |name         |event      |amount|currency|lat    |long   |timestamp          |
+--------+-------------+-----------+------+--------+-------+-------+-------------------+
|user_123|Alice Johnson|transaction|250.75|USD     |40.7128|-74.006|2025-03-19T14:30:00|
+--------+-------------+-----------+------+--------+-------+-------+-------------------+



#### Method: get_json_object()

In [64]:
from pyspark.sql.functions import get_json_object

# Extract individual values straight from the raw JSON text by JSONPath,
# without parsing the whole document against a schema.
# Each pair is (output column name, JSONPath expression).
df_extracted = df.select(
    *[
        get_json_object("json_event", json_path).alias(column_name)
        for column_name, json_path in [
            ("user_id", "$.user_id"),
            ("transaction_amount", "$.amount"),
            ("latitude", "$.location.lat"),
            ("longitude", "$.location.long"),
            ("event_time", "$.timestamp"),
        ]
    ]
)

df_extracted.show(truncate=False)

+--------+------------------+--------+---------+-------------------+
|user_id |transaction_amount|latitude|longitude|event_time         |
+--------+------------------+--------+---------+-------------------+
|user_123|250.75            |40.7128 |-74.006  |2025-03-19T14:30:00|
+--------+------------------+--------+---------+-------------------+



#### Method: to_json()

In [65]:
from pyspark.sql.functions import to_json, struct

# Serialize the flattened columns back into a single JSON string column.
# The listed columns are first packed into a struct, which to_json encodes.
df_json = df_final.withColumn(
    "json_output",
    to_json(
        struct(
            [
                "user_id",
                "name",
                "event",
                "amount",
                "currency",
                "lat",
                "long",
                "timestamp",
            ]
        )
    ),
)

df_json.show(truncate=False)

+--------+-------------+-----------+------+--------+-------+-------+-------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user_id |name         |event      |amount|currency|lat    |long   |timestamp          |json_output                                                                                                                                                        |
+--------+-------------+-----------+------+--------+-------+-------+-------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user_123|Alice Johnson|transaction|250.75|USD     |40.7128|-74.006|2025-03-19T14:30:00|{"user_id":"user_123","name":"Alice Johnson","event":"transaction","amount":250.75,"currency":"USD","lat":40.7128,"long":-74.006,"timestamp":"2025-03-19T

#### Method: json_tuple()

In [53]:
from pyspark.sql.functions import json_tuple, col

# Pull several top-level JSON keys out of the raw text in one call.
# json_tuple yields one output column per requested key; the multi-name
# alias renames them in the same order.
# Note: this rebinds `df_extracted`, replacing any earlier value of that name.
df_extracted = df.select(
    json_tuple(col("json_event"), "user_id", "event", "amount", "timestamp").alias(
        "user_id", "event_type", "amount", "event_time"
    )
)
df_extracted.show(truncate=False)

+--------+-----------+------+-------------------+
|user_id |event_type |amount|event_time         |
+--------+-----------+------+-------------------+
|user_123|transaction|250.75|2025-03-19T14:30:00|
+--------+-----------+------+-------------------+



#### Method: schema_of_json()

In [60]:
# `lit` is imported here because this cell uses it and no other cell imports
# it; without it the cell raises NameError on a fresh kernel.
from pyspark.sql.functions import from_json, lit, schema_of_json

# Take one raw JSON document from the DataFrame to use as the sample from
# which the schema is inferred.
sample_json = df.select("json_event").limit(1).collect()[0][0]

# schema_of_json expects a literal (foldable) JSON string, so the sample is
# wrapped in lit(). The result is collected back as a DDL-formatted string.
schema_inferred = df.select(schema_of_json(lit(sample_json))).collect()[0][0]
print(schema_inferred)

# Parse the raw JSON with the inferred schema and expand the struct's
# top-level fields into columns.
# Note: this rebinds `df_parsed`, replacing any earlier value of that name.
df_parsed = df.withColumn("parsed_data", from_json("json_event", schema_inferred))
df_parsed.select("parsed_data.*").show(truncate=False)


STRUCT<amount: DOUBLE, currency: STRING, event: STRING, location: STRUCT<lat: DOUBLE, long: DOUBLE>, name: STRING, timestamp: STRING, user_id: STRING>
+------+--------+-----------+------------------+-------------+-------------------+--------+
|amount|currency|event      |location          |name         |timestamp          |user_id |
+------+--------+-----------+------------------+-------------+-------------------+--------+
|250.75|USD     |transaction|{40.7128, -74.006}|Alice Johnson|2025-03-19T14:30:00|user_123|
+------+--------+-----------+------------------+-------------+-------------------+--------+

