- Name: 04.2_dataframe_json
- Author: Shamas Imran
- Description: Working with JSON files using PySpark DataFrames
- Date: 10-Oct-2025

In [None]:
############# Basic Parsing
import json
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, LongType

# Sample payload with schema drift between records:
#   - record 2 has no "age"
#   - record 3 carries "age" as a string and has "country" instead of "city"
json_string_v1 = """
[
  {"id": 1, "name": "Alice", "age": 25, "city": "Lahore"},
  {"id": 2, "name": "Bob", "city": "Karachi"},
  {"id": 3, "name": "Charlie", "age": "30", "country": "Pakistan"}
]
"""

# Step 1: decode the JSON text into a Python list of dicts
data_v1 = json.loads(json_string_v1)

# Step 2: hand the records to Spark and let it infer the schema
# NOTE(review): "age" is an int in one record and a str in another - confirm
# how this runtime's schema inference treats that mix.
df_without_schema_v1 = spark.createDataFrame(data_v1)

# Step 3: inspect the result - rich display first, then the inferred schema
# (tree form and raw StructType), then the untruncated rows
display(df_without_schema_v1)
df_without_schema_v1.printSchema()
print(df_without_schema_v1.schema)
df_without_schema_v1.show(truncate=False)

In [None]:
############# Explicit Schema Definition

from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Explicit schema to handle drift safely: every field that appears in any
# record is declared up front and marked nullable, so records that lack a
# field still fit the schema.
schema_v1 = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("age", StringType(), True),   # keep as string (for mixed types)
    StructField("city", StringType(), True),
    StructField("country", StringType(), True)
])

# Build from data_v1, the list of records parsed from json_string_v1 in the
# basic-parsing cell. (The previous reference to `data` pointed at a variable
# that is only created in a later cell and holds a different structure.)
df_with_schema_v1 = spark.createDataFrame(data_v1, schema=schema_v1)
df_with_schema_v1.show()
print(df_with_schema_v1.schema)

In [2]:
# Handling Nested JSON Objects
import json
from pyspark.sql.functions import explode, col

# Each record carries an "address" object; the third one has no "zip" key.
json_string_nested = """
[
  {"id": 1, "name": "Alice", "address": {"city": "Lahore", "zip": "54000"}},
  {"id": 2, "name": "Bob", "address": {"city": "Karachi", "zip": "74200"}},
  {"id": 3, "name": "Charlie", "address": {"city": "Islamabad"}}
]
"""

# Decode the text, then build the DataFrame with an inferred schema
data_nested = json.loads(json_string_nested)
df_nested = spark.createDataFrame(data_nested)
df_nested.show(truncate=False)
# Uncomment to inspect the inferred schema:
# print(df_nested.schema)

# Pull the nested address entries up into flat, clearly named columns
address_city = col("address.city").alias("address_city")
address_zip = col("address.zip").alias("address_zip")
df_nested_select = df_nested.select(col("id"), col("name"), address_city, address_zip)

df_nested_select.show()


StatementMeta(, 100ead97-d505-486f-ae33-2d2f02b4a33a, 4, Finished, Available, Finished)

+--------------------+---+-------+
|             address| id|   name|
+--------------------+---+-------+
|{zip -> 54000, ci...|  1|  Alice|
|{zip -> 74200, ci...|  2|    Bob|
| {city -> Islamabad}|  3|Charlie|
+--------------------+---+-------+

+---+-------+------------+-----------+
| id|   name|address_city|address_zip|
+---+-------+------------+-----------+
|  1|  Alice|      Lahore|      54000|
|  2|    Bob|     Karachi|      74200|
|  3|Charlie|   Islamabad|       NULL|
+---+-------+------------+-----------+



In [15]:
# Handling JSON Arrays
from pyspark.sql import SparkSession
# `explode` is imported here as well as `explode_outer`: this cell calls both,
# and it must not depend on an import made by some other cell.
from pyspark.sql.functions import col, explode, explode_outer
import json

# Four cases: two phones, one phone, an empty array, and no "phones" key at all.
json_string_array = """
[
  {"id": 1, "name": "Alice", "phones": ["0300-1111111", "0300-2222222"]},
  {"id": 2, "name": "Bob",   "phones": ["0300-3333333"]},
  {"id": 3, "name": "Charlie", "phones": []},
  {"id": 4, "name": "Diana"}
]
"""

data_array = json.loads(json_string_array)
df_array = spark.createDataFrame(data_array)

# df_array.printSchema()
# df_array.show(truncate=False)

# Positional access: pick individual array elements by index.
df_array.select(
    col("id"),
    col("name"),
    col("phones")[0].alias("primary_phone"),
    col("phones")[1].alias("secondary_phone")
).show(truncate=False)


# explode(): one output row per array element.
df_exploded = df_array.withColumn("phone", explode(col("phones"))) \
                      .select(col("id"), col("name"), col("phone"))
df_exploded.show()

# explode_outer(): same idea, but rows without elements are kept.
df_exploded = df_array.withColumn("phone", explode_outer(col("phones"))) \
                      .select("id", "name", "phone")

# df_exploded.show()

# explode() vs explode_outer():
# explode()        → drops rows with empty or null arrays
# explode_outer()  → keeps them, fills phone=null


StatementMeta(, 5ca48b54-9b50-432f-b776-054b9e08cd04, 17, Finished, Available, Finished)

+---+-------+-------------+---------------+
|id |name   |primary_phone|secondary_phone|
+---+-------+-------------+---------------+
|1  |Alice  |0300-1111111 |0300-2222222   |
|2  |Bob    |0300-3333333 |NULL           |
|3  |Charlie|NULL         |NULL           |
|4  |Diana  |NULL         |NULL           |
+---+-------+-------------+---------------+

+---+-----+------------+
| id| name|       phone|
+---+-----+------------+
|  1|Alice|0300-1111111|
|  1|Alice|0300-2222222|
|  2|  Bob|0300-3333333|
+---+-----+------------+



In [6]:
#  Reading JSON from Files

# Default JSON reader: used here for the file with one record per line.
df_single_line = spark.read.json("Files/client_input_data/json/single_line_students.json")
df_single_line.show()

# The "multiline" read option is enabled for the file whose records span lines.
df_multi_line = spark.read.option("multiline", "true").json("Files/client_input_data/json/multi_line_students.json")
df_multi_line.show()

output_path_single = "Files/client_output_data/json/output_single_line"
output_path_multi  = "Files/client_output_data/json/output_multi_line"

df_single_line.write.mode("overwrite").json(output_path_single)

# "multiline" is a read-side option only; the JSON writer does not use it,
# so it is not passed here. coalesce(1) reduces the DataFrame to a single
# partition before writing.
df_multi_line.coalesce(1) \
    .write.mode("overwrite") \
    .json(output_path_multi)

print(df_multi_line.schema.json())  # Save for later use
# df_fixed = spark.read.schema(schema).json("Files/...")

StatementMeta(, b2276c1b-c42b-4b2c-a631-836fcb7210c3, 8, Finished, Available, Finished)

+---+---+-------+
|age| id|   name|
+---+---+-------+
| 25|  1|  Alice|
| 30|  2|    Bob|
| 35|  3|Charlie|
+---+---+-------+

+---+---+--------+
|age| id|    name|
+---+---+--------+
| 35|  1|  Shamas|
| 40|  2|   Imran|
| 45|  3|Muhammad|
| 50|  4|   Irfan|
+---+---+--------+

{"fields":[{"metadata":{},"name":"age","nullable":true,"type":"long"},{"metadata":{},"name":"id","nullable":true,"type":"long"},{"metadata":{},"name":"name","nullable":true,"type":"string"}],"type":"struct"}


## 🧩 Single-line JSON vs Multi-line JSON
---
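In **single-line JSON** (JSON Lines), each record (JSON object) is on **its own line**.  
This format is common in **logs**, **streaming data**, and **data lakes**.

In **multi-line JSON**, a record (or an array of records) spans several lines, so it must be read with the `multiline` option enabled.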

In [None]:
# Schema Drift Over Time (Evolution)

from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

spark = SparkSession.builder.appName("SchemaDriftExample").getOrCreate()

# Version 1 rows (original schema: id, name, country).
# Named rows_v1 / rows_v2 rather than data_v1 / data_v2 so this cell does not
# overwrite the parsed-JSON list `data_v1` created in the basic-parsing cell.
rows_v1 = [(1, "Ali", "Pakistan"), (2, "Sara", "USA")]
df_v1 = spark.createDataFrame(rows_v1, ["id", "name", "country"])

# Write version 1 to Parquet
output_path = "Files/client_output_data/parquet/schema_drift"
df_v1.write.mode("overwrite").parquet(output_path)

# Version 2 rows (schema evolved: new "year" column added)
rows_v2 = [(3, "Ahmed", "UK", 2025), (4, "Maria", "Canada", 2025)]
df_v2 = spark.createDataFrame(rows_v2, ["id", "name", "country", "year"])

# Append version 2 to the same Parquet folder.
# "mergeSchema" is a read-side Parquet option, so it is not set on the write;
# the schemas are reconciled when the folder is read back below.
df_v2.write.mode("append").parquet(output_path)

# Read combined data with merged schema
df_combined = spark.read.option("mergeSchema", "true").parquet(output_path)

df_combined.show()
df_combined.printSchema()


In [3]:
import json
from pyspark.sql.functions import explode, col

# 1) JSON document held in a Python string (multi-line).
#    The top level is a single object with two keys:
#      - "metadata": an object with version / source / created_at
#      - "users":    an array of user objects
#    The two users do not share the same keys: the first has "age" but no
#    "email"; the second has "email" but no "age".
json_variable = """
{
  "metadata": {
    "version": "1.0",
    "source": "user_generated",
    "created_at": "2025-10-08T18:55:00Z"
  },
  "users": [
    {
      "id": 1,
      "name": "Alice",
      "age": 30,
      "address": {"street": "Main St", "city": "NY"},
      "phones": ["123-4567", "987-6543"]
    },
    {
      "id": 2,
      "name": "Bob",
      "address": {"street": "2nd St", "city": "LA"},
      "phones": ["555-5555"],
      "email": "bob@example.com"
    }
  ]
}
"""

StatementMeta(, 100ead97-d505-486f-ae33-2d2f02b4a33a, 5, Finished, Available, Finished)

In [6]:
def _with_metadata(user, metadata):
    """Return a shallow copy of `user` extended with the entries of `metadata`.

    Each intermediate step is printed so the transformation can be followed.
    The `user` dict passed in is not modified.
    """
    print("Original user:", user)
    merged = dict(user)  # shallow copy; the parsed record stays untouched
    print("   Copied user:", merged)
    merged.update(metadata)  # metadata keys are added on top of the user's keys
    print("   After adding metadata:", merged)
    return merged


# Decode the JSON text into Python objects
data = json.loads(json_variable)

print("=== Users Data ===") # array
print(data["users"])

print("=== Metadata ===") # dictionary
print(data["metadata"])

# Attach the shared metadata to every user, one record at a time
users_with_metadata = []
for user in data["users"]:
    new_user = _with_metadata(user, data["metadata"])
    users_with_metadata.append(new_user)
    print("Added to final list.\n")

print("Final users_with_metadata list:")
print(users_with_metadata)

# The same flattening as a one-liner (without the step-by-step prints):
# users_with_metadata = [ {**user, **data["metadata"]} for user in data["users"] ]

# Turn the flattened records into a PySpark DataFrame
df = spark.createDataFrame(users_with_metadata)

print("=== Flattened DataFrame ===")
display(df)

StatementMeta(, 100ead97-d505-486f-ae33-2d2f02b4a33a, 8, Finished, Available, Finished)

=== Users Data ===
[{'id': 1, 'name': 'Alice', 'age': 30, 'address': {'street': 'Main St', 'city': 'NY'}, 'phones': ['123-4567', '987-6543']}, {'id': 2, 'name': 'Bob', 'address': {'street': '2nd St', 'city': 'LA'}, 'phones': ['555-5555'], 'email': 'bob@example.com'}]
=== Metadata ===
{'version': '1.0', 'source': 'user_generated', 'created_at': '2025-10-08T18:55:00Z'}
Original user: {'id': 1, 'name': 'Alice', 'age': 30, 'address': {'street': 'Main St', 'city': 'NY'}, 'phones': ['123-4567', '987-6543']}
   Copied user: {'id': 1, 'name': 'Alice', 'age': 30, 'address': {'street': 'Main St', 'city': 'NY'}, 'phones': ['123-4567', '987-6543']}
   After adding metadata: {'id': 1, 'name': 'Alice', 'age': 30, 'address': {'street': 'Main St', 'city': 'NY'}, 'phones': ['123-4567', '987-6543'], 'version': '1.0', 'source': 'user_generated', 'created_at': '2025-10-08T18:55:00Z'}
Added to final list.

Original user: {'id': 2, 'name': 'Bob', 'address': {'street': '2nd St', 'city': 'LA'}, 'phones': ['55

SynapseWidget(Synapse.DataFrame, a67daf31-ad4d-4bb5-b8f8-878d8be8df5d)