- **Name:** 001_Introduction
- **Author:** Shamas Imran
- **Description:** Read JSON from a variable and create a DataFrame
<!--
Version          Date        Author           Description
01           19-Aug-2025   Shamas Imran       Handles simple, nested, array, and schema drift JSON 
-->

In [0]:
import json
from pyspark.sql.functions import explode, col

# 1) JSON variable (array of objects, multi-line)
#    The two records intentionally do not share the same keys: the first has
#    "age" but no "email", the second has "email" but no "age". Both carry a
#    nested "address" object and a "phones" array, which the later cells
#    flatten and explode.
json_variable = """
[
  {
    "id": 1,
    "name": "Alice",
    "age": 30,
    "address": {"street": "Main St", "city": "NY"},
    "phones": ["123-4567", "987-6543"]
  },
  {
    "id": 2,
    "name": "Bob",
    "address": {"street": "2nd St", "city": "LA"},
    "phones": ["555-5555"],
    "email": "bob@example.com"
  }
]
"""

In [0]:
# 2) Parse JSON string and create DataFrame
# json.loads turns the JSON text into plain Python objects; for the array
# above that is a list with one dict per record.
data = json.loads(json_variable)
# No schema argument is passed, so column names and types are left to Spark's
# inference over the dicts.
# NOTE(review): `spark` is not defined in this file - presumably the session
# object injected by the notebook runtime; confirm before running elsewhere.
df = spark.createDataFrame(data)

# Show the rows first, then the schema that was inferred for them.
print("=== Raw DataFrame ===")
# NOTE(review): `display` is not defined in this file either - presumably the
# notebook's built-in table renderer; confirm.
display(df)
df.printSchema()

In [0]:
# 3) Flatten nested JSON and explode arrays
#    Produces `df_flat`: one row per (record, phone) pair with the nested
#    address fields promoted to top-level "street" / "city" columns.
#    Only columns that actually exist in `df` are referenced, so a payload
#    that lacks "age", "email" or "phones" no longer makes the select fail.

# Imported in this cell so the cell does not depend on the import cell being
# edited; `col` is already provided by the import cell.
from pyspark.sql.functions import explode_outer

source_columns = set(df.columns)

if "phones" in source_columns:
    # explode_outer (instead of explode) keeps records whose "phones" value is
    # null or an empty array: they yield a single row with phone = null rather
    # than being silently dropped from the result.
    df_exploded = df.withColumn("phone", explode_outer(col("phones")))
else:
    df_exploded = df

if "address" in source_columns:
    # Optional scalar fields: keep the original column order, skip absent ones.
    scalar_columns = [
        col(name)
        for name in ("id", "name", "age", "email")
        if name in source_columns
    ]
    address_columns = [
        col("address.street").alias("street"),
        col("address.city").alias("city"),
    ]
    # "phone" is only present when the explode step above ran (or the input
    # already had such a column); selecting it unconditionally would raise.
    phone_columns = [col("phone")] if "phone" in df_exploded.columns else []

    df_flat = df_exploded.select(
        *scalar_columns,
        *address_columns,
        *phone_columns,
    )
else:
    # Nothing to flatten: pass the (possibly exploded) frame through unchanged.
    df_flat = df_exploded

print("=== Flattened DataFrame ===")
display(df_flat)

In [0]:
# 4) Handle schema drift with another JSON variable
#    This second payload has a "gender" key that the first payload never had,
#    and it has no "address", "phones" or "email" keys at all.
json_variable2 = """
[
  {
    "id": 3,
    "name": "Charlie",
    "age": 28,
    "gender": "M"
  }
]
"""

# Same parse-then-load steps as for the first payload.
data2 = json.loads(json_variable2)
df2 = spark.createDataFrame(data2)

# unionByName aligns the two frames by column name rather than by position.
# allowMissingColumns=True lets the column sets differ; per the PySpark docs,
# a column present on only one side is filled with nulls for the other side's rows.
df_union = df_flat.unionByName(df2, allowMissingColumns=True)

# Show the combined rows, then the merged schema.
print("=== Union with Schema Drift ===")
display(df_union)
df_union.printSchema()