- **Name:** 17_dataframe_complex_types
- **Author:** Shamas Imran
- **Description:** Working with complex data types (arrays, maps, structs)
- **Date:** 19-Aug-2025
<!--
REVISION HISTORY
Version          Date        Author           Description
01           19-Aug-2025   Shamas Imran       Created DataFrames with arrays and structs  
                                              Accessed nested fields  
                                              Applied explode on arrays  
-->

In [0]:
from pyspark.sql import functions as F

In [0]:
                                                        # ===============================
                                                        # 1. ARRAYS
                                                        # ===============================


# Build a two-row DataFrame: an integer id plus an array-of-strings column
df_array = spark.createDataFrame(
    data=[
        (1, ["python", "spark", "sql", "pandas", "scala"]),
        (2, ["azure", "databricks"]),
    ],
    schema=["id", "skills"],
)

# truncate=False prints each array in full instead of cutting long values
df_array.show(truncate=False)

# explode() turns every array element into its own row; the original
# columns are kept and the element lands in a new "skill" column
df_array_exploded = df_array.select(
    "*",
    F.explode(F.col("skills")).alias("skill"),
)
df_array_exploded.show()

# Array helpers:
#   size()           -> number of elements in the array
#   array_contains() -> True when the given value is one of the elements
# show() with default arguments truncates each column to 20 characters
# (longer text gets cut with ...).
df_array.select(
    F.col("id"),
    F.size(F.col("skills")).alias("skill_count"),
    F.array_contains(F.col("skills"), "spark").alias("knows_spark"),
).show()

In [0]:
                                                        # ===============================
                                                        # 2. STRUCTS
                                                        # ===============================

# Build a sample DataFrame whose "name" column is a nested tuple; the code
# below reads that tuple's two positions as the fields _1 and _2
df_struct = spark.createDataFrame(
    data=[
        (1, ("John", "Doe")),
        (2, ("Jane", "Smith")),
    ],
    schema=["id", "name"],
)

# Re-pack the two positional fields into a struct with readable field
# names (first, last), stored in a new "name_struct" column
df_struct = df_struct.withColumn(
    "name_struct",
    F.struct(
        F.col("name").getField("_1").alias("first"),
        F.col("name").getField("_2").alias("last"),
    ),
)

# Struct fields are addressed with dot notation: <struct column>.<field>
df_struct.select(
    F.col("id"),
    F.col("name_struct.first"),
    F.col("name_struct.last"),
).show()

# The same dot notation also works inside a SQL expression string
df_struct.where("name_struct.last = 'Doe'").show()

In [0]:
                                                                    # ===============================
                                                                    # 3. JSON
                                                                    # ===============================

# Build a sample DataFrame where each address is stored as a raw JSON string
df_json = spark.createDataFrame(
    data=[
        (1, '{"city":"New York","zip":10001}'),
        (2, '{"city":"LA","zip":90001}'),
    ],
    schema=["id", "address_json"],
)

# from_json() parses the string into a struct; the DDL-style schema string
# declares the fields to extract and their types (city, zip)
df_parsed = df_json.withColumn(
    "address",
    F.from_json(F.col("address_json"), schema="city STRING, zip INT"),
)

# Once parsed, the fields are reachable with dot notation like any struct
df_parsed.select(
    F.col("id"),
    F.col("address.city"),
    F.col("address.zip"),
).show()

# to_json() serializes the struct back into a JSON string column
df_tojson = df_parsed.withColumn("json_again", F.to_json(F.col("address")))
df_tojson.show(truncate=False)