In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *

# Entry point for the DataFrame API used by every cell below.
# getOrCreate() hands back the already-running session when there is one,
# so re-running this cell does not start a second session.
spark = (
    SparkSession.builder
    .appName("Spark SQL Array & Map Functions")
    .getOrCreate()
)


In [2]:
# Sample rows, one tuple per person: (name, subjects, scores).
# `subjects` is a Python list and `scores` a Python dict, which show up as an
# array column and a map column in the DataFrame printed below.
data = [
    (
        "Alice",
        ["Math", "Science", "English"],
        {"Math": 90, "Science": 85},
    ),
    (
        "Bob",
        ["History", "Math"],
        {"History": 88},
    ),
    (
        "Charlie",
        ["English", "Science"],
        {"Science": 91, "English": 89},
    ),
]

# Column names, in the same order as the fields of each tuple above.
columns = ["name", "subjects", "scores"]

# Only column names are supplied, so the column types come from the data itself.
df = spark.createDataFrame(data, schema=columns)
df.show(truncate=False)


+-------+------------------------+------------------------------+
|name   |subjects                |scores                        |
+-------+------------------------+------------------------------+
|Alice  |[Math, Science, English]|{Science -> 85, Math -> 90}   |
|Bob    |[History, Math]         |{History -> 88}               |
|Charlie|[English, Science]      |{Science -> 91, English -> 89}|
+-------+------------------------+------------------------------+



In [3]:
# Array functions applied to the `subjects` column.
#
# Functions are referenced through the `F` namespace instead of the bare names
# provided by the wildcard import: that import rebinds Python built-ins such as
# `slice`, so qualified names make it explicit which function is being called.
df_array = df.select(
    "name",
    "subjects",
    # Number of elements in the array.
    F.size("subjects").alias("num_subjects"),
    # True when the array contains the literal value "Math".
    F.array_contains("subjects", "Math").alias("has_math"),
    # Array positions are 1-based: index 2 is the second element.
    F.element_at("subjects", 2).alias("second_subject"),
    # Start at position 1 (1-based) and take 2 elements.
    F.slice("subjects", 1, 2).alias("first_two_subjects"),
    # Emits one output row per array element; the other selected columns are
    # repeated on each of those rows.
    F.explode("subjects").alias("each_subject"),
)

df_array.show(truncate=False)


+-------+------------------------+------------+--------+--------------+------------------+------------+
|name   |subjects                |num_subjects|has_math|second_subject|first_two_subjects|each_subject|
+-------+------------------------+------------+--------+--------------+------------------+------------+
|Alice  |[Math, Science, English]|3           |true    |Science       |[Math, Science]   |Math        |
|Alice  |[Math, Science, English]|3           |true    |Science       |[Math, Science]   |Science     |
|Alice  |[Math, Science, English]|3           |true    |Science       |[Math, Science]   |English     |
|Bob    |[History, Math]         |2           |true    |Math          |[History, Math]   |History     |
|Bob    |[History, Math]         |2           |true    |Math          |[History, Math]   |Math        |
|Charlie|[English, Science]      |2           |false   |Science       |[English, Science]|English     |
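|Charlie|[English, Science]      |2           |false   |Science       |[English, Science]|Science     |
+-------+------------------------+------------+--------+--------------+------------------+------------+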

In [4]:
# Map functions applied to the `scores` column.
#
# Functions are referenced through the `F` namespace instead of the bare names
# provided by the wildcard import, so it is explicit where each one comes from.
df_map = df.select(
    "name",
    "scores",
    # All keys of the map, returned as an array.
    F.map_keys("scores").alias("subjects_in_map"),
    # All values of the map, returned as an array.
    F.map_values("scores").alias("marks"),
    # Look up the value stored under the key "Math"; rows whose map has no
    # such key show NULL in the output below.
    F.element_at("scores", "Math").alias("math_score"),
)

df_map.show(truncate=False)


+-------+------------------------------+------------------+--------+----------+
|name   |scores                        |subjects_in_map   |marks   |math_score|
+-------+------------------------------+------------------+--------+----------+
|Alice  |{Science -> 85, Math -> 90}   |[Science, Math]   |[85, 90]|90        |
|Bob    |{History -> 88}               |[History]         |[88]    |NULL      |
|Charlie|{Science -> 91, English -> 89}|[Science, English]|[91, 89]|NULL      |
+-------+------------------------------+------------------+--------+----------+

