In [5]:
from pyspark.sql import SparkSession

In [6]:
# Obtain the SparkSession used by every cell below, registered under this app name.
# NOTE(review): "flattern" looks like a typo for "flatten"; the string is kept verbatim.
spark = (
    SparkSession.builder
    .appName("explode vs flattern")
    .getOrCreate()
)

In [7]:
# Echo the session object as the cell's last expression so the notebook displays it.
spark

In [8]:
from pyspark.sql.functions import explode, explode_outer, flatten

In [9]:
# Sample data: two users, each holding a nested (array-of-arrays) list of
# recommended products.
recs_df = spark.createDataFrame(
    [
        (1, [["prod1", "prod2"], ["prod3"]]),
        (2, [["prod4", "prod5"], ["prod6"]]),
    ],
    schema=["userid", "prodRecommendations"],
)

In [10]:
# truncate=False prints the nested arrays in full rather than cutting long values short.
recs_df.show(truncate=False)

+------+-------------------------+
|userid|prodRecommendations      |
+------+-------------------------+
|1     |[[prod1, prod2], [prod3]]|
|2     |[[prod4, prod5], [prod6]]|
+------+-------------------------+



In [11]:
# flatten() merges the inner arrays into a single array per row; the number of
# rows stays the same (still one row per user).
recsFlatten = recs_df.withColumn(
    "flattenedOutput",
    flatten(recs_df["prodRecommendations"]),
)
recsFlatten.show(truncate=False)

+------+-------------------------+---------------------+
|userid|prodRecommendations      |flattenedOutput      |
+------+-------------------------+---------------------+
|1     |[[prod1, prod2], [prod3]]|[prod1, prod2, prod3]|
|2     |[[prod4, prod5], [prod6]]|[prod4, prod5, prod6]|
+------+-------------------------+---------------------+



In [12]:
# explode() emits one output row per element of the outer array, so every row
# now carries a single inner array (array<string>) in Recommendations_1.
explodedDF1 = recs_df.withColumn(
    "Recommendations_1",
    explode(recs_df["prodRecommendations"]),
)
explodedDF1.show(truncate=False)
# The schema confirms that one level of nesting has been removed.
explodedDF1.printSchema()

+------+-------------------------+-----------------+
|userid|prodRecommendations      |Recommendations_1|
+------+-------------------------+-----------------+
|1     |[[prod1, prod2], [prod3]]|[prod1, prod2]   |
|1     |[[prod1, prod2], [prod3]]|[prod3]          |
|2     |[[prod4, prod5], [prod6]]|[prod4, prod5]   |
|2     |[[prod4, prod5], [prod6]]|[prod6]          |
+------+-------------------------+-----------------+

root
 |-- userid: long (nullable = true)
 |-- prodRecommendations: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- Recommendations_1: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [35]:
# A second explode() unpacks the inner arrays, leaving one row per product.
# The intermediate Recommendations_1 column is not carried into the result.
explodedDF2 = explodedDF1.select(
    "userid",
    "prodRecommendations",
    explode(explodedDF1["Recommendations_1"]).alias("Recommendations_2"),
)
explodedDF2.show(truncate=False)

+------+-------------------------+-----------------+
|userid|prodRecommendations      |Recommendations_2|
+------+-------------------------+-----------------+
|1     |[[prod1, prod2], [prod3]]|prod1            |
|1     |[[prod1, prod2], [prod3]]|prod2            |
|1     |[[prod1, prod2], [prod3]]|prod3            |
|2     |[[prod4, prod5], [prod6]]|prod4            |
|2     |[[prod4, prod5], [prod6]]|prod5            |
|2     |[[prod4, prod5], [prod6]]|prod6            |
+------+-------------------------+-----------------+



In [37]:
# Edge cases for explode(): a populated array (id 1), an empty array (id 2)
# and a missing (None / NULL) array (id 3).
nullable_df = spark.createDataFrame(
    [
        (1, ["apple", "banana"]),
        (2, []),
        (3, None),
    ],
    schema=["id", "fruits"],
)

In [39]:
# Show the raw rows: the empty array prints as [] and the missing array as NULL.
nullable_df.show(truncate=False)

+---+---------------+
|id |fruits         |
+---+---------------+
|1  |[apple, banana]|
|2  |[]             |
|3  |NULL           |
+---+---------------+



In [43]:
# explode() produces no output row when the array is empty or NULL, so
# id 2 (empty array) and id 3 (NULL array) are missing from the result.
nullable_exploded = nullable_df.select("id", explode("fruits").alias("fruit"))
nullable_exploded.show(truncate=False)

+---+------+
|id |fruit |
+---+------+
|1  |apple |
|1  |banana|
+---+------+



In [45]:
# explode_outer() keeps rows whose array is empty or NULL: instead of dropping
# them (as explode() does) it emits a single row with NULL in the exploded
# column, so ids 2 and 3 survive.
# explode_outer is imported in the notebook's imports cell with the other
# pyspark.sql.functions helpers.
retain_null_exploded = nullable_df.select(
    "id",
    explode_outer("fruits").alias("fruit"),
)
retain_null_exploded.show(truncate=False)

+---+------+
|id |fruit |
+---+------+
|1  |apple |
|1  |banana|
|2  |NULL  |
|3  |NULL  |
+---+------+

