<a href="https://colab.research.google.com/github/rahulrajpr/prepare-anytime/blob/main/spark/functions/15_spark_sql_generator_functions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Spark Generator Functions**
https://spark.apache.org/docs/latest/sql-ref-functions-builtin.html#generator-functions

In [None]:
# Install Java and PySpark
!apt-get update -qq
!apt-get install -y openjdk-11-jdk-headless -qq > /dev/null
!pip install pyspark -q

# Set Java home
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


In [None]:
# Show which PySpark release produced the outputs in this notebook
import pyspark

installed_version = pyspark.__version__
print(installed_version)

3.5.1


In [None]:
from pyspark.sql import SparkSession

# getOrCreate() returns the active session if there is one, otherwise starts a new one
session_builder = SparkSession.builder.appName('spark-functions')
spark = session_builder.getOrCreate()

In [None]:
from pyspark.sql.types import StructType,StructField, IntegerType,StringType,ArrayType,MapType

In [None]:
# Sample rows: (id, fruits array, properties map, scores array of (subject, score) structs).
# Rows 3, 4, 8 and 10 carry empty or null collections.
data = [
    (1, ["apple", "banana", "cherry"], {"name": "John", "age": "30"}, [("math", 90), ("science", 85)]),
    (2, ["date", "elderberry", "fig"], {"name": "Jane", "city": "NYC"}, [("history", 78), ("art", 92)]),
    (3, [], {"score": "100"}, []),
    (4, None, None, None),
    (5, ["grape", "honeydew"], {"name": "Mike", "department": "IT", "role": "developer"}, [("programming", 95), ("databases", 88)]),
    (6, ["kiwi", "lemon", "mango", "nectarine"], {"name": "Sarah", "age": "28", "city": "Boston"}, [("physics", 87), ("chemistry", 91), ("biology", 84)]),
    (7, ["orange"], {"status": "active", "level": "senior"}, [("communication", 79)]),
    (8, ["peach", "quince"], {"name": "Tom", "experience": "5 years"}, []),
    (9, ["raspberry", "strawberry", "blueberry", "blackberry"], {"team": "analytics", "projects": "3"}, [("statistics", 89), ("machine_learning", 94), ("visualization", 86)]),
    (10, None, {"name": "Lisa", "age": "35"}, [("economics", 82), ("finance", 90), ("marketing", 85), ("management", 88)])
]

# Element type of the `scores` array
score_struct = StructType([
    StructField("subject", StringType(), True),
    StructField("score", IntegerType(), True),
])

# Every column is declared nullable
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("fruits", ArrayType(StringType()), True),
    StructField("properties", MapType(StringType(), StringType()), True),
    StructField("scores", ArrayType(score_struct), True),
])

dataframe = spark.createDataFrame(data, schema)
dataframe.printSchema()
dataframe.show(truncate=False)

root
 |-- id: integer (nullable = true)
 |-- fruits: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- scores: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- subject: string (nullable = true)
 |    |    |-- score: integer (nullable = true)

+---+----------------------------------------------+---------------------------------------------------+-------------------------------------------------------------------+
|id |fruits                                        |properties                                         |scores                                                             |
+---+----------------------------------------------+---------------------------------------------------+-------------------------------------------------------------------+
|1  |[apple, banana, cherry]                       |{name -> 

In [None]:
# Expose the DataFrame to spark.sql() queries under the name 'dataframe_view'
dataframe.createOrReplaceTempView(name='dataframe_view')

In [None]:
# explode (use case for array)
# sql : one output row per array element

sql = """
select id, explode(fruits) as fruits
from dataframe_view
"""
exploded_fruits = spark.sql(sql)
exploded_fruits.show(truncate=False)

+---+----------+
|id |fruits    |
+---+----------+
|1  |apple     |
|1  |banana    |
|1  |cherry    |
|2  |date      |
|2  |elderberry|
|2  |fig       |
|5  |grape     |
|5  |honeydew  |
|6  |kiwi      |
|6  |lemon     |
|6  |mango     |
|6  |nectarine |
|7  |orange    |
|8  |peach     |
|8  |quince    |
|9  |raspberry |
|9  |strawberry|
|9  |blueberry |
|9  |blackberry|
+---+----------+



In [None]:
# explode (use case for array)
# pyspark : withColumn + expr, replacing the array column with its exploded elements

from pyspark.sql.functions import col, expr

(
    dataframe
    .withColumn('fruits', expr('''explode(fruits)'''))
    .select(['id', 'fruits'])
    .show(truncate=False)
)

+---+----------+
|id |fruits    |
+---+----------+
|1  |apple     |
|1  |banana    |
|1  |cherry    |
|2  |date      |
|2  |elderberry|
|2  |fig       |
|5  |grape     |
|5  |honeydew  |
|6  |kiwi      |
|6  |lemon     |
|6  |mango     |
|6  |nectarine |
|7  |orange    |
|8  |peach     |
|8  |quince    |
|9  |raspberry |
|9  |strawberry|
|9  |blueberry |
|9  |blackberry|
+---+----------+



In [None]:
# explode (use case for array)
# pyspark : explode() column function

from pyspark.sql.functions import explode, col

fruit_per_row = explode(col('fruits'))
(
    dataframe
    .withColumn('fruits', fruit_per_row)
    .select(['id', 'fruits'])
    .show(truncate=False)
)

+---+----------+
|id |fruits    |
+---+----------+
|1  |apple     |
|1  |banana    |
|1  |cherry    |
|2  |date      |
|2  |elderberry|
|2  |fig       |
|5  |grape     |
|5  |honeydew  |
|6  |kiwi      |
|6  |lemon     |
|6  |mango     |
|6  |nectarine |
|7  |orange    |
|8  |peach     |
|8  |quince    |
|9  |raspberry |
|9  |strawberry|
|9  |blueberry |
|9  |blackberry|
+---+----------+



In [None]:
# explode (use case for map)
# sql : one output row per map entry, split into key and value columns

sql = """
select id, explode(properties) as (key,value)
from dataframe_view
"""
exploded_properties = spark.sql(sql)
exploded_properties.show(truncate=False)

+---+----------+---------+
|id |key       |value    |
+---+----------+---------+
|1  |name      |John     |
|1  |age       |30       |
|2  |name      |Jane     |
|2  |city      |NYC      |
|3  |score     |100      |
|5  |name      |Mike     |
|5  |role      |developer|
|5  |department|IT       |
|6  |name      |Sarah    |
|6  |city      |Boston   |
|6  |age       |28       |
|7  |level     |senior   |
|7  |status    |active   |
|8  |name      |Tom      |
|8  |experience|5 years  |
|9  |projects  |3        |
|9  |team      |analytics|
|10 |name      |Lisa     |
|10 |age       |35       |
+---+----------+---------+



In [None]:
# explode (use case for map)
# pyspark : selectExpr, sorted so the displayed order is stable

from pyspark.sql.functions import col,expr

(
    dataframe
    .selectExpr('id', 'explode(properties) as (key, value)')
    .orderBy('id', 'key', 'value')
    .show(truncate=False)
)

+---+----------+---------+
|id |key       |value    |
+---+----------+---------+
|1  |age       |30       |
|1  |name      |John     |
|2  |city      |NYC      |
|2  |name      |Jane     |
|3  |score     |100      |
|5  |department|IT       |
|5  |name      |Mike     |
|5  |role      |developer|
|6  |age       |28       |
|6  |city      |Boston   |
|6  |name      |Sarah    |
|7  |level     |senior   |
|7  |status    |active   |
|8  |experience|5 years  |
|8  |name      |Tom      |
|9  |projects  |3        |
|9  |team      |analytics|
|10 |age       |35       |
|10 |name      |Lisa     |
+---+----------+---------+



In [None]:
# explode (use case for map)
# pyspark : explode() column function + alias for the two generated columns

# explode is imported here because this cell calls it; previously the cell only ran
# if an earlier cell had already imported it
from pyspark.sql.functions import col, explode, expr
dataframe.select('id',explode('properties').alias('key','value'))\
         .orderBy('id','key','value')\
         .show(truncate = False)

+---+----------+---------+
|id |key       |value    |
+---+----------+---------+
|1  |age       |30       |
|1  |name      |John     |
|2  |city      |NYC      |
|2  |name      |Jane     |
|3  |score     |100      |
|5  |department|IT       |
|5  |name      |Mike     |
|5  |role      |developer|
|6  |age       |28       |
|6  |city      |Boston   |
|6  |name      |Sarah    |
|7  |level     |senior   |
|7  |status    |active   |
|8  |experience|5 years  |
|8  |name      |Tom      |
|9  |projects  |3        |
|9  |team      |analytics|
|10 |age       |35       |
|10 |name      |Lisa     |
+---+----------+---------+



In [None]:
# explode_outer : unlike explode, keeps rows whose array/map is null or empty,
# emitting NULL in the generated column(s) for those rows

from pyspark.sql.functions import explode_outer, col, expr

# array
# n = 50 lifts show()'s default 20-row limit, which cut off the final rows
# (including id 10, whose fruits array is null)
dataframe.withColumn('fruits', explode_outer(col('fruits')))\
         .select(['id','fruits'])\
         .show(n = 50, truncate = False)

# map
dataframe.select('id',explode_outer('properties').alias('key','value'))\
         .orderBy('id','key','value')\
         .show(n = 50, truncate = False)

+---+----------+
|id |fruits    |
+---+----------+
|1  |apple     |
|1  |banana    |
|1  |cherry    |
|2  |date      |
|2  |elderberry|
|2  |fig       |
|3  |NULL      |
|4  |NULL      |
|5  |grape     |
|5  |honeydew  |
|6  |kiwi      |
|6  |lemon     |
|6  |mango     |
|6  |nectarine |
|7  |orange    |
|8  |peach     |
|8  |quince    |
|9  |raspberry |
|9  |strawberry|
|9  |blueberry |
+---+----------+
only showing top 20 rows

+---+----------+---------+
|id |key       |value    |
+---+----------+---------+
|1  |age       |30       |
|1  |name      |John     |
|2  |city      |NYC      |
|2  |name      |Jane     |
|3  |score     |100      |
|4  |NULL      |NULL     |
|5  |department|IT       |
|5  |name      |Mike     |
|5  |role      |developer|
|6  |age       |28       |
|6  |city      |Boston   |
|6  |name      |Sarah    |
|7  |level     |senior   |
|7  |status    |active   |
|8  |experience|5 years  |
|8  |name      |Tom      |
|9  |projects  |3        |
|9  |team      |analytics|

###### When to use `explode()` vs `explode_outer()`

###### Use `explode()`
**When you want to skip empty/null collections**

###### Use `explode_outer()`
**When you need to preserve all original rows, even if collections are empty/null**

In [None]:
# posexplode (use case for array)
# sql : like explode, plus a 0-based position column for each element

sql = """
select id,posexplode(fruits) as (pos, fruit)
from dataframe_view
"""

positioned_fruits = spark.sql(sql)
positioned_fruits.show(truncate=False)

+---+---+----------+
|id |pos|fruit     |
+---+---+----------+
|1  |0  |apple     |
|1  |1  |banana    |
|1  |2  |cherry    |
|2  |0  |date      |
|2  |1  |elderberry|
|2  |2  |fig       |
|5  |0  |grape     |
|5  |1  |honeydew  |
|6  |0  |kiwi      |
|6  |1  |lemon     |
|6  |2  |mango     |
|6  |3  |nectarine |
|7  |0  |orange    |
|8  |0  |peach     |
|8  |1  |quince    |
|9  |0  |raspberry |
|9  |1  |strawberry|
|9  |2  |blueberry |
|9  |3  |blackberry|
+---+---+----------+



In [None]:
# posexplode (use case for array)
# pyspark : selectExpr, sorted so the displayed order is stable

(
    dataframe
    .selectExpr('id', 'posexplode(fruits) as (pos, fruit)')
    .orderBy('id', 'pos', 'fruit')
    .show(truncate=False)
)

+---+---+----------+
|id |pos|fruit     |
+---+---+----------+
|1  |0  |apple     |
|1  |1  |banana    |
|1  |2  |cherry    |
|2  |0  |date      |
|2  |1  |elderberry|
|2  |2  |fig       |
|5  |0  |grape     |
|5  |1  |honeydew  |
|6  |0  |kiwi      |
|6  |1  |lemon     |
|6  |2  |mango     |
|6  |3  |nectarine |
|7  |0  |orange    |
|8  |0  |peach     |
|8  |1  |quince    |
|9  |0  |raspberry |
|9  |1  |strawberry|
|9  |2  |blueberry |
|9  |3  |blackberry|
+---+---+----------+



In [None]:
# posexplode (use case for array)
# pyspark : select + expr, naming the two generated columns with alias
# (expr is imported by an earlier cell)

pos_and_fruit = expr('posexplode(fruits)').alias('pos', 'fruit')
(
    dataframe
    .select('id', pos_and_fruit)
    .orderBy('id', 'pos', 'fruit')
    .show(truncate=False)
)

+---+---+----------+
|id |pos|fruit     |
+---+---+----------+
|1  |0  |apple     |
|1  |1  |banana    |
|1  |2  |cherry    |
|2  |0  |date      |
|2  |1  |elderberry|
|2  |2  |fig       |
|5  |0  |grape     |
|5  |1  |honeydew  |
|6  |0  |kiwi      |
|6  |1  |lemon     |
|6  |2  |mango     |
|6  |3  |nectarine |
|7  |0  |orange    |
|8  |0  |peach     |
|8  |1  |quince    |
|9  |0  |raspberry |
|9  |1  |strawberry|
|9  |2  |blueberry |
|9  |3  |blackberry|
+---+---+----------+



In [None]:
# posexplode (use case for map)
# sql : one row per map entry with its position, key and value

sql = """
select id,posexplode(properties) as (pos,key,value)
from dataframe_view
"""

positioned_properties = spark.sql(sql)
positioned_properties.show(truncate=False)

+---+---+----------+---------+
|id |pos|key       |value    |
+---+---+----------+---------+
|1  |0  |name      |John     |
|1  |1  |age       |30       |
|2  |0  |name      |Jane     |
|2  |1  |city      |NYC      |
|3  |0  |score     |100      |
|5  |0  |name      |Mike     |
|5  |1  |role      |developer|
|5  |2  |department|IT       |
|6  |0  |name      |Sarah    |
|6  |1  |city      |Boston   |
|6  |2  |age       |28       |
|7  |0  |level     |senior   |
|7  |1  |status    |active   |
|8  |0  |name      |Tom      |
|8  |1  |experience|5 years  |
|9  |0  |projects  |3        |
|9  |1  |team      |analytics|
|10 |0  |name      |Lisa     |
|10 |1  |age       |35       |
+---+---+----------+---------+



In [None]:
# posexplode (use case for map)
# pyspark : selectExpr, sorted so the displayed order is stable

(
    dataframe
    .selectExpr('id', 'posexplode(properties) as (pos,key,value)')
    .orderBy('id', 'pos', 'key', 'value')
    .show(truncate=False)
)

+---+---+----------+---------+
|id |pos|key       |value    |
+---+---+----------+---------+
|1  |0  |name      |John     |
|1  |1  |age       |30       |
|2  |0  |name      |Jane     |
|2  |1  |city      |NYC      |
|3  |0  |score     |100      |
|5  |0  |name      |Mike     |
|5  |1  |role      |developer|
|5  |2  |department|IT       |
|6  |0  |name      |Sarah    |
|6  |1  |city      |Boston   |
|6  |2  |age       |28       |
|7  |0  |level     |senior   |
|7  |1  |status    |active   |
|8  |0  |name      |Tom      |
|8  |1  |experience|5 years  |
|9  |0  |projects  |3        |
|9  |1  |team      |analytics|
|10 |0  |name      |Lisa     |
|10 |1  |age       |35       |
+---+---+----------+---------+



In [None]:
# posexplode (use case for map)
# pyspark : select + expr, naming the three generated columns with alias
# (expr is imported by an earlier cell)

pos_key_value = expr('posexplode(properties)').alias('pos', 'key', 'value')
(
    dataframe
    .select('id', pos_key_value)
    .orderBy('id', 'pos', 'key', 'value')
    .show(truncate=False)
)

+---+---+----------+---------+
|id |pos|key       |value    |
+---+---+----------+---------+
|1  |0  |name      |John     |
|1  |1  |age       |30       |
|2  |0  |name      |Jane     |
|2  |1  |city      |NYC      |
|3  |0  |score     |100      |
|5  |0  |name      |Mike     |
|5  |1  |role      |developer|
|5  |2  |department|IT       |
|6  |0  |name      |Sarah    |
|6  |1  |city      |Boston   |
|6  |2  |age       |28       |
|7  |0  |level     |senior   |
|7  |1  |status    |active   |
|8  |0  |name      |Tom      |
|8  |1  |experience|5 years  |
|9  |0  |projects  |3        |
|9  |1  |team      |analytics|
|10 |0  |name      |Lisa     |
|10 |1  |age       |35       |
+---+---+----------+---------+



In [None]:
# posexplode_outer : unlike posexplode, keeps rows whose array/map is null or empty,
# emitting NULL for the position and the generated column(s) of those rows

from pyspark.sql.functions import posexplode_outer, col, expr

# array
# n = 50 lifts show()'s default 20-row limit, which cut off the final rows
# (including id 10, whose fruits array is null)
dataframe.select('id',posexplode_outer('fruits').alias('pos','fruit'))\
         .orderBy('id','pos','fruit')\
         .show(n = 50, truncate = False)

# map
dataframe.select('id',posexplode_outer('properties').alias('pos','key','value'))\
         .orderBy('id','pos','key','value')\
         .show(n = 50, truncate = False)

+---+----+----------+
|id |pos |fruit     |
+---+----+----------+
|1  |0   |apple     |
|1  |1   |banana    |
|1  |2   |cherry    |
|2  |0   |date      |
|2  |1   |elderberry|
|2  |2   |fig       |
|3  |NULL|NULL      |
|4  |NULL|NULL      |
|5  |0   |grape     |
|5  |1   |honeydew  |
|6  |0   |kiwi      |
|6  |1   |lemon     |
|6  |2   |mango     |
|6  |3   |nectarine |
|7  |0   |orange    |
|8  |0   |peach     |
|8  |1   |quince    |
|9  |0   |raspberry |
|9  |1   |strawberry|
|9  |2   |blueberry |
+---+----+----------+
only showing top 20 rows

+---+----+----------+---------+
|id |pos |key       |value    |
+---+----+----------+---------+
|1  |0   |name      |John     |
|1  |1   |age       |30       |
|2  |0   |name      |Jane     |
|2  |1   |city      |NYC      |
|3  |0   |score     |100      |
|4  |NULL|NULL      |NULL     |
|5  |0   |name      |Mike     |
|5  |1   |role      |developer|
|5  |2   |department|IT       |
|6  |0   |name      |Sarah    |
|6  |1   |city      |Boston   

###### When to use `posexplode()` vs `posexplode_outer()`

###### Use `posexplode()`
**When you want to skip empty/null collections**

###### Use `posexplode_outer()`
**When you need to preserve all original rows, even if collections are empty/null**

---
##### `explode()` vs `posexplode()`
| Aspect | explode() | posexplode() |
|--------|-----------|--------------|
| **Basic Purpose** | Expands collections into rows | Expands collections into rows with position/index |
| **Output Columns** | **Array**: `col`<br>**Map**: `key`, `value` | **Array**: `pos`, `col`<br>**Map**: `pos`, `key`, `value` |
| **Position Info** | ❌ No position information | ✅ Includes 0-based position index |
| **Use Case** | When you only need the values | When you need the original order/position |
| **Array Example** | `[A, B, C]` → `A`, `B`, `C` | `[A, B, C]` → `(0,A)`, `(1,B)`, `(2,C)` |
| **Map Example** | `{k1:v1, k2:v2}` → `(k1,v1)`, `(k2,v2)` | `{k1:v1, k2:v2}` → `(0,k1,v1)`, `(1,k2,v2)` |
| **Order Preservation** | ❌ Element position is not recorded in the output | ✅ Position is recorded in `pos` (meaningful for arrays) |
| **Performance** | ✅ Slightly faster (less data) | ❌ Slightly slower (more data) |
| **Memory Usage** | ✅ Lower | ❌ Higher (extra position column) |
| **Typical Usage** | Simple flattening of collections | When index/position is important for business logic |

In [None]:
# inline : recap of the source data before the inline examples
# (`scores` is the array-of-structs column those examples expand)

sql = '''
select *
from dataframe_view
'''
# This line used to start with `--`, a SQL comment marker. In Python that is two unary
# minus operators applied to printSchema()'s None return value, which raises TypeError
# before the table below is shown.
dataframe.printSchema()
spark.sql(sql).show(truncate = False)

root
 |-- id: integer (nullable = true)
 |-- fruits: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- scores: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- subject: string (nullable = true)
 |    |    |-- score: integer (nullable = true)

+---+----------------------------------------------+---------------------------------------------------+-------------------------------------------------------------------+
|id |fruits                                        |properties                                         |scores                                                             |
+---+----------------------------------------------+---------------------------------------------------+-------------------------------------------------------------------+
|1  |[apple, banana, cherry]                       |{name -> 

In [None]:
# inline : expands an array of structs into one row per struct, one column per field
# sql

sql = """
select id,scores, inline(scores) as (subject,score)
from dataframe_view
"""
inlined_scores = spark.sql(sql)
inlined_scores.show(truncate=False)

+---+-------------------------------------------------------------------+----------------+-----+
|id |scores                                                             |subject         |score|
+---+-------------------------------------------------------------------+----------------+-----+
|1  |[{math, 90}, {science, 85}]                                        |math            |90   |
|1  |[{math, 90}, {science, 85}]                                        |science         |85   |
|2  |[{history, 78}, {art, 92}]                                         |history         |78   |
|2  |[{history, 78}, {art, 92}]                                         |art             |92   |
|5  |[{programming, 95}, {databases, 88}]                               |programming     |95   |
|5  |[{programming, 95}, {databases, 88}]                               |databases       |88   |
|6  |[{physics, 87}, {chemistry, 91}, {biology, 84}]                    |physics         |87   |
|6  |[{physics, 87}, {chemistr

In [None]:
# inline : expands an array of structs into one row per struct, one column per field
# pyspark : inline() column function + alias for the generated columns

from pyspark.sql.functions import col,inline

subject_and_score = inline(col('scores')).alias('subject', 'score')
dataframe.select('id', 'scores', subject_and_score).show(truncate=False)

+---+-------------------------------------------------------------------+----------------+-----+
|id |scores                                                             |subject         |score|
+---+-------------------------------------------------------------------+----------------+-----+
|1  |[{math, 90}, {science, 85}]                                        |math            |90   |
|1  |[{math, 90}, {science, 85}]                                        |science         |85   |
|2  |[{history, 78}, {art, 92}]                                         |history         |78   |
|2  |[{history, 78}, {art, 92}]                                         |art             |92   |
|5  |[{programming, 95}, {databases, 88}]                               |programming     |95   |
|5  |[{programming, 95}, {databases, 88}]                               |databases       |88   |
|6  |[{physics, 87}, {chemistry, 91}, {biology, 84}]                    |physics         |87   |
|6  |[{physics, 87}, {chemistr

In [None]:
# inline : expands an array of structs into one row per struct, one column per field
# pyspark : select + expr, with the column names given inside the SQL expression

from pyspark.sql.functions import col,inline, expr

inline_scores = expr('inline(scores) as (subject,score)')
dataframe.select('id', 'scores', inline_scores).show(truncate=False)

+---+-------------------------------------------------------------------+----------------+-----+
|id |scores                                                             |subject         |score|
+---+-------------------------------------------------------------------+----------------+-----+
|1  |[{math, 90}, {science, 85}]                                        |math            |90   |
|1  |[{math, 90}, {science, 85}]                                        |science         |85   |
|2  |[{history, 78}, {art, 92}]                                         |history         |78   |
|2  |[{history, 78}, {art, 92}]                                         |art             |92   |
|5  |[{programming, 95}, {databases, 88}]                               |programming     |95   |
|5  |[{programming, 95}, {databases, 88}]                               |databases       |88   |
|6  |[{physics, 87}, {chemistry, 91}, {biology, 84}]                    |physics         |87   |
|6  |[{physics, 87}, {chemistr

In [None]:
# inline : expands an array of structs into one row per struct, one column per field
# pyspark : selectExpr

from pyspark.sql.functions import col,inline, expr

projection = ['id', 'scores', 'inline(scores) as (subject,score)']
dataframe.selectExpr(*projection).show(truncate=False)

+---+-------------------------------------------------------------------+----------------+-----+
|id |scores                                                             |subject         |score|
+---+-------------------------------------------------------------------+----------------+-----+
|1  |[{math, 90}, {science, 85}]                                        |math            |90   |
|1  |[{math, 90}, {science, 85}]                                        |science         |85   |
|2  |[{history, 78}, {art, 92}]                                         |history         |78   |
|2  |[{history, 78}, {art, 92}]                                         |art             |92   |
|5  |[{programming, 95}, {databases, 88}]                               |programming     |95   |
|5  |[{programming, 95}, {databases, 88}]                               |databases       |88   |
|6  |[{physics, 87}, {chemistry, 91}, {biology, 84}]                    |physics         |87   |
|6  |[{physics, 87}, {chemistr

In [None]:
# inline_outer : like inline, but rows whose array is empty or null are kept,
# with NULL in the generated columns
# pyspark : selectExpr

from pyspark.sql.functions import col,inline, expr

projection = ['id', 'scores', 'inline_outer(scores) as (subject,score)']
dataframe.selectExpr(*projection).show(truncate=False)

+---+-------------------------------------------------------------------+----------------+-----+
|id |scores                                                             |subject         |score|
+---+-------------------------------------------------------------------+----------------+-----+
|1  |[{math, 90}, {science, 85}]                                        |math            |90   |
|1  |[{math, 90}, {science, 85}]                                        |science         |85   |
|2  |[{history, 78}, {art, 92}]                                         |history         |78   |
|2  |[{history, 78}, {art, 92}]                                         |art             |92   |
|3  |[]                                                                 |NULL            |NULL |
|4  |NULL                                                               |NULL            |NULL |
|5  |[{programming, 95}, {databases, 88}]                               |programming     |95   |
|5  |[{programming, 95}, {data

##### Spark `inline()` vs `inline_outer()`

| Feature | `inline()` | `inline_outer()` |
|---------|------------|------------------|
| **Empty Arrays** | Drops rows | Keeps rows (null values) |
| **Null Arrays** | Drops rows | Keeps rows (null values) |
| **Output Size** | Fewer rows (filtered) | More rows (all preserved) |
| **Use Case** | Filter out empty data | Keep all original records |

In [None]:
# stack : melt / unpivot the data, i.e. convert a wide table into a long table
# This cell builds the wide input: one row per id with columns Q1, Q2 and Q3.

data = [
    [1, 100, 200, 300],
    [2, 400, 500, 600],
    [3, 700, 800, 900],
]
schema = ['id', 'Q1', 'Q2', 'Q3']

wide_dataframe = spark.createDataFrame(data, schema)
wide_dataframe.createOrReplaceTempView('wide_dataframe_view')
wide_dataframe.printSchema()
wide_dataframe.show(truncate=False)

root
 |-- id: long (nullable = true)
 |-- Q1: long (nullable = true)
 |-- Q2: long (nullable = true)
 |-- Q3: long (nullable = true)

+---+---+---+---+
|id |Q1 |Q2 |Q3 |
+---+---+---+---+
|1  |100|200|300|
|2  |400|500|600|
|3  |700|800|900|
+---+---+---+---+



In [None]:
# stack
# sql : stack(3, ...) turns the three (label, value) pairs into three rows per id

sql = """
select id,
stack(3,
      "Q1",Q1,
      "Q2",Q2,
      "Q3",Q3) as (quarter,sales)
from wide_dataframe_view
"""

long_sales = spark.sql(sql)
long_sales.show(truncate=False)

+---+-------+-----+
|id |quarter|sales|
+---+-------+-----+
|1  |Q1     |100  |
|1  |Q2     |200  |
|1  |Q3     |300  |
|2  |Q1     |400  |
|2  |Q2     |500  |
|2  |Q3     |600  |
|3  |Q1     |700  |
|3  |Q2     |800  |
|3  |Q3     |900  |
+---+-------+-----+



In [None]:
# stack
# pyspark : selectExpr, with the same stack(...) expression as the SQL version

stack_quarters = '''stack(3,
                                     "Q1",Q1,
                                     "Q2",Q2,
                                     "Q3",Q3) as (quarter,sales)'''
wide_dataframe.selectExpr('id', stack_quarters).show(truncate=False)

+---+-------+-----+
|id |quarter|sales|
+---+-------+-----+
|1  |Q1     |100  |
|1  |Q2     |200  |
|1  |Q3     |300  |
|2  |Q1     |400  |
|2  |Q2     |500  |
|2  |Q3     |600  |
|3  |Q1     |700  |
|3  |Q2     |800  |
|3  |Q3     |900  |
+---+-------+-----+



In [None]:
# stack
# pyspark : function + alias
## Note : ** It does not work here, Select + expr or SelectExpr are best in this case