<a href="https://colab.research.google.com/github/rahulrajpr/prepare-anytime/blob/main/spark/functions/7_spark_sql_array_functions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Spark Array Functions
https://spark.apache.org/docs/latest/sql-ref-functions-builtin.html#array-functions

> An **Array** in **Apache Spark** is a **complex column type** that allows you to represent **a collection of multiple values** within a **single DataFrame column**.  
> It enables storing **ordered elements of the same data type** inside one column, providing a **flexible and efficient way** to manage and query **multi-valued or semi-structured data** such as **lists, tags, or JSON arrays**.


In [None]:
from pyspark.sql import SparkSession
# Get the active SparkSession or create one named 'spark-functions';
# every example cell below runs its SQL through this `spark` object.
spark = SparkSession.builder.appName('spark-functions').getOrCreate()

In [None]:
# array : builds an array column from the listed elements

sql = '''select array(1,2,3,4,5) as arrayOut'''
spark.sql(sql).show(truncate = False)

+---------------+
|arrayOut       |
+---------------+
|[1, 2, 3, 4, 5]|
+---------------+



In [None]:
# array_size : returns the number of elements in the array

sql = '''
with cte as
(
select
  array(1,2,3) as arrayOut
)
select
  arrayOut,
  array_size(arrayOut) array_sizeOut
from cte
'''
spark.sql(sql).show(truncate = False)

+---------+-------------+
|arrayOut |array_sizeOut|
+---------+-------------+
|[1, 2, 3]|3            |
+---------+-------------+



In [None]:
# array_append : returns the array with the given element added at the end

sql = '''
with cte as
(
select
  array(1,2,3) as arrayOut
)
select
  arrayOut,
  array_append(arrayOut,4) appendedArrayOutput
from cte
'''
spark.sql(sql).show(truncate = False)

+---------+-------------------+
|arrayOut |appendedArrayOutput|
+---------+-------------------+
|[1, 2, 3]|[1, 2, 3, 4]       |
+---------+-------------------+



In [None]:
# array_prepend : returns the array with the given element added at the front

sql = '''
with cte as
(
select
  array(1,2,3) as arrayOut
)
select
  arrayOut,
  array_prepend(arrayOut,4) array_prependOut
from cte
'''
spark.sql(sql).show(truncate = False)

+---------+----------------+
|arrayOut |array_prependOut|
+---------+----------------+
|[1, 2, 3]|[4, 1, 2, 3]    |
+---------+----------------+



In [None]:
# array_remove

sql = '''
with cte as
(
select
  array(1,2,3,4,5,5,4) as arrayOut
)
select
  arrayOut,
  array_remove(arrayOut,4) array_prependOut
from cte
'''
spark.sql(sql).show(truncate = False)

+---------------------+----------------+
|arrayOut             |array_prependOut|
+---------------------+----------------+
|[1, 2, 3, 4, 5, 5, 4]|[1, 2, 3, 5, 5] |
+---------------------+----------------+



In [None]:
# array_compact : removes the NULL elements from an array,
# keeping the remaining elements in their original order

sql = '''
with cte as
(
select
  array(NULL,1,2,NULL,3) as arrayOut
)
select
  arrayOut,
  array_compact(arrayOut) compactArrayOutput
from cte
'''
spark.sql(sql).show(truncate = False)

+---------------------+------------------+
|arrayOut             |compactArrayOutput|
+---------------------+------------------+
|[NULL, 1, 2, NULL, 3]|[1, 2, 3]         |
+---------------------+------------------+



In [None]:
# array_contains : checks whether an element is present in an array
# (returns true here because 2 is one of the elements)

sql = '''
with cte as
(
select
  array(NULL,1,2,NULL,3) as arrayOut
)
select
  arrayOut,
  array_contains(arrayOut,2) arrayContainsOut
from cte
'''
spark.sql(sql).show(truncate = False)

+---------------------+----------------+
|arrayOut             |arrayContainsOut|
+---------------------+----------------+
|[NULL, 1, 2, NULL, 3]|true            |
+---------------------+----------------+



In [None]:
# array_distinct : returns the array with duplicate values removed,
# keeping the first occurrence of each value

sql = '''
with cte as
(
select
  array(1,1,2,2,3,4) as arrayOut
)
select
  arrayOut,
  array_distinct(arrayOut) arrayDistinctOut
from cte
'''
spark.sql(sql).show(truncate = False)

+------------------+----------------+
|arrayOut          |arrayDistinctOut|
+------------------+----------------+
|[1, 1, 2, 2, 3, 4]|[1, 2, 3, 4]    |
+------------------+----------------+



In [None]:
# array_union : returns the union of the two arrays
# NOTE(review): the Spark docs describe this as a union *without duplicates*;
# the inputs here share no values, so this example does not demonstrate that.

sql = '''
with cte as
(
select
    array(1,2,3,4) as array1,
    array(10,20) as array2
)
select
    array1,
    array2,
    array_union(array1,array2) as array_unionOut
from cte
'''
spark.sql(sql).show(truncate = False)

+------------+--------+--------------------+
|array1      |array2  |array_unionOut      |
+------------+--------+--------------------+
|[1, 2, 3, 4]|[10, 20]|[1, 2, 3, 4, 10, 20]|
+------------+--------+--------------------+



In [None]:
# array_intersect : returns the elements that are present in both arrays

sql = '''
with cte as
(
select
    array(1,2,3,4) as array1,
    array(1,2) as array2
)
select
    array1,
    array2,
    array_intersect(array1,array2) as array_intersectOut
from cte
'''
spark.sql(sql).show(truncate = False)

+------------+------+------------------+
|array1      |array2|array_intersectOut|
+------------+------+------------------+
|[1, 2, 3, 4]|[1, 2]|[1, 2]            |
+------------+------+------------------+



In [None]:
# arrays_overlap : returns true when the two arrays share a common element
# (here 1 and 2 appear in both)

sql = '''
with cte as
(
select
    array(1,2,3,4) as array1,
    array(1,2) as array2
)
select
    array1,
    array2,
    arrays_overlap(array1,array2) as array_overlapOut
from cte
'''
spark.sql(sql).show(truncate = False)

+------------+------+----------------+
|array1      |array2|array_overlapOut|
+------------+------+----------------+
|[1, 2, 3, 4]|[1, 2]|true            |
+------------+------+----------------+



In [None]:
# array_except : returns the elements of array1 that are not in array2
# (like the EXCEPT set operation)

sql = '''
with cte as
(
select
    array(1,2,3,4) as array1,
    array(1,2) as array2
)
select
    array1,
    array2,
    array_except(array1,array2) as arrayExceptOut
from cte
'''
spark.sql(sql).show(truncate = False)

+------------+------+--------------+
|array1      |array2|arrayExceptOut|
+------------+------+--------------+
|[1, 2, 3, 4]|[1, 2]|[3, 4]        |
+------------+------+--------------+



In [None]:
# array_insert : inserts a value at the given position of the array
# The position is 1-based: inserting 100 at position 2 makes it the second element.

sql = '''
with cte as
(
select
  array(1,1,2,2,3,4) as arrayOut
)
select
  arrayOut,
  array_insert(arrayOut,2,100) array_insertOut
from cte
'''
spark.sql(sql).show(truncate = False)

+------------------+-----------------------+
|arrayOut          |array_insertOut        |
+------------------+-----------------------+
|[1, 1, 2, 2, 3, 4]|[1, 100, 1, 2, 2, 3, 4]|
+------------------+-----------------------+



In [None]:
# array_join : concatenates the array elements into one string using a delimiter

# 1) no NULLs in the array: every element is joined with '-'
sql = '''
with cte as
(
select
  array(1,1,2,2,3,4) as arrayOut
)
select
  arrayOut,
  array_join(arrayOut,'-') as array_joinOut
from cte
'''
spark.sql(sql).show(truncate = False)

# 2) NULL elements and no replacement argument: the NULLs are skipped
sql = '''
with cte as
(
select
  array(1,NULL,2,NULL,3,4) as arrayOut
)
select
  arrayOut,
  array_join(arrayOut,'-') as array_joinOut
from cte
'''
spark.sql(sql).show(truncate = False)

# 3) NULL elements with a third argument: each NULL is replaced by 'missing'
sql = '''
with cte as
(
select
  array(1,NULL,2,NULL,3,4) as arrayOut
)
select
  arrayOut,
  array_join(arrayOut,'-','missing') as array_joinOut
from cte
'''
spark.sql(sql).show(truncate = False)

+------------------+-------------+
|arrayOut          |array_joinOut|
+------------------+-------------+
|[1, 1, 2, 2, 3, 4]|1-1-2-2-3-4  |
+------------------+-------------+

+------------------------+-------------+
|arrayOut                |array_joinOut|
+------------------------+-------------+
|[1, NULL, 2, NULL, 3, 4]|1-2-3-4      |
+------------------------+-------------+

+------------------------+-----------------------+
|arrayOut                |array_joinOut          |
+------------------------+-----------------------+
|[1, NULL, 2, NULL, 3, 4]|1-missing-2-missing-3-4|
+------------------------+-----------------------+



In [None]:
# array_max : returns the largest element of the array

sql = '''
with cte as
(
select
  array(1,1,2,2,3,4) as arrayOut
)
select
  arrayOut,
  array_max(arrayOut) as array_maxOut
from cte
'''
spark.sql(sql).show(truncate = False)

+------------------+------------+
|arrayOut          |array_maxOut|
+------------------+------------+
|[1, 1, 2, 2, 3, 4]|4           |
+------------------+------------+



In [None]:
# array_min : returns the smallest element of the array

sql = '''
with cte as
(
select
  array(1,1,2,2,3,4) as arrayOut
)
select
  arrayOut,
  array_min(arrayOut) as array_minOut
from cte
'''
spark.sql(sql).show(truncate = False)

+------------------+------------+
|arrayOut          |array_minOut|
+------------------+------------+
|[1, 1, 2, 2, 3, 4]|1           |
+------------------+------------+



In [None]:
# extracting an item from an array using the [] subscript

## if the index is outside the array, the recorded run returns NULL
## NOTE(review): per the Spark docs the [] subscript is 0-based; this input
## cannot show it because the elements at index 0 and index 1 are both 1.
## NOTE(review): with ANSI mode enabled Spark is documented to raise an error
## for an out-of-range index instead of returning NULL - confirm for your version.

# in-range index
sql = '''
with cte as
(
select
  array(1,1,2,2,3,4) as arrayOut
)
select
  arrayOut,
  arrayOut[1] as arrayItem
from cte
'''
spark.sql(sql).show(truncate = False)

# out-of-range index (the array has only 6 elements)
sql = '''
with cte as
(
select
  array(1,1,2,2,3,4) as arrayOut
)
select
  arrayOut,
  arrayOut[10] as arrayItem
from cte
'''
spark.sql(sql).show(truncate = False)

+------------------+---------+
|arrayOut          |arrayItem|
+------------------+---------+
|[1, 1, 2, 2, 3, 4]|1        |
+------------------+---------+

+------------------+---------+
|arrayOut          |arrayItem|
+------------------+---------+
|[1, 1, 2, 2, 3, 4]|NULL     |
+------------------+---------+



In [None]:
# get : returns the array element at the given index
# if the index is outside the array, it returns NULL
# NOTE(review): per the Spark docs the index is 0-based; this input cannot
# show it because the elements at index 0 and index 1 are both 1.

# in-range index
sql = '''
with cte as
(
select
  array(1,1,2,2,3,4) as arrayOut
)
select
  arrayOut,
  get(arrayOut,1) as arrayItem
from cte
'''
spark.sql(sql).show(truncate = False)

# out-of-range index (the array has only 6 elements)
sql = '''
with cte as
(
select
  array(1,1,2,2,3,4) as arrayOut
)
select
  arrayOut,
  get(arrayOut,10) as arrayItem
from cte
'''
spark.sql(sql).show(truncate = False)

+------------------+---------+
|arrayOut          |arrayItem|
+------------------+---------+
|[1, 1, 2, 2, 3, 4]|1        |
+------------------+---------+

+------------------+---------+
|arrayOut          |arrayItem|
+------------------+---------+
|[1, 1, 2, 2, 3, 4]|NULL     |
+------------------+---------+



In [None]:
# array_position : returns the 1-based position of the first occurrence of an element

# if the element is not found in the array, it returns 0 (NOT NULL)

# element present: 1 first occurs at position 1
sql = '''
with cte as
(
select
  array(1,1,2,2,3,4) as arrayOut
)
select
  arrayOut,
  array_position(arrayOut,1) as arrayItem
from cte
'''
spark.sql(sql).show(truncate = False)

# element absent: 10 is not in the array, so the result is 0
sql = '''
with cte as
(
select
  array(1,1,2,2,3,4) as arrayOut
)
select
  arrayOut,
  array_position(arrayOut,10) as arrayItem
from cte
'''
spark.sql(sql).show(truncate = False)

+------------------+---------+
|arrayOut          |arrayItem|
+------------------+---------+
|[1, 1, 2, 2, 3, 4]|1        |
+------------------+---------+

+------------------+---------+
|arrayOut          |arrayItem|
+------------------+---------+
|[1, 1, 2, 2, 3, 4]|0        |
+------------------+---------+



In [None]:
# array_repeat : returns an array containing the given element repeated N times
# The element passed here is itself an array, so the result is an array of arrays.

sql = '''
with cte as
(
select
  array(1,2,3) as arrayOut
)
select
  arrayOut,
  array_repeat(arrayOut,3) as array_repeatOut
from cte
'''
spark.sql(sql).show(truncate = False)

+---------+---------------------------------+
|arrayOut |array_repeatOut                  |
+---------+---------------------------------+
|[1, 2, 3]|[[1, 2, 3], [1, 2, 3], [1, 2, 3]]|
+---------+---------------------------------+



In [None]:
# arrays_zip : merges the arrays into one array of structs,
# where the N-th struct holds the N-th element of each input array

sql = '''
with cte as
(
select
  array(1,2,3) as array1,
  array(1,2,3) as array2
)
select
  array1,
  array2,
  arrays_zip(array1,array2) as array_zipOut
from cte
'''
spark.sql(sql).show(truncate = False)

+---------+---------+------------------------+
|array1   |array2   |array_zipOut            |
+---------+---------+------------------------+
|[1, 2, 3]|[1, 2, 3]|[{1, 1}, {2, 2}, {3, 3}]|
+---------+---------+------------------------+



In [None]:
# flatten : turns an array of arrays into a single array of their elements

sql = '''
with cte as
(
  select array(array(1),array(2),array(4,5)) as array1
)
select array1,
flatten(array1) as arrayflattenOut
from cte'''

spark.sql(sql).show(truncate = False)

+------------------+---------------+
|array1            |arrayflattenOut|
+------------------+---------------+
|[[1], [2], [4, 5]]|[1, 2, 4, 5]   |
+------------------+---------------+



In [None]:
# sequence : generates an array from start to stop, advancing by step

# ascending with step 2: 12 is not reached exactly, so the array ends at 11
sql = '''
with cte as
(
select sequence(1,12,2) as generatedArray
)
select generatedArray, TypeOf(generatedArray) as typeOut
from cte
'''
spark.sql(sql).show(truncate = False)

## descending with a negative step: the stop value 0 is included

sql = '''
with cte as
(
select sequence(10,0,-1) as generatedArray
)
select generatedArray, TypeOf(generatedArray) as typeOut
from cte
'''
spark.sql(sql).show(truncate = False)

+-------------------+----------+
|generatedArray     |typeOut   |
+-------------------+----------+
|[1, 3, 5, 7, 9, 11]|array<int>|
+-------------------+----------+

+----------------------------------+----------+
|generatedArray                    |typeOut   |
+----------------------------------+----------+
|[10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]|array<int>|
+----------------------------------+----------+



In [None]:
# shuffle : creates a random permutation of the given array
# (the order of the result is expected to differ between runs)

sql = '''
with cte as
(
select sequence(10,0,-1) as generatedArray
)
select generatedArray, shuffle(generatedArray) as shuffleOut
from cte
'''
spark.sql(sql).show(truncate = False)

+----------------------------------+----------------------------------+
|generatedArray                    |shuffleOut                        |
+----------------------------------+----------------------------------+
|[10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]|[3, 0, 4, 1, 2, 8, 6, 5, 10, 7, 9]|
+----------------------------------+----------------------------------+



In [None]:
# slice : takes a sub-array given a start position and a length
# The start is 1-based: start 3, length 5 yields the 3rd through 7th elements.

sql = '''
with cte as
(
select sequence(10,0,-1) as generatedArray
)
select generatedArray, slice(generatedArray,3,5) as sliceOut
from cte
'''
spark.sql(sql).show(truncate = False)

+----------------------------------+---------------+
|generatedArray                    |sliceOut       |
+----------------------------------+---------------+
|[10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]|[8, 7, 6, 5, 4]|
+----------------------------------+---------------+



In [None]:
# sort_array : sorts an array; ascending by default,
# descending when the second argument is False

sql = '''
with cte as
(
select shuffle(sequence(10,0,-1)) as generatedArray
)
select generatedArray,
sort_array(generatedArray) as sort_arrayOutASC,
sort_array(generatedArray,False) as sort_arrayOutDESC
from cte
'''
spark.sql(sql).show(truncate = False)

+----------------------------------+----------------------------------+----------------------------------+
|generatedArray                    |sort_arrayOutASC                  |sort_arrayOutDESC                 |
+----------------------------------+----------------------------------+----------------------------------+
|[5, 6, 4, 9, 0, 8, 2, 10, 3, 1, 7]|[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]|[10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]|
+----------------------------------+----------------------------------+----------------------------------+

