In [1]:
from pyspark.sql import SparkSession

In [2]:
# Without an explicit bind address the driver can fail with
# "Service 'sparkDriver' could not bind on a random free port".
# Setting spark.driver.bindAddress to 127.0.0.1 works around that error.

spark = (
    SparkSession.builder
    .appName("array-map-funcs")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/14 12:35:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# One tuple per person:
# (name, lang_at_school, lang_at_work, current_state, prev_state)
data = [
    ("James,,Smith", ["Java", "Scala", "C++"], ["Spark", "Java"], "OH", "CA"),
    ("Michael,Rose,", ["Spark", "Java", "C++"], ["Spark", "Java"], "NY", "NJ"),
    ("Robert,,Williams", ["CSharp", "VB"], ["Spark", "Python"], "UT", "NV"),
]

In [4]:
from pyspark.sql.types import StructField, StructType, StringType, ArrayType



In [5]:
# Explicit schema for `data`: three nullable string columns and two nullable
# array columns whose elements are non-null strings.
schema = StructType(
    [
        StructField("name", StringType(), nullable=True),
        StructField("lang_at_school", ArrayType(StringType(), containsNull=False), nullable=True),
        StructField("lang_at_work", ArrayType(StringType(), containsNull=False), nullable=True),
        StructField("current_state", StringType(), nullable=True),
        StructField("prev_state", StringType(), nullable=True),
    ]
)

In [6]:
# Build the DataFrame and have Spark check every row against `schema`.
df = spark.createDataFrame(data=data, schema=schema, verifySchema=True)

In [7]:
# Print the DataFrame's rows as a text table.
df.show()

                                                                                

+----------------+------------------+---------------+-------------+----------+
|            name|    lang_at_school|   lang_at_work|current_state|prev_state|
+----------------+------------------+---------------+-------------+----------+
|    James,,Smith|[Java, Scala, C++]|  [Spark, Java]|           OH|        CA|
|   Michael,Rose,|[Spark, Java, C++]|  [Spark, Java]|           NY|        NJ|
|Robert,,Williams|      [CSharp, VB]|[Spark, Python]|           UT|        NV|
+----------------+------------------+---------------+-------------+----------+



In [8]:
from pyspark.sql import functions as f

In [9]:
# explode()
# Produces one output row per element of the array column. No alias is given,
# so the generated column gets the default name "col".

(
    df
    .select(df["name"], f.explode(df["lang_at_school"]))
    .show()
)

+----------------+------+
|            name|   col|
+----------------+------+
|    James,,Smith|  Java|
|    James,,Smith| Scala|
|    James,,Smith|   C++|
|   Michael,Rose,| Spark|
|   Michael,Rose,|  Java|
|   Michael,Rose,|   C++|
|Robert,,Williams|CSharp|
|Robert,,Williams|    VB|
+----------------+------+



In [10]:
# split()
# Splits a string column on a delimiter and returns an array column.
# Here `name` is split on commas; empty parts are kept as empty strings.

(
    df
    .select(
        df["name"].alias("actual_name_as_str"),
        f.split(df["name"], ",").alias("array_of_names"),
    )
    .show()
)

+------------------+--------------------+
|actual_name_as_str|      array_of_names|
+------------------+--------------------+
|      James,,Smith|    [James, , Smith]|
|     Michael,Rose,|   [Michael, Rose, ]|
|  Robert,,Williams|[Robert, , Williams]|
+------------------+--------------------+



In [11]:
# array()
# Merges several columns into a single array column. All input columns must
# have the same data type.
# Here current_state and prev_state are combined into a new column states_lived.

(
    df
    .select(
        df["name"],
        f.array(df["current_state"], df["prev_state"]).alias("states_lived"),
    )
    .show()
)

+----------------+------------+
|            name|states_lived|
+----------------+------------+
|    James,,Smith|    [OH, CA]|
|   Michael,Rose,|    [NY, NJ]|
|Robert,,Williams|    [UT, NV]|
+----------------+------------+



In [12]:
# array_contains()
# Checks whether an array column contains a value: null if the array is null,
# true if the value is present, false otherwise.

(
    df
    .select(f.array_contains(df["lang_at_school"], "Java"))
    .show()
)

+------------------------------------+
|array_contains(lang_at_school, Java)|
+------------------------------------+
|                                true|
|                                true|
|                               false|
+------------------------------------+



In [13]:
# Add all_langs: the school languages followed by the work languages
# (duplicates are kept).
df = df.withColumn(
    "all_langs",
    f.concat(df["lang_at_school"], df["lang_at_work"]),
)

In [14]:
# Print the DataFrame, now including the all_langs column (long cells are truncated).
df.show()

+----------------+------------------+---------------+-------------+----------+--------------------+
|            name|    lang_at_school|   lang_at_work|current_state|prev_state|           all_langs|
+----------------+------------------+---------------+-------------+----------+--------------------+
|    James,,Smith|[Java, Scala, C++]|  [Spark, Java]|           OH|        CA|[Java, Scala, C++...|
|   Michael,Rose,|[Spark, Java, C++]|  [Spark, Java]|           NY|        NJ|[Spark, Java, C++...|
|Robert,,Williams|      [CSharp, VB]|[Spark, Python]|           UT|        NV|[CSharp, VB, Spar...|
+----------------+------------------+---------------+-------------+----------+--------------------+



In [15]:
# array_distinct(): drops repeated values inside each array.

(
    df
    .select(
        df["all_langs"],
        f.array_distinct(df["all_langs"]).alias("distinct_langs"),
    )
    .show(truncate=False)
)

+-------------------------------+---------------------------+
|all_langs                      |distinct_langs             |
+-------------------------------+---------------------------+
|[Java, Scala, C++, Spark, Java]|[Java, Scala, C++, Spark]  |
|[Spark, Java, C++, Spark, Java]|[Spark, Java, C++]         |
|[CSharp, VB, Spark, Python]    |[CSharp, VB, Spark, Python]|
+-------------------------------+---------------------------+



In [16]:
# array_intersect(): elements present in both arrays, without duplicates.

(
    df
    .select(
        df["name"],
        f.array_intersect(df["lang_at_school"], df["lang_at_work"]).alias("langs_at_school_and_work"),
    )
    .show(truncate=False)
)

+----------------+------------------------+
|name            |langs_at_school_and_work|
+----------------+------------------------+
|James,,Smith    |[Java]                  |
|Michael,Rose,   |[Spark, Java]           |
|Robert,,Williams|[]                      |
+----------------+------------------------+



In [17]:
# array_union(): elements present in either array, without duplicates.

(
    df
    .select(
        df["name"],
        f.array_union(df["lang_at_school"], df["lang_at_work"]).alias("all_langs_unique"),
    )
    .show(truncate=False)
)

+----------------+---------------------------+
|name            |all_langs_unique           |
+----------------+---------------------------+
|James,,Smith    |[Java, Scala, C++, Spark]  |
|Michael,Rose,   |[Spark, Java, C++]         |
|Robert,,Williams|[CSharp, VB, Spark, Python]|
+----------------+---------------------------+



In [18]:
# array_except(): elements of the first array that are absent from the second,
# without duplicates.

(
    df
    .select(
        df["name"],
        f.array_except(df["lang_at_school"], df["lang_at_work"]).alias("langs_at_school_and_not_in_work"),
    )
    .show(truncate=False)
)

+----------------+-------------------------------+
|name            |langs_at_school_and_not_in_work|
+----------------+-------------------------------+
|James,,Smith    |[Scala, C++]                   |
|Michael,Rose,   |[C++]                          |
|Robert,,Williams|[CSharp, VB]                   |
+----------------+-------------------------------+



In [19]:
# array_join(): joins the elements of an array into one string, separated by
# the given delimiter.

(
    df
    .select(
        df["name"],
        f.array_join(df["lang_at_school"], "--").alias("lang_at_school"),
    )
    .show(truncate=False)
)

+----------------+----------------+
|name            |lang_at_school  |
+----------------+----------------+
|James,,Smith    |Java--Scala--C++|
|Michael,Rose,   |Spark--Java--C++|
|Robert,,Williams|CSharp--VB      |
+----------------+----------------+



In [20]:
# array_max(): largest value in the array; null elements are skipped.

(
    df
    .select(
        df["name"],
        df["lang_at_school"],
        f.array_max(df["lang_at_school"]).alias("lang_at_school_array_max"),
    )
    .show(truncate=False)
)

+----------------+------------------+------------------------+
|name            |lang_at_school    |lang_at_school_array_max|
+----------------+------------------+------------------------+
|James,,Smith    |[Java, Scala, C++]|Scala                   |
|Michael,Rose,   |[Spark, Java, C++]|Spark                   |
|Robert,,Williams|[CSharp, VB]      |VB                      |
+----------------+------------------+------------------------+



In [21]:
# array_min(): smallest value in the array; null elements are skipped.

(
    df
    .select(
        df["name"],
        df["lang_at_school"],
        f.array_min(df["lang_at_school"]).alias("lang_at_school_array_min"),
    )
    .show(truncate=False)
)

+----------------+------------------+------------------------+
|name            |lang_at_school    |lang_at_school_array_min|
+----------------+------------------+------------------------+
|James,,Smith    |[Java, Scala, C++]|C++                     |
|Michael,Rose,   |[Spark, Java, C++]|C++                     |
|Robert,,Williams|[CSharp, VB]      |CSharp                  |
+----------------+------------------+------------------------+



In [22]:
# Returns the (1-based) position of the first occurrence of the given value in the array as a Long; returns 0 if the value is not found

In [23]:
# Look up where "Java" sits in each lang_at_school array.
(
    df
    .select(
        df["lang_at_school"],
        f.array_position(df["lang_at_school"], "Java"),
    )
    .show()
)

+------------------+------------------------------------+
|    lang_at_school|array_position(lang_at_school, Java)|
+------------------+------------------------------------+
|[Java, Scala, C++]|                                   1|
|[Spark, Java, C++]|                                   2|
|      [CSharp, VB]|                                   0|
+------------------+------------------------------------+



In [24]:
# array_remove(): drops every element equal to the given value from the array.

(
    df
    .select(
        df["all_langs"],
        f.array_remove(df["all_langs"], "Java"),
    )
    .show(truncate=False)
)

+-------------------------------+-----------------------------+
|all_langs                      |array_remove(all_langs, Java)|
+-------------------------------+-----------------------------+
|[Java, Scala, C++, Spark, Java]|[Scala, C++, Spark]          |
|[Spark, Java, C++, Spark, Java]|[Spark, C++, Spark]          |
|[CSharp, VB, Spark, Python]    |[CSharp, VB, Spark, Python]  |
+-------------------------------+-----------------------------+



In [25]:
# arrays_overlap(): returns true if array1 contains at least one non-null
# element that is also present in array2.

(
    df
    .select(
        df["lang_at_school"],
        df["lang_at_work"],
        f.arrays_overlap(df["lang_at_school"], df["lang_at_work"]).alias("lang_overlap"),
    )
    .show()
)

+------------------+---------------+------------+
|    lang_at_school|   lang_at_work|lang_overlap|
+------------------+---------------+------------+
|[Java, Scala, C++]|  [Spark, Java]|        true|
|[Spark, Java, C++]|  [Spark, Java]|        true|
|      [CSharp, VB]|[Spark, Python]|       false|
+------------------+---------------+------------+



In [26]:
# array_sort(): sorts the array in ascending order; null elements go to the end.

(
    df
    .select(
        df["lang_at_school"],
        f.array_sort(df["lang_at_school"]).alias("sorted_langs"),
    )
    .show()
)

+------------------+------------------+
|    lang_at_school|      sorted_langs|
+------------------+------------------+
|[Java, Scala, C++]|[C++, Java, Scala]|
|[Spark, Java, C++]|[C++, Java, Spark]|
|      [CSharp, VB]|      [CSharp, VB]|
+------------------+------------------+



In [27]:
# concat(): concatenates strings, binaries, arrays, etc. Here it appends the
# work languages to the school languages.

(
    df
    .select(
        df["lang_at_school"],
        df["lang_at_work"],
        f.concat(df["lang_at_school"], df["lang_at_work"]),
    )
    .show(truncate=False)
)

+------------------+---------------+------------------------------------+
|lang_at_school    |lang_at_work   |concat(lang_at_school, lang_at_work)|
+------------------+---------------+------------------------------------+
|[Java, Scala, C++]|[Spark, Java]  |[Java, Scala, C++, Spark, Java]     |
|[Spark, Java, C++]|[Spark, Java]  |[Spark, Java, C++, Spark, Java]     |
|[CSharp, VB]      |[Spark, Python]|[CSharp, VB, Spark, Python]         |
+------------------+---------------+------------------------------------+



In [28]:
# Build an array-of-arrays column from the two language arrays; it serves as
# the input for the flatten() example.

df = df.withColumn(
    "array_of_langs_array",
    f.array(df["lang_at_school"], df["lang_at_work"]),
)
df.select(df["array_of_langs_array"]).show(truncate=False)

+-----------------------------------+
|array_of_langs_array               |
+-----------------------------------+
|[[Java, Scala, C++], [Spark, Java]]|
|[[Spark, Java, C++], [Spark, Java]]|
|[[CSharp, VB], [Spark, Python]]    |
+-----------------------------------+



In [29]:
# Print the column tree, including the nested element types of the array columns.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- lang_at_school: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- lang_at_work: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- current_state: string (nullable = true)
 |-- prev_state: string (nullable = true)
 |-- all_langs: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- array_of_langs_array: array (nullable = false)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = false)



In [30]:
# flatten(): collapses an array of arrays into a single array.

(
    df
    .select(
        df["array_of_langs_array"],
        f.flatten(df["array_of_langs_array"]),
    )
    .show(truncate=False)
)

+-----------------------------------+-------------------------------+
|array_of_langs_array               |flatten(array_of_langs_array)  |
+-----------------------------------+-------------------------------+
|[[Java, Scala, C++], [Spark, Java]]|[Java, Scala, C++, Spark, Java]|
|[[Spark, Java, C++], [Spark, Java]]|[Spark, Java, C++, Spark, Java]|
|[[CSharp, VB], [Spark, Python]]    |[CSharp, VB, Spark, Python]    |
+-----------------------------------+-------------------------------+



In [31]:
# array_repeat(): returns an array containing the given element repeated the
# given number of times.
#
# Passing the bare string throws an error:
#     df.withColumn("repeat_vals", f.array_repeat("spark", 3))
# Explanation: inside withColumn, the first argument of array_repeat must be
# a column, not a literal string. Wrapping the constant in f.lit() turns the
# string "spark" into a literal column.

(
    df
    .withColumn("repeat_vals", f.array_repeat(f.lit("spark"), 3))
    .select("repeat_vals")
    .show(truncate=False)
)

+---------------------+
|repeat_vals          |
+---------------------+
|[spark, spark, spark]|
|[spark, spark, spark]|
|[spark, spark, spark]|
+---------------------+



In [32]:
# reverse(): returns a reversed string, or an array with its elements in
# reverse order.

(
    df
    .select(df["all_langs"], f.reverse(df["all_langs"]))
    .show(truncate=False)
)

+-------------------------------+-------------------------------+
|all_langs                      |reverse(all_langs)             |
+-------------------------------+-------------------------------+
|[Java, Scala, C++, Spark, Java]|[Java, Spark, C++, Scala, Java]|
|[Spark, Java, C++, Spark, Java]|[Java, Spark, C++, Java, Spark]|
|[CSharp, VB, Spark, Python]    |[Python, Spark, VB, CSharp]    |
+-------------------------------+-------------------------------+



In [33]:
# sequence(): generates an array running from start to stop (inclusive) in
# increments of step. The same array is produced once per row of df.

(
    df
    .select(f.sequence(f.lit(1), f.lit(10), f.lit(2)))
    .show()
)

+------------------+
|sequence(1, 10, 2)|
+------------------+
|   [1, 3, 5, 7, 9]|
|   [1, 3, 5, 7, 9]|
|   [1, 3, 5, 7, 9]|
+------------------+



In [34]:
# Returns a random permutation of the given array

In [35]:
# The shuffled order is random, so the displayed output differs between runs.
(
    df
    .select(df["all_langs"], f.shuffle(df["all_langs"]))
    .show(truncate=False)
)

+-------------------------------+-------------------------------+
|all_langs                      |shuffle(all_langs)             |
+-------------------------------+-------------------------------+
|[Java, Scala, C++, Spark, Java]|[Java, Scala, Java, Spark, C++]|
|[Spark, Java, C++, Spark, Java]|[Spark, Spark, C++, Java, Java]|
|[CSharp, VB, Spark, Python]    |[VB, Spark, Python, CSharp]    |
+-------------------------------+-------------------------------+



In [36]:
# slice(): returns a sub-array that starts at the given index (counting from
# the end if the index is negative) and has the specified length.
# Signature:
# f.slice(
#     x: 'ColumnOrName',
#     start: Union[ForwardRef('ColumnOrName'), int],
#     length: Union[ForwardRef('ColumnOrName'), int],
# ) -> pyspark.sql.column.Column

# A length that runs past the end of the array returns the remaining elements.
(
    df
    .select(df["all_langs"], f.slice(df["all_langs"], 2, 5))
    .show(truncate=False)
)

# Two elements starting at position 2.
(
    df
    .select(df["all_langs"], f.slice(df["all_langs"], 2, 2))
    .show(truncate=False)
)

+-------------------------------+-------------------------+
|all_langs                      |slice(all_langs, 2, 5)   |
+-------------------------------+-------------------------+
|[Java, Scala, C++, Spark, Java]|[Scala, C++, Spark, Java]|
|[Spark, Java, C++, Spark, Java]|[Java, C++, Spark, Java] |
|[CSharp, VB, Spark, Python]    |[VB, Spark, Python]      |
+-------------------------------+-------------------------+

+-------------------------------+----------------------+
|all_langs                      |slice(all_langs, 2, 2)|
+-------------------------------+----------------------+
|[Java, Scala, C++, Spark, Java]|[Scala, C++]          |
|[Spark, Java, C++, Spark, Java]|[Java, C++]           |
|[CSharp, VB, Spark, Python]    |[VB, Spark]           |
+-------------------------------+----------------------+



In [37]:
# arrays_zip(): merges the input arrays into one array of structs, where the
# i-th struct holds the i-th value of every input array.

spark.sql(
    "SELECT arrays_zip(array(1, 2),array(2, 3), array(3, 4))"
).show(truncate=False)

+-------------------------------------------------+
|arrays_zip(array(1, 2), array(2, 3), array(3, 4))|
+-------------------------------------------------+
|[{1, 2, 3}, {2, 3, 4}]                           |
+-------------------------------------------------+



In [38]:
# Returns the element of the given array at the given (1-based) index

In [39]:
# Index -1 selects the last element of each array.
(
    df
    .select(
        df["lang_at_school"],
        f.element_at(df["lang_at_school"], -1),
    )
    .show()
)

+------------------+------------------------------+
|    lang_at_school|element_at(lang_at_school, -1)|
+------------------+------------------------------+
|[Java, Scala, C++]|                           C++|
|[Spark, Java, C++]|                           C++|
|      [CSharp, VB]|                            VB|
+------------------+------------------------------+



In [40]:
# cardinality(): an alias of size; returns the size of the given array or map.
# Applied after array_distinct, it counts the unique languages per row.

(
    df
    .select(
        df["all_langs"],
        f.cardinality(f.array_distinct(df["all_langs"])).alias("num_of_unique_langs"),
    )
    .show(truncate=False)
)

+-------------------------------+-------------------+
|all_langs                      |num_of_unique_langs|
+-------------------------------+-------------------+
|[Java, Scala, C++, Spark, Java]|4                  |
|[Spark, Java, C++, Spark, Java]|3                  |
|[CSharp, VB, Spark, Python]    |4                  |
+-------------------------------+-------------------+



In [41]:
# map_from_arrays(array<K>, array<V>): map<K, V>
# Creates a map from the given pair of key/value arrays; elements in keys
# should not be null.

# Each row: [keys array, values array]
data2 = [
    [["name", "state"], ["peter", "NY"]],
    [["name", "job"], ["peter", "fireman"]],
]

schema2 = StructType(
    [
        StructField("keys", ArrayType(StringType(), containsNull=True)),
        StructField("values", ArrayType(StringType(), containsNull=True)),
    ]
)

In [42]:
# Build the keys/values DataFrame and have Spark check rows against `schema2`.
df2 = spark.createDataFrame(data=data2, schema=schema2, verifySchema=True)

In [43]:
# Print the keys and values array columns as a text table.
df2.show()

+-------------+----------------+
|         keys|          values|
+-------------+----------------+
|[name, state]|     [peter, NY]|
|  [name, job]|[peter, fireman]|
+-------------+----------------+



In [44]:
# Pair each entry of `keys` with the entry of `values` at the same position.
df2 = df2.withColumn(
    "mapped_data",
    f.map_from_arrays(df2["keys"], df2["values"]),
)

In [45]:
# Print df2 with the new mapped_data column, without truncating cell contents.
df2.show(truncate=False)

+-------------+----------------+-------------------------------+
|keys         |values          |mapped_data                    |
+-------------+----------------+-------------------------------+
|[name, state]|[peter, NY]     |{name -> peter, state -> NY}   |
|[name, job]  |[peter, fireman]|{name -> peter, job -> fireman}|
+-------------+----------------+-------------------------------+



In [59]:
from pyspark.sql import types as t

In [72]:
# map_from_entries(array<struct<K, V>>): map<K, V>
# Returns a map built from an array of (key, value) structs.
#
# The earlier attempt failed with
#   "dataType <Row(StringType())> should be an instance of
#    <class 'pyspark.sql.types.DataType'>"
# because Row is a container for values, not a DataType. The column must be
# declared as an array of two-field structs, and each input row must carry
# the whole list of (key, value) pairs in that single column.

entries = [
    ([("name", "peter"), ("state", "NY")],),
    ([("name", "peter"), ("job", "fireman")],),
]

# A dedicated name is used so the `schema` of the first DataFrame is not overwritten.
entries_schema = StructType([
    StructField(
        "data",
        ArrayType(
            StructType([
                StructField("key", StringType(), True),
                StructField("value", StringType(), True),
            ]),
            True,
        ),
        True,
    )
])

df3 = spark.createDataFrame(entries, entries_schema)
df3.select(
    df3["data"],
    f.map_from_entries(df3["data"]).alias("mapped_data"),
).show(truncate=False)

## Higher-Order Functions
In addition to the previously noted built-in functions, there are higher-order functions
that take anonymous lambda functions as arguments. An example of a higher-order
function is the following:

-- In SQL
### `transform(values, value -> lambda expression)`

The transform() function takes an array (values) and anonymous function (lambda
expression) as input. The function transparently creates a new array by applying the
anonymous function to each element, and then assigning the result to the output
array (similar to the UDF approach, but more efficiently).

In [75]:
# Sample data set for the higher-order function examples: one array column of
# integer Celsius readings, registered as the temp view "celsius" so the SQL
# cells can query it.

# Import only the types this cell uses instead of `import *`, so it stays
# clear where each name comes from.
from pyspark.sql.types import ArrayType, IntegerType, StructField, StructType

schema = StructType([
    StructField("celsius", ArrayType(IntegerType()))
])
# Two rows; each row is a one-element list holding the array for "celsius".
t_list = [[35, 36, 32, 30, 40, 42, 38]], [[31, 32, 34, 55, 56]]

df4 = spark.createDataFrame(t_list, schema)
df4.createOrReplaceTempView("celsius")

In [78]:
# Subtract 5 from every reading in each array.
spark.sql(
    "select celsius, transform(celsius, t -> t-5) as reduced_temp from celsius"
).show(truncate=False)

+----------------------------+----------------------------+
|celsius                     |reduced_temp                |
+----------------------------+----------------------------+
|[35, 36, 32, 30, 40, 42, 38]|[30, 31, 27, 25, 35, 37, 33]|
|[31, 32, 34, 55, 56]        |[26, 27, 29, 50, 51]        |
+----------------------------+----------------------------+



### `filter(array<T>, function<T, Boolean>): array<T>`
The filter() function produces an array consisting of only the elements of the input
array for which the Boolean function is true:

In [86]:
# Keep only the readings above 40 °C.

spark.sql(
    "select celsius, filter(celsius, t -> t > 40) as filtered from celsius"
).show()

+--------------------+--------+
|             celsius|filtered|
+--------------------+--------+
|[35, 36, 32, 30, ...|    [42]|
|[31, 32, 34, 55, 56]|[55, 56]|
+--------------------+--------+



### `exists(array<T>, function<T, Boolean>): Boolean`
The exists() function returns true if the Boolean function holds for any element in
the input array:

In [89]:
# Is there a temperature of 38 °C in the array of temperatures?

spark.sql(
    "select celsius, exists(celsius, t -> t = 38) as exists from celsius"
).show()

+--------------------+------+
|             celsius|exists|
+--------------------+------+
|[35, 36, 32, 30, ...|  true|
|[31, 32, 34, 55, 56]| false|
+--------------------+------+



### `reduce(array<T>, B, function<B, T, B>, function<B, R>)`
The reduce() function reduces the elements of the array to a single value by merging
the elements into a buffer B using function<B, T, B> and applying a finishing
function<B, R> on the final buffer: