In [1]:
from pyspark.sql import SparkSession

import getpass
username = getpass.getuser()

spark = SparkSession. \
    builder. \
    config('spark.ui.port', '0'). \
    config("spark.sql.warehouse.dir", f"/user/{username}/warehouse"). \
    enableHiveSupport(). \
    appName(f'{username} | Python - Data Processing - Overview'). \
    master('yarn'). \
    getOrCreate()

from pyspark.sql.functions import *
from pyspark.sql.types import *

#### PySpark MapType

PySpark MapType is used to represent map key-value pair similar to python Dictionary (Dict), it takes two mandatory arguments `keyType` and `valueType` of type DataType and one optional boolean argument `valueContainsNull`. keyType and valueType can be any type that extends the DataType class. for e.g StringType, IntegerType, ArrayType, MapType, StructType (struct) e.t.c.

* The First param keyType is used to specify the type of the key in the map.
* The Second param valueType is used to specify the type of the value in the map.
* Third parm valueContainsNull is an optional boolean type that is used to specify if the value of the second param can accept Null/None values.
* The key of the map won’t accept None/Null values.

In [2]:
schema = StructType([
    StructField('name', StringType(), True),
    StructField('properties', MapType(StringType(),StringType()),True)
])

#create a DataFrame by using above StructType schema
dataDictionary = [
        ('James',{'hair':'black','eye':'brown'}),
        ('Michael',{'hair':'brown','eye':None}),
        ('Robert',{'hair':'red','eye':'black'}),
        ('Washington',{'hair':'grey','eye':'grey'}),
        ('Jefferson',{'hair':'brown','eye':''})
        ]
df1 = spark.createDataFrame(data=dataDictionary, schema = schema)
df1.printSchema()
df1.show(truncate=False)

root
 |-- name: string (nullable = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)

+----------+-----------------------------+
|name      |properties                   |
+----------+-----------------------------+
|James     |[eye -> brown, hair -> black]|
|Michael   |[eye ->, hair -> brown]      |
|Robert    |[eye -> black, hair -> red]  |
|Washington|[eye -> grey, hair -> grey]  |
|Jefferson |[eye -> , hair -> brown]     |
+----------+-----------------------------+



In [3]:
df2 = df1.rdd.map(lambda x: (x.name, x.properties["hair"], x.properties["eye"])) \
      .toDF(["name","hair","eye"])

df2.printSchema()

df2.show()

root
 |-- name: string (nullable = true)
 |-- hair: string (nullable = true)
 |-- eye: string (nullable = true)

+----------+-----+-----+
|      name| hair|  eye|
+----------+-----+-----+
|     James|black|brown|
|   Michael|brown| null|
|    Robert|  red|black|
|Washington| grey| grey|
| Jefferson|brown|     |
+----------+-----+-----+



In [4]:
df1.withColumn("hair",df1.properties.getItem("hair")) \
  .withColumn("eye",df1.properties.getItem("eye")) \
  .drop("properties") \
  .show()

df1.withColumn("hair",df1.properties["hair"]) \
  .withColumn("eye",df1.properties["eye"]) \
  .drop("properties") \
  .show()

+----------+-----+-----+
|      name| hair|  eye|
+----------+-----+-----+
|     James|black|brown|
|   Michael|brown| null|
|    Robert|  red|black|
|Washington| grey| grey|
| Jefferson|brown|     |
+----------+-----+-----+

+----------+-----+-----+
|      name| hair|  eye|
+----------+-----+-----+
|     James|black|brown|
|   Michael|brown| null|
|    Robert|  red|black|
|Washington| grey| grey|
| Jefferson|brown|     |
+----------+-----+-----+



## MapType Functions

___explode(col)___

Returns a new row for each element in the given array or map. Uses the __default column name col for elements in the array__ and __key and value for elements in the map__ unless specified otherwise.

In [5]:
df1.select(df1.name,explode(df1.properties)).show()

+----------+----+-----+
|      name| key|value|
+----------+----+-----+
|     James| eye|brown|
|     James|hair|black|
|   Michael| eye| null|
|   Michael|hair|brown|
|    Robert| eye|black|
|    Robert|hair|  red|
|Washington| eye| grey|
|Washington|hair| grey|
| Jefferson| eye|     |
| Jefferson|hair|brown|
+----------+----+-----+



___map_keys() – Get All Map Keys___

In [6]:
df1.select(df1.name,map_keys(df1.properties)).show()

+----------+--------------------+
|      name|map_keys(properties)|
+----------+--------------------+
|     James|         [eye, hair]|
|   Michael|         [eye, hair]|
|    Robert|         [eye, hair]|
|Washington|         [eye, hair]|
| Jefferson|         [eye, hair]|
+----------+--------------------+



___In case if you wanted to get all map keys as Python List.___ 
#### WARNING: This runs very slow.

In [7]:
keysDf = df1.select(explode(map_keys(df1.properties))).distinct()

keysDf.show()

keysDf.rdd.map(lambda x: x[0]).collect()

+----+
| col|
+----+
| eye|
|hair|
+----+



['eye', 'hair']

___map_values() – Get All map Values___

In [8]:
df1.select("name", map_values(df1.properties)).show()

+----------+----------------------+
|      name|map_values(properties)|
+----------+----------------------+
|     James|        [brown, black]|
|   Michael|             [, brown]|
|    Robert|          [black, red]|
|Washington|          [grey, grey]|
| Jefferson|             [, brown]|
+----------+----------------------+



___map_concat()___

In [9]:
df = spark.sql("SELECT map(1, 'a', 2, 'b') as map1, map(3, 'c') as map2")

df.select(map_concat("map1", "map2").alias("map3")).show(truncate=False)

+------------------------+
|map3                    |
+------------------------+
|[1 -> a, 2 -> b, 3 -> c]|
+------------------------+



### Q1.
Input data
```
-------------------------
| Brand | type |  amount|
-------------------------
|  B   |   a  |   10   |
|  B   |   b  |   20   |
|  C   |   c  |   30   |
-------------------------
```
Output data
```
-------------------------
| Brand | MAP_type_AMOUNT 
-------------------------
|  B    | {a: 10, b:20} |
|  C    | {c: 30}       |
-------------------------
```

In [10]:
data = [('B', 'a', 10),
       ('B', 'b', 20),
       ('C', 'c', 30)]

In [11]:
df3 = spark.createDataFrame(data, ['Brand', 'type', 'amount'])

df3.show()

+-----+----+------+
|Brand|type|amount|
+-----+----+------+
|    B|   a|    10|
|    B|   b|    20|
|    C|   c|    30|
+-----+----+------+



In [12]:
df_converted1 = df3.groupBy("Brand"). \
    agg(collect_list(struct(col("Type"), col("Amount"))).alias("MAP_type_AMOUNT"))

df_converted1.show()

+-----+------------------+
|Brand|   MAP_type_AMOUNT|
+-----+------------------+
|    B|[[a, 10], [b, 20]]|
|    C|         [[c, 30]]|
+-----+------------------+



In [13]:
df_converted21 = (
    df3.groupBy('Brand')
    .agg(
        collect_list('type').alias('type'),
        collect_list('amount').alias('amount'),
    )
)

df_converted21.show()

+-----+------+--------+
|Brand|  type|  amount|
+-----+------+--------+
|    B|[b, a]|[20, 10]|
|    C|   [c]|    [30]|
+-----+------+--------+



In [14]:
df_converted2 = (
    df3.groupBy('Brand')
    .agg(
        collect_list('type').alias('type'),
        collect_list('amount').alias('amount'),
    )
    .withColumn('MAP_type_AMOUNT', map_from_arrays('type', 'amount'))
    .drop('type', 'amount')
)

df_converted2.show()

df_converted2.printSchema()

+-----+------------------+
|Brand|   MAP_type_AMOUNT|
+-----+------------------+
|    B|[a -> 10, b -> 20]|
|    C|         [c -> 30]|
+-----+------------------+

root
 |-- Brand: string (nullable = true)
 |-- MAP_type_AMOUNT: map (nullable = true)
 |    |-- key: string
 |    |-- value: long (valueContainsNull = false)



## Array functions

In [15]:
d1 = [("x", 4, 1),
  ("x", 6, 2),
  ("z", 7, 3),
  ("a", 3, 4),
  ("z", 5, 2),
  ("x", 7, 3),
  ("x", 9, 7),
  ("z", 1, 8),
  ("z", 4, 9),
  ("z", 7, 4),
  ("a", 8, 5),
  ("a", 5, 2),
  ("a", 3, 8),
  ("x", 2, 7),
  ("z", 1, 9)]

In [16]:
initial_df = spark.createDataFrame(d1, ["col1", "col2", "col3"])

initial_df.show()

+----+----+----+
|col1|col2|col3|
+----+----+----+
|   x|   4|   1|
|   x|   6|   2|
|   z|   7|   3|
|   a|   3|   4|
|   z|   5|   2|
|   x|   7|   3|
|   x|   9|   7|
|   z|   1|   8|
|   z|   4|   9|
|   z|   7|   4|
|   a|   8|   5|
|   a|   5|   2|
|   a|   3|   8|
|   x|   2|   7|
|   z|   1|   9|
+----+----+----+



In [17]:
full_df = initial_df.groupBy("col1") \
               .agg(collect_list("col2").alias("array_col1"),
                    collect_list("col3").alias("array_col2"))

full_df.show()

full_df.printSchema()

+----+------------------+------------------+
|col1|        array_col1|        array_col2|
+----+------------------+------------------+
|   x|   [4, 6, 7, 9, 2]|   [1, 2, 3, 7, 7]|
|   z|[1, 4, 7, 1, 7, 5]|[8, 9, 4, 9, 3, 2]|
|   a|      [8, 5, 3, 3]|      [5, 2, 8, 4]|
+----+------------------+------------------+

root
 |-- col1: string (nullable = true)
 |-- array_col1: array (nullable = true)
 |    |-- element: long (containsNull = false)
 |-- array_col2: array (nullable = true)
 |    |-- element: long (containsNull = false)



In [18]:
df4 = full_df.drop("array_col1")

df4.show()

df4.printSchema()

+----+------------------+
|col1|        array_col2|
+----+------------------+
|   x|   [7, 1, 2, 3, 7]|
|   z|[8, 9, 4, 9, 3, 2]|
|   a|      [4, 5, 2, 8]|
+----+------------------+

root
 |-- col1: string (nullable = true)
 |-- array_col2: array (nullable = true)
 |    |-- element: long (containsNull = false)



___array_contains()___

If we need to find a particular element is present in array, we can use array_contains function.

This function returns true if the value is present in array and false otherwise.

In [19]:
arr_contains_df = df4.withColumn("result", array_contains("array_col2", 3))

arr_contains_df.show()

+----+------------------+------+
|col1|        array_col2|result|
+----+------------------+------+
|   x|   [1, 2, 3, 7, 7]|  true|
|   z|[3, 2, 8, 9, 4, 9]|  true|
|   a|      [4, 5, 2, 8]| false|
+----+------------------+------+



___array_distinct___

This function returns only distinct values from an array and removes duplicate values.

In [20]:
arr_distinct_df = df4.withColumn("result", array_distinct("array_col2"))

arr_distinct_df.show()

+----+------------------+---------------+
|col1|        array_col2|         result|
+----+------------------+---------------+
|   x|   [7, 1, 2, 3, 7]|   [7, 1, 2, 3]|
|   z|[3, 2, 8, 9, 4, 9]|[3, 2, 8, 9, 4]|
|   a|      [4, 5, 2, 8]|   [4, 5, 2, 8]|
+----+------------------+---------------+



___array_except___

This function returns the elements from first array which are not present in second array. This is logically equivalent to set subtract operation.

In [21]:
arr_except_df = full_df.withColumn("result", array_except("array_col1", "array_col2"))

arr_except_df.show()

+----+------------------+------------------+---------+
|col1|        array_col1|        array_col2|   result|
+----+------------------+------------------+---------+
|   x|   [4, 6, 7, 9, 2]|   [1, 2, 3, 7, 7]|[4, 6, 9]|
|   z|[1, 4, 7, 1, 7, 5]|[8, 9, 4, 9, 3, 2]|[1, 7, 5]|
|   a|      [3, 8, 5, 3]|      [4, 5, 2, 8]|      [3]|
+----+------------------+------------------+---------+



___array_intersect___

This function returns common elements from both arrays. This is logically equivalent to set intersection operation.

In [22]:
arr_intersect_df = full_df.withColumn("result", array_intersect("array_col1", "array_col2"))

arr_intersect_df.show()

+----+------------------+------------------+------+
|col1|        array_col1|        array_col2|result|
+----+------------------+------------------+------+
|   x|   [4, 6, 7, 9, 2]|   [1, 2, 3, 7, 7]|[7, 2]|
|   z|[1, 4, 7, 1, 7, 5]|[8, 9, 4, 9, 3, 2]|   [4]|
|   a|      [3, 8, 5, 3]|      [4, 5, 2, 8]|[8, 5]|
+----+------------------+------------------+------+



___array_join___

This Function joins all the array elements based on delimiter defined as the second argument.

Note: if there are any null values then we can replace with third argument (`nullReplacement`) with any string value.

In [23]:
arr_join_df = df4.withColumn("result", array_join("array_col2", ","))

arr_join_df.show()

arr_join_df.printSchema()

+----+------------------+-----------+
|col1|        array_col2|     result|
+----+------------------+-----------+
|   x|   [7, 1, 2, 3, 7]|  7,1,2,3,7|
|   z|[8, 9, 4, 9, 3, 2]|8,9,4,9,3,2|
|   a|      [4, 5, 2, 8]|    4,5,2,8|
+----+------------------+-----------+

root
 |-- col1: string (nullable = true)
 |-- array_col2: array (nullable = true)
 |    |-- element: long (containsNull = false)
 |-- result: string (nullable = true)



___array_max___

This function returns the maximum value from an array.

In [24]:
arr_max_df = df4.withColumn("result", array_max("array_col2"))

arr_max_df.show()

+----+------------------+------+
|col1|        array_col2|result|
+----+------------------+------+
|   x|   [1, 2, 3, 7, 7]|     7|
|   z|[3, 2, 8, 9, 4, 9]|     9|
|   a|      [5, 2, 8, 4]|     8|
+----+------------------+------+



___array_min___

This function returns the minimum value from an array.

In [25]:
arr_min_df = df4.withColumn("result", array_min("array_col2"))

arr_min_df.show()

+----+------------------+------+
|col1|        array_col2|result|
+----+------------------+------+
|   x|   [7, 1, 2, 3, 7]|     1|
|   z|[8, 9, 4, 9, 3, 2]|     2|
|   a|      [5, 2, 8, 4]|     2|
+----+------------------+------+



___array_position___

This function returns the position of first occurrence of a specified element. If the element is not present it returns 0.

In [26]:
arr_pos_df = df4.withColumn("result", array_position("array_col2", 7))

arr_pos_df.show()

+----+------------------+------+
|col1|        array_col2|result|
+----+------------------+------+
|   x|   [1, 2, 3, 7, 7]|     4|
|   z|[8, 9, 4, 9, 3, 2]|     0|
|   a|      [4, 5, 2, 8]|     0|
+----+------------------+------+



___array_remove___

This function removes all the occurrences of an element from an array.

In [27]:
arr_remove_df = df4.withColumn("result", array_remove("array_col2", 7))

arr_remove_df.show()

+----+------------------+------------------+
|col1|        array_col2|            result|
+----+------------------+------------------+
|   x|   [7, 1, 2, 3, 7]|         [1, 2, 3]|
|   z|[8, 9, 4, 9, 3, 2]|[8, 9, 4, 9, 3, 2]|
|   a|      [5, 2, 8, 4]|      [5, 2, 8, 4]|
+----+------------------+------------------+



___array_repeat___

This function creates an array that is repeated as specified by second argument.

In [28]:
arr_repeat_df = df4.withColumn("result", array_repeat("array_col2", 2))

arr_repeat_df.show(truncate = False)

+----+------------------+----------------------------------------+
|col1|array_col2        |result                                  |
+----+------------------+----------------------------------------+
|x   |[7, 1, 2, 3, 7]   |[[7, 1, 2, 3, 7], [7, 1, 2, 3, 7]]      |
|z   |[3, 2, 8, 9, 4, 9]|[[3, 2, 8, 9, 4, 9], [3, 2, 8, 9, 4, 9]]|
|a   |[4, 5, 2, 8]      |[[4, 5, 2, 8], [4, 5, 2, 8]]            |
+----+------------------+----------------------------------------+



___array_sort___

This function sorts the elements of an array in `ascending` order. Nulls will be placed at the end

In [29]:
arr_sort_df = df4.withColumn("result", array_sort("array_col2"))

arr_sort_df.show()

+----+------------------+------------------+
|col1|        array_col2|            result|
+----+------------------+------------------+
|   x|   [7, 1, 2, 3, 7]|   [1, 2, 3, 7, 7]|
|   z|[8, 9, 4, 9, 3, 2]|[2, 3, 4, 8, 9, 9]|
|   a|      [4, 5, 2, 8]|      [2, 4, 5, 8]|
+----+------------------+------------------+



___array_union___

This function returns the union of all elements from the input arrays.

In [30]:
arr_union_df = full_df.withColumn("result", array_union("array_col1", "array_col2"))

arr_union_df.show(truncate=False)

+----+------------------+------------------+------------------------+
|col1|array_col1        |array_col2        |result                  |
+----+------------------+------------------+------------------------+
|x   |[2, 4, 6, 7, 9]   |[7, 1, 2, 3, 7]   |[2, 4, 6, 7, 9, 1, 3]   |
|z   |[1, 4, 7, 1, 7, 5]|[8, 9, 4, 9, 3, 2]|[1, 4, 7, 5, 8, 9, 3, 2]|
|a   |[3, 8, 5, 3]      |[4, 5, 2, 8]      |[3, 8, 5, 4, 2]         |
+----+------------------+------------------+------------------------+



___arrays_overlap___

This function checks if at least one element is common/overlapping in arrays. 

It returns true if at least one element is common in both array and false otherwise. It returns null if at least one of the arrays is null.

In [31]:
arr_overlap_df = full_df.withColumn("result", arrays_overlap("array_col1", "array_col2"))

arr_overlap_df.show()

+----+------------------+------------------+------+
|col1|        array_col1|        array_col2|result|
+----+------------------+------------------+------+
|   x|   [4, 6, 7, 9, 2]|   [1, 2, 3, 7, 7]|  true|
|   z|[7, 5, 1, 4, 7, 1]|[3, 2, 8, 9, 4, 9]|  true|
|   a|      [8, 5, 3, 3]|      [5, 2, 8, 4]|  true|
+----+------------------+------------------+------+



___arrays_zip___

This function merges the i-th element of an array and returns array<struct>.

In [32]:
# remove element "2" from array column "array_col2"
temp_df = full_df.withColumn("new_array_col", array_remove("array_col2",2))

# zip column "array_col1" with newly created column "new_array_col"
arr_zip_df = temp_df.withColumn("result", arrays_zip("array_col1", "new_array_col")).select("array_col1", "new_array_col", "result")

arr_zip_df.show(truncate=False)

+------------------+---------------+----------------------------------------------+
|array_col1        |new_array_col  |result                                        |
+------------------+---------------+----------------------------------------------+
|[2, 4, 6, 7, 9]   |[7, 1, 3, 7]   |[[2, 7], [4, 1], [6, 3], [7, 7], [9,]]        |
|[7, 5, 1, 4, 7, 1]|[3, 8, 9, 4, 9]|[[7, 3], [5, 8], [1, 9], [4, 4], [7, 9], [1,]]|
|[8, 5, 3, 3]      |[5, 8, 4]      |[[8, 5], [5, 8], [3, 4], [3,]]                |
+------------------+---------------+----------------------------------------------+



___concat___

This function concatenates all the elements of both arrays into a single one.

In [33]:
arr_cat_df = full_df.withColumn("result", concat("array_col1", "array_col2"))

arr_cat_df.show(truncate=False)

arr_cat_df.printSchema()

+----+------------------+------------------+------------------------------------+
|col1|array_col1        |array_col2        |result                              |
+----+------------------+------------------+------------------------------------+
|x   |[2, 4, 6, 7, 9]   |[7, 1, 2, 3, 7]   |[2, 4, 6, 7, 9, 7, 1, 2, 3, 7]      |
|z   |[1, 4, 7, 1, 7, 5]|[8, 9, 4, 9, 3, 2]|[1, 4, 7, 1, 7, 5, 8, 9, 4, 9, 3, 2]|
|a   |[3, 8, 5, 3]      |[4, 5, 2, 8]      |[3, 8, 5, 3, 4, 5, 2, 8]            |
+----+------------------+------------------+------------------------------------+

root
 |-- col1: string (nullable = true)
 |-- array_col1: array (nullable = true)
 |    |-- element: long (containsNull = false)
 |-- array_col2: array (nullable = true)
 |    |-- element: long (containsNull = false)
 |-- result: array (nullable = true)
 |    |-- element: long (containsNull = false)



___element_at___

This function returns the element at a specified index.

In [34]:
arr_element_at_df = df4.withColumn("result", element_at("array_col2", 1))

arr_element_at_df.show()

+----+------------------+------+
|col1|        array_col2|result|
+----+------------------+------+
|   x|   [1, 2, 3, 7, 7]|     1|
|   z|[3, 2, 8, 9, 4, 9]|     3|
|   a|      [5, 2, 8, 4]|     5|
+----+------------------+------+



___flatten___

This function returns a single array from array of an arrays. 

If an array is more than 2 levels deep, it removes one level of nesting from an array.

In [35]:
# Generate the nested array using the function "array_repeat".
arr_repeat_df1 = df4.withColumn("repeat", array_repeat("array_col2", 2))

# flatten the nested array.
arr_flat_df1 = arr_repeat_df1.withColumn("result", flatten("repeat")).select("repeat", "result")

arr_flat_df1.show(truncate=False)

+----------------------------------------+------------------------------------+
|repeat                                  |result                              |
+----------------------------------------+------------------------------------+
|[[7, 1, 2, 3, 7], [7, 1, 2, 3, 7]]      |[7, 1, 2, 3, 7, 7, 1, 2, 3, 7]      |
|[[3, 2, 8, 9, 4, 9], [3, 2, 8, 9, 4, 9]]|[3, 2, 8, 9, 4, 9, 3, 2, 8, 9, 4, 9]|
|[[5, 2, 8, 4], [5, 2, 8, 4]]            |[5, 2, 8, 4, 5, 2, 8, 4]            |
+----------------------------------------+------------------------------------+



___map_from_arrays___

This function creates a map column. _Elements of the first column will be used for keys and second column will be used for values_.

In [36]:
full_df.show()

+----+------------------+------------------+
|col1|        array_col1|        array_col2|
+----+------------------+------------------+
|   x|   [2, 4, 6, 7, 9]|   [7, 1, 2, 3, 7]|
|   z|[7, 5, 1, 4, 7, 1]|[3, 2, 8, 9, 4, 9]|
|   a|      [3, 8, 5, 3]|      [4, 5, 2, 8]|
+----+------------------+------------------+



In [37]:
# remove duplicate keys from the dataframe, else job will fail
upd_full_df = full_df.filter("col1 = 'x'")

upd_full_df.show()

+----+---------------+---------------+
|col1|     array_col1|     array_col2|
+----+---------------+---------------+
|   x|[4, 6, 7, 9, 2]|[1, 2, 3, 7, 7]|
+----+---------------+---------------+



In [38]:
map_from_arr_df = upd_full_df.withColumn("result", map_from_arrays("array_col1", "array_col2")).drop("col1")

map_from_arr_df.show(truncate=False)

+---------------+---------------+----------------------------------------+
|array_col1     |array_col2     |result                                  |
+---------------+---------------+----------------------------------------+
|[4, 6, 7, 9, 2]|[1, 2, 3, 7, 7]|[4 -> 1, 6 -> 2, 7 -> 3, 9 -> 7, 2 -> 7]|
+---------------+---------------+----------------------------------------+



___reverse___

This function reverses the order of elements in input array.

In [39]:
arr_reverse_df = df4.withColumn("result", reverse("array_col2"))

arr_reverse_df.show()

+----+------------------+------------------+
|col1|        array_col2|            result|
+----+------------------+------------------+
|   x|   [1, 2, 3, 7, 7]|   [7, 7, 3, 2, 1]|
|   z|[3, 2, 8, 9, 4, 9]|[9, 4, 9, 8, 2, 3]|
|   a|      [4, 5, 2, 8]|      [8, 2, 5, 4]|
+----+------------------+------------------+



__size__

This function returns a number of elements in an array or map.

In [40]:
arr_size_df = df4.withColumn("result", size("array_col2"))

arr_size_df.show()

+----+------------------+------+
|col1|        array_col2|result|
+----+------------------+------+
|   x|   [7, 1, 2, 3, 7]|     5|
|   z|[8, 9, 4, 9, 3, 2]|     6|
|   a|      [5, 2, 8, 4]|     4|
+----+------------------+------+



___shuffle___

This function shuffles the elements of array randomly.

In [41]:
arr_shuffle_df = df4.withColumn("result", shuffle("array_col2"))

arr_shuffle_df.show()

+----+------------------+------------------+
|col1|        array_col2|            result|
+----+------------------+------------------+
|   x|   [7, 1, 2, 3, 7]|   [3, 7, 7, 2, 1]|
|   z|[3, 2, 8, 9, 4, 9]|[4, 9, 3, 8, 9, 2]|
|   a|      [5, 2, 8, 4]|      [4, 8, 5, 2]|
+----+------------------+------------------+



___slice___

This function slices the array into a sub-array. We can specify the start of the index as second argument and number of elements as third argument.

__Note__: Arrays in spark start with index 1. It also supports negative indexing to access the elements from last.

Let’s try to create a sub-array of 3 elements starting from index 2.

In [42]:
arr_slice_df = df4.withColumn("result", slice("array_col2", 2, 3))

arr_slice_df.show()

+----+------------------+---------+
|col1|        array_col2|   result|
+----+------------------+---------+
|   x|   [1, 2, 3, 7, 7]|[2, 3, 7]|
|   z|[8, 9, 4, 9, 3, 2]|[9, 4, 9]|
|   a|      [5, 2, 8, 4]|[2, 8, 4]|
+----+------------------+---------+



___sort_array___

This function sorts the array in ascending order by default. However, we can sort in __descending order__ with second arg as __asc=False__.

In [43]:
arr_sort_df = df4.withColumn("result", sort_array("array_col2", asc=False))

arr_sort_df.show()

+----+------------------+------------------+
|col1|        array_col2|            result|
+----+------------------+------------------+
|   x|   [1, 2, 3, 7, 7]|   [7, 7, 3, 2, 1]|
|   z|[8, 9, 4, 9, 3, 2]|[9, 9, 8, 4, 3, 2]|
|   a|      [5, 2, 8, 4]|      [8, 5, 4, 2]|
+----+------------------+------------------+



___explode___

With array type

In [44]:
temp_df = df4.withColumn("slice_col", slice("array_col2", 1, 2)) \
              .drop("array_col2")

temp_df.show()

arr_explode_df = temp_df.withColumn("result", explode("slice_col"))

arr_explode_df.show()

+----+---------+
|col1|slice_col|
+----+---------+
|   x|   [7, 1]|
|   z|   [3, 2]|
|   a|   [4, 5]|
+----+---------+

+----+---------+------+
|col1|slice_col|result|
+----+---------+------+
|   x|   [1, 2]|     1|
|   x|   [1, 2]|     2|
|   z|   [3, 2]|     3|
|   z|   [3, 2]|     2|
|   a|   [4, 5]|     4|
|   a|   [4, 5]|     5|
+----+---------+------+



___posexplode___

This function creates a new row for each element with position of an array or map.

In [45]:
arr_posexplode_df = temp_df.select("*", posexplode("slice_col"))

arr_posexplode_df.show(truncate=False)

+----+---------+---+---+
|col1|slice_col|pos|col|
+----+---------+---+---+
|x   |[1, 2]   |0  |1  |
|x   |[1, 2]   |1  |2  |
|z   |[3, 2]   |0  |3  |
|z   |[3, 2]   |1  |2  |
|a   |[5, 2]   |0  |5  |
|a   |[5, 2]   |1  |2  |
+----+---------+---+---+

