In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import ArrayType, StructType, StructField, StringType
# Reuse the already-running SparkSession if there is one, otherwise create it.
spark = (
    SparkSession.builder
    .getOrCreate()
)

# Example 1: array columns with an explicit schema

In [7]:
# Sample rows: a comma-separated Name string (some segments are empty),
# two arrays of language names, and two state codes.
data = [
    (
        "Siva,sai,Yadav",
        ["Python", "Java", "C"],
        ["Python", "Spark"],
        "KAR",
        "TEL",
    ),
    (
        "Michael,Rose,",
        ["Spark", "Java", "C++"],
        ["Java", "Spark"],
        "DEL",
        "MUM",
    ),
    (
        "Robert,,Williams",
        ["CSharp", "VB"],
        ["Spark", "Python"],
        "UT",
        "NV",
    ),
]

# Explicit schema, built one field at a time; every column is nullable.
schema = (
    StructType()
    .add('Name', StringType(), True)
    .add('languagesAtSchool', ArrayType(StringType()), True)
    .add('languagesAtWork', ArrayType(StringType()), True)
    .add('currentState', StringType(), True)
    .add('previousState', StringType(), True)
)

df = spark.createDataFrame(data=data, schema=schema)

In [8]:
# Print the column tree of `df` to confirm the explicit schema was applied.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- languagesAtSchool: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- languagesAtWork: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- currentState: string (nullable = true)
 |-- previousState: string (nullable = true)



In [9]:
# Preview at most five rows of the example DataFrame.
df.show(n=5)

+----------------+------------------+---------------+------------+-------------+
|            Name| languagesAtSchool|languagesAtWork|currentState|previousState|
+----------------+------------------+---------------+------------+-------------+
|  Siva,sai,Yadav| [Python, Java, C]|[Python, Spark]|         KAR|          TEL|
|   Michael,Rose,|[Spark, Java, C++]|  [Java, Spark]|         DEL|          MUM|
|Robert,,Williams|      [CSharp, VB]|[Spark, Python]|          UT|           NV|
+----------------+------------------+---------------+------------+-------------+



# explode() & split()

In [16]:
# One output row per element of languagesAtSchool; the exploded column is
# named `col` because no alias is given.
df.select(df.Name, explode('languagesAtSchool')).show()

+----------------+------+
|            Name|   col|
+----------------+------+
|  Siva,sai,Yadav|Python|
|  Siva,sai,Yadav|  Java|
|  Siva,sai,Yadav|     C|
|   Michael,Rose,| Spark|
|   Michael,Rose,|  Java|
|   Michael,Rose,|   C++|
|Robert,,Williams|CSharp|
|Robert,,Williams|    VB|
+----------------+------+



In [20]:
# Turn the comma-separated Name string into an array column, still called Name.
df_1 = df.select(split('Name', ',').alias('Name'))

In [30]:
# Flatten the Name array into one row per token; empty tokens are kept.
df_1.select(explode('Name').alias('Name')).show()

+--------+
|    Name|
+--------+
|    Siva|
|     sai|
|   Yadav|
| Michael|
|    Rose|
|        |
|  Robert|
|        |
|Williams|
+--------+



In [39]:
# Same flattening as before, then drop the empty-string tokens.
(
    df_1
    .select(explode('Name').alias('Name'))
    .where('Name != ""')
    .show()
)

+--------+
|    Name|
+--------+
|    Siva|
|     sai|
|   Yadav|
| Michael|
|    Rose|
|  Robert|
|Williams|
+--------+



In [44]:
# Split and explode Name in a single step, then drop empty-string tokens.
# The exploded column gets the default name `col`: an alias attached to the
# split() argument *inside* explode() does not rename the output, so that
# ineffective alias has been removed. To rename the result, alias the
# explode() call itself and filter on that name instead.
df.select(explode(split(df.Name, ','))).filter('col != ""').show()

+--------+
|     col|
+--------+
|    Siva|
|     sai|
|   Yadav|
| Michael|
|    Rose|
|  Robert|
|Williams|
+--------+



In [47]:
# Keep the flattened tokens (including empty strings) in a DataFrame for reuse.
new_col = df_1.select(
    explode('Name').alias('Name')
)
new_col.show()

+--------+
|    Name|
+--------+
|    Siva|
|     sai|
|   Yadav|
| Michael|
|    Rose|
|        |
|  Robert|
|        |
|Williams|
+--------+



In [50]:
# Append the exploded name tokens as an extra column; every original row is
# repeated once per token, with all other columns carried along unchanged.
df.withColumn(
    'New COl',
    explode(split('Name', ',')),
).show()

+----------------+------------------+---------------+------------+-------------+--------+
|            Name| languagesAtSchool|languagesAtWork|currentState|previousState| New COl|
+----------------+------------------+---------------+------------+-------------+--------+
|  Siva,sai,Yadav| [Python, Java, C]|[Python, Spark]|         KAR|          TEL|    Siva|
|  Siva,sai,Yadav| [Python, Java, C]|[Python, Spark]|         KAR|          TEL|     sai|
|  Siva,sai,Yadav| [Python, Java, C]|[Python, Spark]|         KAR|          TEL|   Yadav|
|   Michael,Rose,|[Spark, Java, C++]|  [Java, Spark]|         DEL|          MUM| Michael|
|   Michael,Rose,|[Spark, Java, C++]|  [Java, Spark]|         DEL|          MUM|    Rose|
|   Michael,Rose,|[Spark, Java, C++]|  [Java, Spark]|         DEL|          MUM|        |
|Robert,,Williams|      [CSharp, VB]|[Spark, Python]|          UT|           NV|  Robert|
|Robert,,Williams|      [CSharp, VB]|[Spark, Python]|          UT|           NV|        |
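|Robert,,Williams|      [CSharp, VB]|[Spark, Python]|          UT|           NV|Williams|
+----------------+------------------+---------------+------------+-------------+--------+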

# array()

In [55]:
# Combine the two state columns into a single array column named New.
df.select(
    'Name',
    array(df.currentState, df.previousState).alias('New'),
).show()

+----------------+----------+
|            Name|       New|
+----------------+----------+
|  Siva,sai,Yadav|[KAR, TEL]|
|   Michael,Rose,|[DEL, MUM]|
|Robert,,Williams|  [UT, NV]|
+----------------+----------+



# array_contains()

In [56]:
# Flag each row with whether its languagesAtSchool array contains 'Python'.
df.select(
    df.languagesAtSchool,
    array_contains(df.languagesAtSchool, 'Python').alias('Check'),
).show()

+------------------+-----+
| languagesAtSchool|Check|
+------------------+-----+
| [Python, Java, C]| true|
|[Spark, Java, C++]|false|
|      [CSharp, VB]|false|
+------------------+-----+



# Example 2: array and map columns with an inferred schema (includes nulls and empty values)

In [57]:
# Rows mixing an array column and a map column, deliberately including a None
# element, empty strings, a fully-null row, and an empty map.
data1 = [
    ('Siva', ['Java', 'Scala'], dict(hair='black', eye='brown')),
    ('Sai', ['C', 'Python', None], dict(hair='brown', eye=None)),
    ('Yadav', ['CSharp', ''], dict(hair='brown', eye='')),
    ('AAAA', None, None),
    ('ZZZZ', ['1', '2'], dict()),
]

# Only column names are supplied, so Spark infers the column types from the data.
df1 = spark.createDataFrame(
    data=data1,
    schema=['Name', 'Lang', 'Prop'],
)

In [59]:
# Show the types Spark inferred for df1 (only column names were supplied).
df1.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Lang: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Prop: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [60]:
# Display df1 without truncating cell contents so the full arrays and maps are visible.
df1.show(truncate=False)

+-----+-----------------+-----------------------------+
|Name |Lang             |Prop                         |
+-----+-----------------+-----------------------------+
|Siva |[Java, Scala]    |{eye -> brown, hair -> black}|
|Sai  |[C, Python, null]|{eye -> null, hair -> brown} |
|Yadav|[CSharp, ]       |{eye -> , hair -> brown}     |
|AAAA |null             |null                         |
|ZZZZ |[1, 2]           |{}                           |
+-----+-----------------+-----------------------------+



In [61]:
# Flatten the Lang arrays. Null elements inside an array are kept, but the row
# whose Lang array is itself null ('AAAA') produces no output rows.
df1.select(explode(df1.Lang)).show()

+------+
|   col|
+------+
|  Java|
| Scala|
|     C|
|Python|
|  null|
|CSharp|
|      |
|     1|
|     2|
+------+



In [72]:
# Exploding a map column yields one row per entry, split into `key` and
# `value` columns; the null map and the empty map contribute no rows.
df2 = df1.select(
    explode(df1.Prop)
)
df2.show()

+----+-----+
| key|value|
+----+-----+
| eye|brown|
|hair|black|
| eye| null|
|hair|brown|
| eye|     |
|hair|brown|
+----+-----+



In [73]:
# posexplode() behaves like explode() but also emits each element's
# zero-based index in a `pos` column next to the value column `col`.
df1.select('Name', posexplode('Lang')).show()

+-----+---+------+
| Name|pos|   col|
+-----+---+------+
| Siva|  0|  Java|
| Siva|  1| Scala|
|  Sai|  0|     C|
|  Sai|  1|Python|
|  Sai|  2|  null|
|Yadav|  0|CSharp|
|Yadav|  1|      |
| ZZZZ|  0|     1|
| ZZZZ|  1|     2|
+-----+---+------+

