In [1]:
import warnings

# Silence every warning category for the rest of the session so the
# demo output below is not interleaved with library warning text.
warnings.filterwarnings(action='ignore')

In [3]:
from pyspark.sql import SparkSession

# getOrCreate() returns the already-running session when there is one,
# so re-running this cell does not start a second session.
spark = (
    SparkSession.builder
    .appName('practise14')
    .getOrCreate()
)

In [23]:
# Two sample rows; the third field is a Python list, which Spark infers
# as an array<string> column (see the printed schema below).
data = [
    (1, 'sujeet', ['python', 'ml']),
    (2, 'ajeet', ['java', 'sql']),
]
schema = ['id', 'name', 'skills']

# Only column names are supplied, so the column types are inferred from the data.
df = spark.createDataFrame(data, schema=schema)
df.show()
df.printSchema()

+---+------+------------+
| id|  name|      skills|
+---+------+------------+
|  1|sujeet|[python, ml]|
|  2| ajeet| [java, sql]|
+---+------+------------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)



# explode()

In [24]:
from pyspark.sql.functions import col, explode

# explode() produces one output row per element of the array column;
# the other columns of the source row are repeated on each of them.
(
    df
    .withColumn('skill', explode(col('skills')))
    .show()
)

+---+------+------------+------+
| id|  name|      skills| skill|
+---+------+------------+------+
|  1|sujeet|[python, ml]|python|
|  1|sujeet|[python, ml]|    ml|
|  2| ajeet| [java, sql]|  java|
|  2| ajeet| [java, sql]|   sql|
+---+------+------------+------+



In [25]:
# withColumn() returned a new DataFrame in the previous cell; `df` itself
# still has only its original three columns, as the output shows.
df.show()

+---+------+------------+
| id|  name|      skills|
+---+------+------------+
|  1|sujeet|[python, ml]|
|  2| ajeet| [java, sql]|
+---+------+------------+



# split()

In [38]:
from pyspark.sql.functions import split

# Here `skills` is a single comma-delimited string rather than an array.
data = [
    (1, 'sujeet', 'python,ml'),
    (2, 'ajeet', 'java,sql'),
]
schema = ['id', 'name', 'skills']

df = spark.createDataFrame(data, schema=schema)

# split() treats its second argument as a regular-expression pattern;
# a bare comma has no special meaning there, so it matches literally.
skills_as_array = split(col('skills'), ',')
df1 = df.withColumn('skillsArray', skills_as_array)
df1.show()

+---+------+---------+------------+
| id|  name|   skills| skillsArray|
+---+------+---------+------------+
|  1|sujeet|python,ml|[python, ml]|
|  2| ajeet| java,sql| [java, sql]|
+---+------+---------+------------+



# array()

In [42]:
# This section demonstrates array(), so import it here. The cell used to
# re-import split(), which this section never uses, leaving array()
# undefined on a fresh kernel.
from pyspark.sql.functions import array, col

# Each skill lives in its own scalar string column in this sample.
data = [
    (1, 'sujeet', 'python', 'ml'),
    (2, 'ajeet', 'java', 'sql'),
]
schema = ['id', 'name', 'primaryskills', 'secondaryskills']

df4 = spark.createDataFrame(data, schema)
df4.show()

+---+------+-------------+---------------+
| id|  name|primaryskills|secondaryskills|
+---+------+-------------+---------------+
|  1|sujeet|       python|             ml|
|  2| ajeet|         java|            sql|
+---+------+-------------+---------------+



In [44]:
# array() packs the listed columns, in order, into one array column per row.
(
    df4
    .withColumn(
        'skills',
        array(col('primaryskills'), col('secondaryskills')),
    )
    .show()
)

+---+------+-------------+---------------+------------+
| id|  name|primaryskills|secondaryskills|      skills|
+---+------+-------------+---------------+------------+
|  1|sujeet|       python|             ml|[python, ml]|
|  2| ajeet|         java|            sql| [java, sql]|
+---+------+-------------+---------------+------------+



# array_contains()

In [46]:
from pyspark.sql.functions import array_contains

In [45]:
# Sample rows whose `skills` field is a list, giving an array column to search.
data = [
    (1, 'sujeet', ['python', 'ml']),
    (2, 'ajeet', ['java', 'sql']),
]
schema = ['id', 'name', 'skills']

df5 = spark.createDataFrame(data, schema=schema)
df5.show()

+---+------+------------+
| id|  name|      skills|
+---+------+------------+
|  1|sujeet|[python, ml]|
|  2| ajeet| [java, sql]|
+---+------+------------+



In [47]:
# array_contains() yields a boolean per row: true when the array column
# holds the given value, false otherwise (see the output below).
(
    df5
    .withColumn('skillspresent', array_contains('skills', 'python'))
    .show()
)

+---+------+------------+-------------+
| id|  name|      skills|skillspresent|
+---+------+------------+-------------+
|  1|sujeet|[python, ml]|         true|
|  2| ajeet| [java, sql]|        false|
+---+------+------------+-------------+

