In [None]:
#create a spark session
from pyspark.sql import SparkSession
# Start (or reuse) a Spark session on the local master using all cores
# ("local[*]"), registered under the application name "spark_on_docker".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("spark_on_docker")
    .getOrCreate()
)

# Use 5 partitions for Spark SQL shuffles in this session.
spark.conf.set("spark.sql.shuffle.partitions", 5)

In [80]:
# ######################################
# pyspark-explode-array-map
# ######################################

# Import every generator function this cell uses once, up front, instead of
# re-importing (and duplicating the `explode` import) before each use.
from pyspark.sql.functions import (
    explode,
    explode_outer,
    posexplode,
    posexplode_outer,
)

# Sample rows deliberately include None elements, empty strings, a row whose
# array and map are both None, and a row with an empty map, so the different
# explode variants can be compared on the same input.
arrayData = [
    ('James', ['Java', 'Scala'], {'hair': 'black', 'eye': 'brown'}),
    ('Michael', ['Spark', 'Java', None], {'hair': 'brown', 'eye': None}),
    ('Robert', ['CSharp', ''], {'hair': 'red', 'eye': ''}),
    ('Washington', None, None),
    ('Jefferson', ['1', '2'], {}),
]
df = spark.createDataFrame(
    data=arrayData,
    schema=['name', 'knownLanguages', 'properties'],
)
df.printSchema()
df.show()

# explode on the array column: one output row per array element.
df2 = df.select(df.name, explode(df.knownLanguages))
df2.printSchema()
df2.show()

# explode on the map column: one output row per map entry.
df3 = df.select(df.name, explode(df.properties))
df3.printSchema()
df3.show()

# explode_outer: unlike explode, a null or empty array/map still yields a row.
# with array
df.select(df.name, explode_outer(df.knownLanguages)).show()
# with map
df.select(df.name, explode_outer(df.properties)).show()

# posexplode: like explode, plus a column holding the element position.
# with array
df.select(df.name, posexplode(df.knownLanguages)).show()
# with map
df.select(df.name, posexplode(df.properties)).show()

# posexplode_outer: posexplode that also keeps null/empty inputs.
# with array
df.select(df.name, posexplode_outer(df.knownLanguages)).show()
# with map
df.select(df.name, posexplode_outer(df.properties)).show()

# Last expression of the cell; the notebook displays it as 'END'.
"""END"""

root
 |-- name: string (nullable = true)
 |-- knownLanguages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)

+----------+-------------------+--------------------+
|      name|     knownLanguages|          properties|
+----------+-------------------+--------------------+
|     James|      [Java, Scala]|{eye -> brown, ha...|
|   Michael|[Spark, Java, null]|{eye -> null, hai...|
|    Robert|         [CSharp, ]|{eye -> , hair ->...|
|Washington|               null|                null|
| Jefferson|             [1, 2]|                  {}|
+----------+-------------------+--------------------+

root
 |-- name: string (nullable = true)
 |-- col: string (nullable = true)

+---------+------+
|     name|   col|
+---------+------+
|    James|  Java|
|    James| Scala|
|  Michael| Spark|
|  Michael|  Java|
|  Michael|  null|
|   Robert|CSharp|
|   Robert|      |

'END'

In [85]:
# ######################################
# pyspark-explode-nested-array
# ######################################

# Import the functions this cell calls so it does not depend on names that
# another cell happened to leave in the kernel (`flatten` in particular is not
# imported by any earlier cell).
from pyspark.sql.functions import explode, flatten

# Each row holds a name and an array of string arrays.
arrayArrayData = [
    ("James", [["Java", "Scala", "C++"], ["Spark", "Java"]]),
    ("Michael", [["Spark", "Java", "C++"], ["Spark", "Java"]]),
    ("Robert", [["CSharp", "VB"], ["Spark", "Python"]]),
]

df = spark.createDataFrame(data=arrayArrayData, schema=['name', 'subjects'])
df.printSchema()
df.show(truncate=False)

# explode unpacks only the outer array: one row per inner array.
df.select(df.name, explode(df.subjects)).show(truncate=False)

# flatten creates a single array from an array of arrays.
df.select(df.name, flatten(df.subjects)).show(truncate=False)

# Last expression of the cell; the notebook displays it as 'END'.
"""END"""

column ['name', 'subjects']
root
 |-- name: string (nullable = true)
 |-- subjects: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)

+-------+-----------------------------------+
|name   |subjects                           |
+-------+-----------------------------------+
|James  |[[Java, Scala, C++], [Spark, Java]]|
|Michael|[[Spark, Java, C++], [Spark, Java]]|
|Robert |[[CSharp, VB], [Spark, Python]]    |
+-------+-----------------------------------+

+-------+------------------+
|name   |col               |
+-------+------------------+
|James  |[Java, Scala, C++]|
|James  |[Spark, Java]     |
|Michael|[Spark, Java, C++]|
|Michael|[Spark, Java]     |
|Robert |[CSharp, VB]      |
|Robert |[Spark, Python]   |
+-------+------------------+

+-------+-------------------------------+
|name   |flatten(subjects)              |
+-------+-------------------------------+
|James  |[Java, Scala, C++, Spark, Java]|
|Michael|

'END'

In [86]:
# Import the names this cell uses explicitly so it runs on its own, without
# relying on a wildcard import executed by some other cell.
from pyspark.sql.functions import col, struct, when
from pyspark.sql.types import (
    ArrayType,
    IntegerType,
    MapType,
    StringType,
    StructField,
    StructType,
)

# --- Flat schema: every attribute is a top-level column --------------------
data = [
    ("James", "", "Smith", "36636", "M", 3000),
    ("Michael", "Rose", "", "40288", "M", 4000),
    ("Robert", "", "Williams", "42114", "M", 4000),
    ("Maria", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]

schema = StructType([
    StructField("firstname", StringType(), True),
    StructField("middlename", StringType(), True),
    StructField("lastname", StringType(), True),
    StructField("id", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("salary", IntegerType(), True),
])

df = spark.createDataFrame(data=data, schema=schema)
df.printSchema()
df.show(truncate=False)

# --- Nested schema: the three name parts live inside a `name` struct -------
structureData = [
    (("James", "", "Smith"), "36636", "M", 3100),
    (("Michael", "Rose", ""), "40288", "M", 4300),
    (("Robert", "", "Williams"), "42114", "M", 1400),
    (("Maria", "Anne", "Jones"), "39192", "F", 5500),
    (("Jen", "Mary", "Brown"), "", "F", -1),
]
structureSchema = StructType([
    StructField('name', StructType([
        StructField('firstname', StringType(), True),
        StructField('middlename', StringType(), True),
        StructField('lastname', StringType(), True),
    ])),
    StructField('id', StringType(), True),
    StructField('gender', StringType(), True),
    StructField('salary', IntegerType(), True),
])

df2 = spark.createDataFrame(data=structureData, schema=structureSchema)
df2.printSchema()
df2.show(truncate=False)

# --- Fold id/gender/salary into an `OtherInfo` struct ----------------------
# Salary_Grade is "Low" below 2000, "Medium" below 4000, otherwise "High".
# `salary` is already declared IntegerType in structureSchema, so it is
# compared directly rather than being cast to IntegerType again.
updatedDF = df2.withColumn(
    "OtherInfo",
    struct(
        col("id").alias("identifier"),
        col("gender").alias("gender"),
        col("salary").alias("salary"),
        when(col("salary") < 2000, "Low")
        .when(col("salary") < 4000, "Medium")
        .otherwise("High")
        .alias("Salary_Grade"),
    ),
).drop("id", "gender", "salary")

updatedDF.printSchema()
updatedDF.show(truncate=False)

# --- Array & Map -----------------------------------------------------------
# Schema combining a nested struct, an array of strings and a string->string
# map. It is only defined here; nothing in this cell builds a DataFrame with it.
arrayStructureSchema = StructType([
    StructField('name', StructType([
        StructField('firstname', StringType(), True),
        StructField('middlename', StringType(), True),
        StructField('lastname', StringType(), True),
    ])),
    StructField('hobbies', ArrayType(StringType()), True),
    StructField('properties', MapType(StringType(), StringType()), True),
])

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|id   |gender|salary|
+---------+----------+--------+-----+------+------+
|James    |          |Smith   |36636|M     |3000  |
|Michael  |Rose      |        |40288|M     |4000  |
|Robert   |          |Williams|42114|M     |4000  |
|Maria    |Anne      |Jones   |39192|F     |4000  |
|Jen      |Mary      |Brown   |     |F     |-1    |
+---------+----------+--------+-----+------+------+

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



In [90]:
import requests
import xml.etree.ElementTree as ET

from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import *

cds_url = 'https://www.w3schools.com/xml/cd_catalog.xml'

# download data
# The timeout stops the cell from hanging forever on an unresponsive server,
# and raise_for_status() fails fast on an HTTP error instead of handing an
# error page to the XML parser.
cds_response = requests.get(cds_url, timeout=30)
cds_response.raise_for_status()
cds_txt = cds_response.text

# convert to XML
# Parse the raw bytes rather than `cds_txt`: Response.text is decoded with the
# charset requests infers from the HTTP headers, whereas the XML parser can
# honour the document's own encoding declaration when given bytes.
doc = ET.fromstring(cds_response.content)

# extract CD XML
# Each <CD> child of the root is serialized back to UTF-8 encoded bytes.
utf8 = 'utf-8'
cds = [ET.tostring(x, encoding=utf8) for x in doc.findall('CD')]

# create dataframe
# Decode each fragment to str, trim surrounding whitespace, and pair it with
# its position in the catalog.
normalizedCds = [str(cd, utf8).strip() for cd in cds]
rows = [Row(index=index, cd=cd) for index, cd in enumerate(normalizedCds)]
cd_df = spark.createDataFrame(rows)



In [92]:
# Preview the first three rows of the CD DataFrame.
cd_df.show(n=3)

+-----+--------------------+
|index|                  cd|
+-----+--------------------+
|    0|<CD>\n    <TITLE>...|
|    1|<CD>\n    <TITLE>...|
|    2|<CD>\n    <TITLE>...|
+-----+--------------------+
only showing top 3 rows



In [93]:
@udf
def extract_title_udf(payload):
  """Return the text of the first TITLE child in an XML fragment.

  Registered with a bare ``@udf``, so it relies on ``udf``'s default return
  type.

  Args:
    payload: XML text of a single element (here, one <CD> record), or None
      when the source column value is null.

  Returns:
    The text of the first direct ``TITLE`` child, or None when the payload is
    None, there is no ``TITLE`` child, or that child has no text.

  Raises:
    xml.etree.ElementTree.ParseError: if the payload is not well-formed XML.
  """
  # Null column values arrive as None; ET.fromstring(None) would raise and
  # fail the whole Spark job, so map null input to null output instead.
  if payload is None:
    return None
  # find() stops at the first match, which is all that is needed here.
  title = ET.fromstring(payload).find('TITLE')
  return None if title is None else title.text

In [96]:
# Show the index and extracted title for the first 10 CDs, without truncating
# long values.
(
    cd_df
    .select("index", extract_title_udf(col('cd')).alias('title'))
    .show(n=10, truncate=False)
)

+-----+------------------------+
|index|title                   |
+-----+------------------------+
|0    |Empire Burlesque        |
|1    |Hide your heart         |
|2    |Greatest Hits           |
|3    |Still got the blues     |
|4    |Eros                    |
|5    |One night only          |
|6    |Sylvias Mother          |
|7    |Maggie May              |
|8    |Romanza                 |
|9    |When a man loves a woman|
+-----+------------------------+
only showing top 10 rows



In [97]:
# Struct schema with two nullable string fields, in this order: title, artist.
extract_cd_info_schema = StructType(
    [StructField(field_name, StringType(), True) for field_name in ("title", "artist")]
)

def select_text(doc, xpath):
  """Return the text of the first element under ``doc`` matching ``xpath``.

  Args:
    doc: an ``xml.etree.ElementTree.Element`` to search from.
    xpath: an ElementTree path expression (for example a child tag name).

  Returns:
    The ``text`` of the first matching element, or None when nothing matches
    or the first match has no text.
  """
  # find() returns the first match directly, so there is no need to collect
  # every match into a list (or to type-check elements) just to take one.
  first_match = doc.find(xpath)
  return None if first_match is None else first_match.text

def extract_cd_info(payload):
  """Extract the title and artist from the XML text of one CD record.

  Args:
    payload: XML text of a single element, or None when the source column
      value is null.

  Returns:
    A dict with keys ``'title'`` and ``'artist'`` holding the text of the
    first matching child element (None when absent), or None when the payload
    itself is None.

  Raises:
    xml.etree.ElementTree.ParseError: if the payload is not well-formed XML.
  """
  # A null input cannot be parsed; return None so the caller gets a null
  # result instead of an exception from ET.fromstring.
  if payload is None:
    return None
  doc = ET.fromstring(payload)
  return {
    'title':  select_text(doc, 'TITLE'),
    'artist': select_text(doc, 'ARTIST')
  }

# Wrap extract_cd_info as a Spark UDF whose result is typed by
# extract_cd_info_schema.
extract_cd_info_udf = udf(f=extract_cd_info, returnType=extract_cd_info_schema)

In [98]:
# Attach the extracted struct as `info`, then show its artist and title fields
# next to the index for the first 10 CDs, without truncating long values.
(
    cd_df
    .withColumn("info", extract_cd_info_udf('cd'))
    .select('index', 'info.artist', 'info.title')
    .show(n=10, truncate=False)
)

+-----+---------------+------------------------+
|index|artist         |title                   |
+-----+---------------+------------------------+
|0    |Bob Dylan      |Empire Burlesque        |
|1    |Bonnie Tyler   |Hide your heart         |
|2    |Dolly Parton   |Greatest Hits           |
|3    |Gary Moore     |Still got the blues     |
|4    |Eros Ramazzotti|Eros                    |
|5    |Bee Gees       |One night only          |
|6    |Dr.Hook        |Sylvias Mother          |
|7    |Rod Stewart    |Maggie May              |
|8    |Andrea Bocelli |Romanza                 |
|9    |Percy Sledge   |When a man loves a woman|
+-----+---------------+------------------------+
only showing top 10 rows

