In [9]:
# Install the XML-to-dict parser that is imported below as `xmltodict`.
# NOTE(review): the version is unpinned, so a re-run may pick up a different release — consider pinning.
%pip install -q xmltodict

Note: you may need to restart the kernel to use updated packages.


In [10]:
import pyspark

# Memory setting applied to both the Spark driver and the executors.
MAX_MEMORY = "8g"  # 24 gives OOM here.

# Session options, applied one by one to the builder below.
spark_options = [
    # Delta Lake package plus its SQL extension and catalog implementation.
    ("spark.jars.packages", "io.delta:delta-core_2.12:0.8.0"),
    ("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"),
    ("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"),
    ("spark.executor.memory", MAX_MEMORY),
    ("spark.driver.memory", MAX_MEMORY),
]

session_builder = pyspark.sql.SparkSession.builder.appName("MyApp")
for option_key, option_value in spark_options:
    session_builder = session_builder.config(option_key, option_value)

# Hive support is enabled before the session is obtained (or created).
spark = session_builder.enableHiveSupport().getOrCreate()
spark

In [11]:
import xmltodict
import json
def xml_to_json(x):
    """Convert an XML document string into a JSON string.

    Args:
        x: XML text to convert. ``None`` is returned unchanged.

    Returns:
        The JSON serialization of ``xmltodict.parse(x)``, or ``None`` when
        ``x`` is ``None`` or the document cannot be parsed or serialized.
    """
    if x is None:
        return None
    try:
        return json.dumps(xmltodict.parse(x))
    except Exception:
        # Best effort: a bad document becomes a null value instead of failing
        # the caller. Catching `Exception` rather than using a bare `except`
        # lets KeyboardInterrupt and SystemExit propagate.
        return None
# Expose the Python function to Spark SQL under the name "xml_to_json".
spark.udf.register(name="xml_to_json", f=xml_to_json)

<function __main__.xml_to_json(x)>

In [12]:
import requests
# Download the sample plant catalog and wrap the raw XML text in a one-row
# DataFrame with a single string column named `xml`.
PLANT_CATALOG_URL = 'https://www.w3schools.com/xml/plant_catalog.xml'
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops an HTTP error page from being treated as XML.
response = requests.get(PLANT_CATALOG_URL, timeout=30)
response.raise_for_status()
xml = response.text
df_with_xml = spark.createDataFrame([[xml]],'xml string')

In [19]:
# Keep every existing column and add `json`, produced by calling the
# registered xml_to_json SQL function on the `xml` column.
json_projection = ['*', 'xml_to_json(xml) as json']
df_with_json = df_with_xml.selectExpr(*json_projection)
df_with_json.show()

+--------------------+--------------------+
|                 xml|                json|
+--------------------+--------------------+
|<?xml version="1....|{"CATALOG": {"PLA...|
+--------------------+--------------------+



In [14]:
# Infer the JSON schema by letting Spark read the JSON strings themselves.
# Rows whose XML could not be converted carry a null `json` value; they are
# dropped first so that only real JSON documents feed schema inference.
json_strings = (
    df_with_json.rdd
    .map(lambda row: row.json)
    .filter(lambda text: text is not None)
)
json_schema = spark.read.json(json_strings).schema
print(json_schema)

StructType(List(StructField(CATALOG,StructType(List(StructField(PLANT,ArrayType(StructType(List(StructField(AVAILABILITY,StringType,true),StructField(BOTANICAL,StringType,true),StructField(COMMON,StringType,true),StructField(LIGHT,StringType,true),StructField(PRICE,StringType,true),StructField(ZONE,StringType,true))),true),true))),true)))


In [15]:
# Display the inferred schema in its compact one-line string form.
json_schema.simpleString()

'struct<CATALOG:struct<PLANT:array<struct<AVAILABILITY:string,BOTANICAL:string,COMMON:string,LIGHT:string,PRICE:string,ZONE:string>>>>'

In [24]:
import pyspark.sql.functions as psf
# Parse the JSON text with the inferred schema and attach the result as a
# new `structured_data` column.
parsed_json = psf.from_json(df_with_json['json'], json_schema)
df_with_structured_data = df_with_json.withColumn('structured_data', parsed_json)

In [30]:
# Register the DataFrame as a temporary view so it can be queried by name through spark.sql.
df_with_structured_data.createOrReplaceTempView('view_with_structured_data')

In [37]:
# One output row per element of the PLANT array, with the element's fields
# expanded into top-level columns.
explode_plants_query = '''
  select plant.* 
  from view_with_structured_data
  lateral view explode(structured_data.catalog.plant) as plant
'''
final_clean_df = spark.sql(explode_plants_query)


In [38]:
# Convert the exploded result to a pandas DataFrame for display.
# NOTE(review): toPandas() brings every row to the driver — presumably fine for this sample catalog; limit first for larger inputs.
final_clean_df.toPandas()

Unnamed: 0,AVAILABILITY,BOTANICAL,COMMON,LIGHT,PRICE,ZONE
0,31599,Sanguinaria canadensis,Bloodroot,Mostly Shady,$2.44,4
1,30699,Aquilegia canadensis,Columbine,Mostly Shady,$9.37,3
2,51799,Caltha palustris,Marsh Marigold,Mostly Sunny,$6.81,4
3,30699,Caltha palustris,Cowslip,Mostly Shady,$9.90,4
4,12099,Dicentra cucullaria,Dutchman's-Breeches,Mostly Shady,$6.44,3
5,41899,Asarum canadense,"Ginger, Wild",Mostly Shady,$9.03,3
6,12699,Hepatica americana,Hepatica,Mostly Shady,$4.45,4
7,10299,Hepatica americana,Liverleaf,Mostly Shady,$3.99,4
8,20199,Arisaema triphyllum,Jack-In-The-Pulpit,Mostly Shady,$3.23,4
9,60599,Podophyllum peltatum,Mayapple,Mostly Shady,$2.98,3
