In [None]:
import time
import os, gc
import pyspark
import pandas as pd
from pyspark.sql.types import *
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, ArrayType, StringType
import xml.etree.ElementTree as ET
# Tell PySpark which extra packages to request at launch: spark-xml (Scala 2.12
# build, 0.15.0) and hadoop-core 1.2.1, followed by the `pyspark-shell` target.
# NOTE(review): presumably this has to run before the SparkSession in the next
# cell is created for the packages to be picked up — confirm.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.12:0.15.0,org.apache.hadoop:hadoop-core:1.2.1 pyspark-shell'

In [None]:
# Get (or create) the session against the master at spark://cm013:47322,
# asking for 32g of driver memory.
spark = (
    SparkSession.builder
    .config("spark.driver.memory", "32g")
    .master("spark://cm013:47322")
    .getOrCreate()
)
# Bare expression: lets the notebook render the session object.
spark

In [None]:
# Columns to keep from every parsed row.
tags = ["title", "text", "ip", "id"]

# Read dump.xml through spark-xml: root element <mediawiki>, one row per
# <page>, element attributes excluded; then keep only the columns in `tags`.
df = (
    spark.read.format("com.databricks.spark.xml")
    .option("rootTag", "mediawiki")
    .option("rowTag", "page")
    .option("excludeAttribute", True)
    .load("dump.xml")
    .select(*tags)
)

In [None]:
# Smaller sample file for quick experiments.
inputFile = "small_dump.xml"

# Same spark-xml reader, here with <root> as the root tag and <page> as the row tag.
xmlDF = (
    spark.read
    .format("com.databricks.spark.xml")
    .option("rootTag", "root")
    .option("rowTag", "page")
    .load(inputFile)
)

xmlDF.show()


In [None]:
from pyspark.sql.functions import udf, explode, expr
from pyspark.sql.types import StructType, StructField, StringType, ArrayType

def parse_xml(xml_string, tags=("title", "text")):
    """Extract the text of selected child tags from every ``<page>`` element.

    Parameters
    ----------
    xml_string : str or None
        A well-formed XML document or fragment. ``None`` and the empty string
        (e.g. a null cell reaching a Spark UDF) yield an empty list instead of
        raising.
    tags : sequence of str, optional
        Names of the direct children to read from each page, in output order.
        Defaults to ``("title", "text")``, i.e. the original two-field result.

    Returns
    -------
    list of tuple
        One tuple per ``<page>`` element found under the root (the root itself
        is included when it is a ``<page>``). Each tuple holds the
        ``findtext`` result for every requested tag; a missing child gives
        ``None``.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If ``xml_string`` is non-empty but not well-formed XML.
    """
    # Nothing to parse: return an empty result rather than failing the whole job.
    if not xml_string:
        return []

    root = ET.fromstring(xml_string)
    return [
        tuple(page.findtext(tag) for tag in tags)
        for page in root.iter("page")
    ]

# Spark UDF around parse_xml; declared to return an array of structs with
# nullable string fields `title` and `text`.
parse_xml_udf = udf(
    f=parse_xml,
    returnType=ArrayType(
        StructType([
            StructField(name="title", dataType=StringType(), nullable=True),
            StructField(name="text", dataType=StringType(), nullable=True),
        ])
    ),
)

In [None]:
# The XML reader (rowTag="page") already parses each <page> element into a
# structured row, so the wanted fields are selected by their nested paths.
# The earlier version tried to split and XPath-query a string column named
# `xml`, which this reader does not produce.
# NOTE(review): the paths below mirror the XPaths previously used
# (page/title, page/id, page/revision/contributor/{username,id}); confirm they
# match the inferred schema of dump.xml.
df = (
    spark.read.format("xml")
    .option("rootTag", "mediawiki")
    .option("rowTag", "page")
    .option("inferSchema", "true")
    .option("mode", "DROPMALFORMED")  # skip records that fail to parse
    .load("dump.xml")
    .selectExpr(
        "title",
        "id",
        "revision.contributor.username AS contributor_username",
        "revision.contributor.id AS contributor_id",
    )
)

In [None]:
import os

# Delete the scratch file if it is there. A missing file is not an error, so the
# cell stays re-runnable (plain os.remove raises FileNotFoundError the second time).
try:
    os.remove("remp.xml")
except FileNotFoundError:
    pass

In [None]:
# Tags to extract from each page. Defined first because the UDF lambda below
# reads it; it used to be assigned only after the UDF had already been applied.
tags_to_extract = ["id", "ip", "title"]

# NOTE(review): "xpath" does not look like a documented option of the XML
# reader, and nothing visible here creates the `xml` column used below — confirm.
df = spark.read.format("xml") \
    .option("rootTag", "mediawiki") \
    .option("rowTag", "page") \
    .option("xpath", "concat(substring-before(., '</page>'), '</page>') AS xml") \
    .load("dump.xml") \
    .repartition(10)

# NOTE(review): `tags_schema` is not defined in any visible cell, and this call
# passes parse_xml a second (tags) argument — confirm both exist before running.
parse_xml_udf = udf(lambda x: parse_xml(x, tags_to_extract), tags_schema)

# Split on the closing tag. The pattern previously read '</page>)', whose stray
# ')' makes it an invalid regular expression.
df = df.selectExpr("explode(split(xml, '</page>')) as page")

df = df.selectExpr("page AS xml").select(parse_xml_udf("xml").alias("page")).select("page.*")

# Keep only the requested columns that actually exist in the result.
df = df.select([c for c in df.columns if c in tags_to_extract])


In [None]:
# Read the dump as plain text.
file_rdd = spark.read.text("dump.xml")

# Configure a spark-xml reader (rowTag="page", wholeFile="true"). No .load() is
# called here, so `df` holds the configured reader rather than loaded data.
df = (
    spark.read.format("com.databricks.spark.xml")
    .option("rowTag", "page")
    .option("wholeFile", "true")
)

In [None]:
# Read dump.xml as (byte offset, record text) pairs using spark-xml's Hadoop
# input format. XmlInputFormat needs to be told where a record starts and ends;
# without these two settings it has no tags to split on.
xml_rdd = spark.sparkContext.newAPIHadoopFile(
    'dump.xml',
    'com.databricks.spark.xml.XmlInputFormat',
    'org.apache.hadoop.io.LongWritable',
    'org.apache.hadoop.io.Text',
    conf={
        'xmlinput.start': '<page>',
        'xmlinput.end': '</page>',
    },
)

In [None]:
def iterate_tree(element, data):
    """Walk ``element`` depth-first and collect page fields into ``data``.

    Tags are matched on their local name, so both plain tags (``title``) and
    namespace-qualified ones (``{http://...}title``) are recognised.

    Parameters
    ----------
    element : xml.etree.ElementTree.Element
        Root of the subtree to scan.
    data : list
        Four-slot accumulator ``[title, text, ips, usernames]``. Slots 0 and 1
        are overwritten by each ``title`` / ``text`` element met (the last one
        wins); slots 2 and 3 are lists that every ``ip`` / ``username`` text is
        appended to. The list is modified in place.

    Returns
    -------
    list
        The same ``data`` object, after the whole subtree has been visited.
    """
    tag = element.tag
    # ElementTree writes namespaced tags as "{uri}local"; keep the local part.
    local_name = tag.rsplit("}", 1)[-1] if isinstance(tag, str) else None

    if local_name == "title":
        data[0] = element.text
    elif local_name == "text":
        data[1] = element.text
    elif local_name == "ip":
        data[2].append(element.text)
    elif local_name == "username":
        data[3].append(element.text)

    # Recurse into every child; `data` is shared, so results accumulate in place.
    for child in element:
        iterate_tree(child, data)
    return data


In [None]:
# Row layout produced by iterate_tree: two nullable strings, then two nullable
# arrays of strings.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ('title', StringType()),
        ('text', StringType()),
        ('ips', ArrayType(StringType())),
        ('usernames', ArrayType(StringType())),
    )
])

# For each (key, xml_text) record: parse the XML, collect the fields into a
# fresh accumulator, and freeze the result as a tuple.
parsed_rdd = xml_rdd.map(
    lambda record: tuple(
        iterate_tree(ET.fromstring(record[1]), ["", "", [], []])
    )
)

df = spark.createDataFrame(parsed_rdd, schema)

df.show()

In [None]:
# Display the first five rows of `df`.
df.head(5)

In [None]:
# Reuse the running SparkContext, or create one named "XML to RDD" if none exists.
# Calling the SparkContext(...) constructor directly raises when a context is
# already active (for example the one behind an existing SparkSession), so the
# getOrCreate classmethod is used instead.
sc = SparkContext.getOrCreate(SparkConf().setAppName("XML to RDD"))

In [None]:
# Quick look at the start of `df` (head() with its default argument).
df.head()

In [None]:
import pandas as pd

# Load output.csv with pandas. This rebinds `df` to a pandas DataFrame.
df = pd.read_csv(filepath_or_buffer="output.csv")
df.head(n=5)

In [None]:
# Value of the namespace-qualified "text" column at row position 2.
df["{http://www.mediawiki.org/xml/export-0.10/}text"].iloc[2]