In [1]:
# Basics of XML
# XML (eXtensible Markup Language) is a markup language used to store and transport data. It is both human-readable
#  and machine-readable. Here's a simple example of an XML file:

In [9]:
<note>
  <to>Tove</to>
  <from>Jani</from>
  <heading>Reminder</heading>
  <body>Don't forget me this weekend!</body>
</note>

SyntaxError: unterminated string literal (detected at line 5) (3117821911.py, line 5)

In [13]:
# Basic XML Parsing in Python
# We'll use Python's built-in xml.etree.ElementTree module for parsing XML files.

# 1. Reading and Parsing XML
# Let's start with reading and parsing a simple XML file.

# Example: 

In [14]:
import xml.etree.ElementTree as ET

# Sample XML content
xml_data = """
<note>
  <to>Tove</to>
  <from>Jani</from>
  <heading>Reminder</heading>
  <body>Don't forget me this weekend!</body>
</note>
"""

# Parse XML
root = ET.fromstring(xml_data)

# Access elements
to = root.find('to').text
from_ = root.find('from').text
heading = root.find('heading').text
body = root.find('body').text

print(f'To: {to}')
print(f'From: {from_}')
print(f'Heading: {heading}')
print(f'Body: {body}')


To: Tove
From: Jani
Heading: Reminder
Body: Don't forget me this weekend!


In [19]:
# Navigating through XML Elements
# For more complex XML structures, you may need to navigate through nested elements and attributes.

# Example:

In [21]:
import xml.etree.ElementTree as ET

# Sample XML content
xml_data = """
<library>
  <book id="1">
    <title>1984</title>
    <author>George Orwell</author>
  </book>
  <book id="2">
    <title>To Kill a Mockingbird</title>
    <author>Harper Lee</author>
  </book>
</library>
"""

# Parse XML
root = ET.fromstring(xml_data)

# Iterate through books
for book in root.findall('book'): # Find all is used to find all the elements with the tag name 'book' in the XML
    book_id = book.get('id')
    title = book.find('title').text
    author = book.find('author').text
    print(f'Book ID: {book_id}, Title: {title}, Author: {author}')

Book ID: 1, Title: 1984, Author: George Orwell
Book ID: 2, Title: To Kill a Mockingbird, Author: Harper Lee


In [22]:
# Handling Namespaces
# Namespaces can make XML parsing more complex. Here's an example of handling namespaces.

# Example:

In [30]:
import xml.etree.ElementTree as ET

# Sample XML content
xml_data = """
<root xmlns:h="http://www.w3.org/TR/html4/">
  <h:table>
    <h:tr>
      <h:td>Apples</h:td>
      <h:td>Bananas</h:td>
    </h:tr>
  </h:table>
</root>
"""

# Parse XML with namespaces
namespaces = {'h': 'http://www.w3.org/TR/html4/'} # Namspace means h is the prefix for the URL http://www.w3.org/TR/html4/
root = ET.fromstring(xml_data)

# Access elements with namespaces using findall.
# Find all 'td' elements in the XML content and print their text.
#  ../h:td means find all 'td' elements in the XML content. Here .// means search in the entire XML content.
for td in root.findall('.//h:td', namespaces):
    print(td.text)

print ("-"*20)
# Lets see an example of multiple namespaces
# xmlns means XML namespace. It is used to avoid name conflicts.
# in root we have two namespaces h and f and they denote two different URLs.
# we can use the prefix to search the elements in the XML content.

xml_data = """
<root xmlns:h="http://www.w3.org/TR/html4/" xmlns:f="https://www.w3schools.com/furniture">
  <h:table>
    <h:tr>
      <h:td>Apples</h:td>
      <h:td>Bananas</h:td>
      <h:td>Oranges</h:td>
    </h:tr>
  </h:table>
  <f:table>
    <f:name>African Coffee Table</f:name>
    <f:width>80</f:width>
    <f:length>120</f:length>
    <f:td>kiwi</f:td>
  </f:table>
</root>
"""

# Parse XML with multiple namespaces
namespaces = {'h': 'http://www.w3.org/TR/html4/', 'f': 'https://www.w3schools.com/furniture'}
# we define namespace as a dictionary with prefix as key and URL as value.
# this will help us to use the prefix in the search query. 

root = ET.fromstring(xml_data)

# Access elements with namespaces using findall.
# Find all 'td' elements in the XML content and print their text.

for td in root.findall('.//f:td', namespaces):
    print(td.text)

print("-"*20)

for td in root.findall('.//h:td', namespaces):
    print(td.text)

Apples
Bananas
--------------------
kiwi
--------------------
Apples
Bananas
Oranges


In [31]:
# Modifying XML
# You can also modify XML content and write it back to a file.

# Example:

In [32]:
import xml.etree.ElementTree as ET

# Sample XML content
xml_data = """
<note>
  <to>Tove</to>
  <from>Jani</from>
  <heading>Reminder</heading>
  <body>Don't forget me this weekend!</body>
</note>
"""

# Parse XML
root = ET.fromstring(xml_data)

# Modify an element
root.find('body').text = "Don't forget our meeting this weekend!"

# Write back to a file
tree = ET.ElementTree(root)
tree.write('modified_note.xml')


In [34]:
# Advanced Data Engineering with XML

# For data engineering, you might deal with large XML files and need to use efficient methods
# for parsing and extracting data.

# Example: Parsing Large XML with Iterative Parsing

In [38]:
import xml.etree.ElementTree as ET

# Function to parse large XML files efficiently
def parse_large_xml(file_path):
    # Use iterparse to parse large XML files efficiently and avoid loading the entire file into memory
    # iterparse is an iterative parser that only loads parts of the XML file into memory as needed.
    # This is useful for large XML files that don't fit into memory.
    
    # We can use iterparse to parse the XML file in an iterative manner and clear the root element after
    # processing each element.
    context = ET.iterparse(file_path, events=("start", "end"))
    # iterparse takes the file path and events as arguments. Here, we specify "start" and "end" events.
    # This means that the parser will generate events when it encounters the start and end of an element.
    # start event is generated when the parser encounters the start of an element.
    # end event is generated when the parser encounters the end of an element which means the element is fully parsed.
    context = iter(context)
    # We use the iter() function to create an iterator from the context object.
    # context object is an iterator that generates events when the parser encounters the start and end of an element.
    event, root = next(context)
    # next function is used to get the next event and root element from the context iterator.
    # Iterate through the XML content
    # We iterate through the XML content using a for loop and process each element.
    for event, elem in context:
        if event == "end" and elem.tag == "book":
            # elem.tag is used to get the tag name of the element.
            # Here, we check if the tag name is "book" and process the element.
            book_id = elem.get('id')
            title = elem.find('title').text
            author = elem.find('author').text
            print(f'Book ID: {book_id}, Title: {title}, Author: {author}')
            root.clear()

# Example file path
file_path = 'example.xml'
parse_large_xml(file_path)


Book ID: 1, Title: 1984, Author: George Orwell
Book ID: 2, Title: To Kill a Mockingbird, Author: Harper Lee
Book ID: 3, Title: The Great Gatsby, Author: F. Scott Fitzgerald
Book ID: 4, Title: Catch-22, Author: Joseph Heller
Book ID: 5, Title: The Catcher in the Rye, Author: J.D. Salinger
Book ID: 6, Title: Brave New World, Author: Aldous Huxley
Book ID: 7, Title: Animal Farm, Author: George Orwell
Book ID: 8, Title: The Hobbit, Author: J.R.R. Tolkien
Book ID: 9, Title: Fahrenheit 451, Author: Ray Bradbury
Book ID: 10, Title: Jane Eyre, Author: Charlotte Brontë
