# Python XML with ElementTree
Based on the [DataCamp][df1] tutorial on parsing XML with ElementTree.

[df1]: <https://www.datacamp.com/tutorial/python-xml-elementtree>

In [10]:
import xml.etree.ElementTree as ET

In [14]:
# File name of the XML document that the cells below parse (relative path).
path = "test_xml.xml"
path

'test_xml.xml'

## Parsing XML Data

In [28]:
# Parse the XML file at `path` into an ElementTree object.
tree = ET.parse(path)

# Keep a handle on the top-level element; echoing it displays its repr.
root = tree.getroot()
root

<Element 'collection' at 0x0000022A98AE71A0>

In [29]:
# Serialize the tree to UTF-8 bytes, then decode so print() shows readable text.
xml_bytes = ET.tostring(root, encoding="utf8")
print(xml_bytes.decode('utf8'))

<?xml version='1.0' encoding='utf8'?>
<collection>
    <genre category="Action">
        <decade years="1980s">
            <movie favorite="True" title="Indiana Jones: The raiders of the lost Ark">
                <format multiple="No">DVD</format>
                <year>1981</year>
                <rating>PG</rating>
                <description>
                'Archaeologist and adventurer Indiana Jones 
                is hired by the U.S. government to find the Ark of the 
                Covenant before the Nazis.'
                </description>
            </movie>
               <movie favorite="True" title="THE KARATE KID">
               <format multiple="Yes">DVD,Online</format>
               <year>1984</year>
               <rating>PG</rating>
               <description>None provided.</description>
            </movie>
            <movie favorite="False" title="Back 2 the Future">
               <format multiple="False">Blu-ray</format>
               <year>1985</year>
  

In [30]:
# Tag name of the top-level element.
root.tag

'collection'

In [31]:
# Attribute dictionary of the top-level element.
root.attrib

{}

In [32]:
# Walk the root's direct children and show each one's tag and attributes.
for child in root:
    tag, attributes = child.tag, child.attrib
    print(tag, attributes)

genre {'category': 'Action'}
genre {'category': 'Thriller'}
genre {'category': 'Comedy'}


In [33]:
# iter() searches the whole subtree under root, so every <movie> is found
# regardless of how deeply it is nested.
movie_elements = root.iter('movie')
for movie in movie_elements:
    print(movie.attrib)

{'favorite': 'True', 'title': 'Indiana Jones: The raiders of the lost Ark'}
{'favorite': 'True', 'title': 'THE KARATE KID'}
{'favorite': 'False', 'title': 'Back 2 the Future'}
{'favorite': 'False', 'title': 'X-Men'}
{'favorite': 'True', 'title': 'Batman Returns'}
{'favorite': 'False', 'title': 'Reservoir Dogs'}
{'favorite': 'False', 'title': 'ALIEN'}
{'favorite': 'True', 'title': "Ferris Bueller's Day Off"}
{'favorite': 'FALSE', 'title': 'American Psycho'}
{'favorite': 'False', 'title': 'Batman: The Movie'}
{'favorite': 'True', 'title': 'Easy A'}
{'favorite': 'False', 'title': 'Ghostbusters'}
{'favorite': 'True', 'title': 'Robin Hood: Prince of Thieves'}


# XPath expressions

In [34]:
# Print every <description>'s text with surrounding whitespace trimmed.
# `text` is None for an element with no content (e.g. <description/>), so fall
# back to "" instead of raising AttributeError on `.strip()`.
for description in root.iter('description'):
    print((description.text or "").strip())

'Archaeologist and adventurer Indiana Jones 
                is hired by the U.S. government to find the Ark of the 
                Covenant before the Nazis.'
None provided.
Marty McFly
Two mutants come to a private academy for their kind whose resident superhero team must 
               oppose a terrorist organization with similar powers.
NA.
WhAtEvER I Want!!!?!
"""""""""
Funny movie about a funny guy
psychopathic Bateman
What a joke!
Tim (Rudd) is a rising executive who “succeeds” in finding the perfect guest, IRS employee Barry (Carell), for his boss’ monthly event,
Who ya gonna call?
Robin Hood slaying


### Understanding XPath is critically important for scanning and populating XML documents. ElementTree's `.findall()` method matches only the direct children of the referenced element when it is given a bare tag name. Pass it an XPath expression to search deeper in the tree or to filter on content.

In [43]:
# Select the movies that have a <year> child whose text is '1992'.
# The predicate is attached directly to the tag it filters (`movie[...]`), which
# is the documented form; a slash before `[` is tolerated by the path parser
# but is not part of the supported syntax.
for movie in root.findall("./genre/decade/movie[year='1992']"):
    print(movie)
    print(movie.attrib['title'])
    print()

<Element 'movie' at 0x0000022A9934CCC0>
Batman Returns

<Element 'movie' at 0x0000022A9934CE50>
Reservoir Dogs



### You can even search on attributes!

In [48]:
# Select <format> elements whose `multiple` attribute equals 'Yes' and show
# their text. The attribute predicate is attached directly to the tag
# (`format[@...]`), matching the documented XPath subset.
for format_ in root.findall("./genre/decade/movie/format[@multiple='Yes']"):
    print(format_.text)

DVD,Online
dvd, digital
DVD
DVD,VHS


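## Tip: use `..` inside of XPath to return the parent element of the current element. (ElementTree also accepts the `...` spelling, which it reads as `..` followed by `.`, i.e. "parent, then self".)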

In [56]:
# From every <format multiple="Yes">, step up one level (`/..`) to its parent
# <movie> and show that movie's tag and attributes.
for movie in root.findall("./genre/decade/movie/format[@multiple='Yes']/.."):
    print(f"{movie.tag}: {movie.attrib}")

movie: {'favorite': 'True', 'title': 'THE KARATE KID'}
movie: {'favorite': 'False', 'title': 'X-Men'}
movie: {'favorite': 'False', 'title': 'ALIEN'}
movie: {'favorite': 'False', 'title': 'Batman: The Movie'}


In [68]:
# From every <format multiple="Yes">, step up two levels (`/../..`) to the
# enclosing <decade> and show its tag and attributes.
for decade in root.findall("./genre/decade/movie/format[@multiple='Yes']/../.."):
    print(f"{decade.tag}: {decade.attrib}")

decade: {'years': '1980s'}
decade: {'years': '1990s'}
decade: {'years': '1970s'}
decade: {'years': '1960s'}


In [70]:
# From every <format multiple="Yes">, step up three levels (`/../../..`) to the
# enclosing <genre> and show its tag and attributes.
for genre in root.findall("./genre/decade/movie/format[@multiple='Yes']/../../.."):
    print(f"{genre.tag}: {genre.attrib}")

genre: {'category': 'Action'}
genre: {'category': 'Thriller'}
genre: {'category': 'Comedy'}


In [167]:
# List every <movie> reached through the genre/decade hierarchy.
# The loop variable is named for what the path actually selects (movie elements).
for movie in root.findall("./genre/decade/movie"):
    print(f"{movie.tag}: {movie.attrib}")

movie: {'favorite': 'True', 'title': 'Indiana Jones: The raiders of the lost Ark'}
movie: {'favorite': 'True', 'title': 'THE KARATE KID'}
movie: {'favorite': 'False', 'title': 'Back 2 the Future'}
movie: {'favorite': 'False', 'title': 'X-Men'}
movie: {'favorite': 'True', 'title': 'Batman Returns'}
movie: {'favorite': 'False', 'title': 'Reservoir Dogs'}
movie: {'favorite': 'False', 'title': 'ALIEN'}
movie: {'favorite': 'True', 'title': "Ferris Bueller's Day Off"}
movie: {'favorite': 'FALSE', 'title': 'American Psycho'}
movie: {'favorite': 'False', 'title': 'Batman: The Movie'}
movie: {'favorite': 'True', 'title': 'Easy A'}
movie: {'favorite': 'False', 'title': 'Ghostbusters'}
movie: {'favorite': 'True', 'title': 'Robin Hood: Prince of Thieves'}


# Using iterparse
```python
iterparse()
```

In [160]:
# Stream the file with iterparse and group movie titles by their enclosing genre.
# gs[i] is the `category` of the i-th <genre>; movies[i] is the list of `title`
# attributes of the <movie> elements seen between that genre's start and end.
gs = []
movies = []
current_titles = None  # titles of the <genre> currently open; None outside any genre

for event, elem in ET.iterparse(path, events=("start", "end")):
    if event == 'start':
        # Attributes are already available on 'start', so they are read here.
        if elem.tag == 'genre':
            current_titles = []
            gs.append(elem.attrib['category'])
        elif elem.tag == 'movie' and current_titles is not None:
            current_titles.append(elem.get('title'))
    elif event == 'end':
        if elem.tag == 'genre':
            movies.append(current_titles)
            # Close the group so a stray <movie> outside any <genre> cannot be
            # appended to a list that has already been stored in `movies`.
            current_titles = None
        # Release the element's parsed content; everything needed was read on 'start'.
        elem.clear()


In [161]:
# Show the genre categories next to the per-genre title lists built by the iterparse loop.
gs, movies

(['Action', 'Thriller', 'Comedy'],
 [['Indiana Jones: The raiders of the lost Ark',
   'THE KARATE KID',
   'Back 2 the Future',
   'X-Men',
   'Batman Returns',
   'Reservoir Dogs'],
  ['ALIEN', "Ferris Bueller's Day Off", 'American Psycho'],
  ['Batman: The Movie',
   'Easy A',
   'Ghostbusters',
   'Robin Hood: Prince of Thieves']])

In [162]:
# Pair each genre name with its list of titles (both lists are in document order).
genre_title_pairs = zip(gs, movies)
for gen, m in genre_title_pairs:
    print(f"{gen}: {m}")

Action: ['Indiana Jones: The raiders of the lost Ark', 'THE KARATE KID', 'Back 2 the Future', 'X-Men', 'Batman Returns', 'Reservoir Dogs']
Thriller: ['ALIEN', "Ferris Bueller's Day Off", 'American Psycho']
Comedy: ['Batman: The Movie', 'Easy A', 'Ghostbusters', 'Robin Hood: Prince of Thieves']


In [163]:
# Build a genre -> titles mapping straight from the two parallel lists.
d = dict(zip(gs, movies))
d

{'Action': ['Indiana Jones: The raiders of the lost Ark',
  'THE KARATE KID',
  'Back 2 the Future',
  'X-Men',
  'Batman Returns',
  'Reservoir Dogs'],
 'Thriller': ['ALIEN', "Ferris Bueller's Day Off", 'American Psycho'],
 'Comedy': ['Batman: The Movie',
  'Easy A',
  'Ghostbusters',
  'Robin Hood: Prince of Thieves']}

In [None]:
def titles_by_genre(xml_path):
    """Stream an XML file and group movie titles under their enclosing genre.

    Args:
        xml_path: File name or file object accepted by ``ET.iterparse``.

    Returns:
        A ``(genres, titles)`` tuple of parallel lists. ``genres[i]`` is the
        ``category`` attribute of the i-th ``<genre>`` element and ``titles[i]``
        is the list of ``title`` attributes of the ``<movie>`` elements seen
        between that genre's start and end tags. A ``<movie>`` that is not
        inside any ``<genre>`` is ignored.

    Raises:
        KeyError: If a ``<genre>`` element has no ``category`` attribute.
    """
    genres = []
    titles = []
    current_titles = None  # titles of the <genre> currently open; None outside any genre

    for event, elem in ET.iterparse(xml_path, events=("start", "end")):
        if event == 'start':
            # Attributes are already available on 'start', so they are read here.
            if elem.tag == 'genre':
                current_titles = []
                genres.append(elem.attrib['category'])
            elif elem.tag == 'movie' and current_titles is not None:
                current_titles.append(elem.get('title'))
        elif event == 'end':
            if elem.tag == 'genre':
                titles.append(current_titles)
                # Close the group so later movies cannot leak into a finished genre.
                current_titles = None
            # Release the element's parsed content; everything needed was read on 'start'.
            elem.clear()

    return genres, titles


# Same module-level names as before, now produced by the reusable helper.
gs, movies = titles_by_genre(path)