In [1]:
import xml.etree.ElementTree as ET

In [2]:
# Parse the XML file into an ElementTree object; the relative path is resolved against the current working directory.
tree = ET.parse('data.xml')

In [3]:
# Check what ET.parse() handed back: an ElementTree wrapper object, not an Element.
print(type(tree))

<class 'xml.etree.ElementTree.ElementTree'>


In [4]:
# Fetch the document's root element; the rest of the notebook navigates from it.
root = tree.getroot()
# Bare last expression so the notebook shows the element's repr.
root

<Element 'data' at 0x7fa31747bf90>

In [5]:
# Root element at a glance: tag name, attribute dict, number of direct children (one per line).
print(root.tag, root.attrib, len(root), sep='\n')

data
{}
3


In [56]:
# The root is an Element, unlike the ElementTree object returned by ET.parse().
print(type(root))

<class 'xml.etree.ElementTree.Element'>


In [6]:
# Elements support list-style indexing over their direct children:
# root[0] is the first child of the root.
country1 = root[0]

In [7]:
# Display the element's repr to see which tag the first child carries.
country1

<Element 'country' at 0x7fa31747f040>

In [9]:
# Indexing works at every level: this is the first child of the first
# child of the root (a grandchild of the root).
rank = country1[0]

In [10]:
# Display the grandchild element's repr.
rank

<Element 'rank' at 0x7fa31747f090>

In [11]:
# Tag name of the grandchild element (first child of the first child of the root).
print(rank.tag)

rank


In [12]:
# Attribute dict and number of children of the grandchild element (one per line).
print(rank.attrib, len(rank), sep='\n')

{}
0


In [13]:
# Text content stored directly inside the grandchild element.
print(rank.text)

1


In [14]:
# Attributes of the child at index 4 (the fifth child) of the first country.
# NOTE(review): the index is hard-coded; presumably this is the last child — confirm against data.xml.
print(country1[4].attrib)

{'name': 'Switzerland', 'direction': 'W'}


In [15]:
# Third child of the root (indices are zero-based, so index 2).
country3 = root[2]

In [16]:
# First child of the third child of the root.
rank3 = country3[0]

In [17]:
# Tag name of that grandchild element.
print(rank3.tag)

rank


In [18]:
# Text content stored directly inside that grandchild element.
print(rank3.text)

68


In [19]:
# Attributes of the child at index 4 (the fifth child) of the third country.
# NOTE(review): the index is hard-coded; presumably this is the last child — confirm against data.xml.
print(country3[4].attrib)

{'name': 'Colombia', 'direction': 'E'}


In [20]:
# Print the name and rank of every 'country' element directly under the root.
for country in root.findall('country'):
    # 'rank' is a child element of the country, so read its text.
    # It is stored under its own name so that the `rank` Element bound in an
    # earlier cell is not overwritten with a string (which would break
    # re-running the cells that call rank.tag / rank.attrib).
    rank_text = country.find('rank').text
    # 'name' is an XML attribute of the country, not a child element.
    name = country.get('name')
    print(name, rank_text)

Liechtenstein 1
Singapore 4
Panama 68


In [21]:
# Element.iter() searches the whole subtree below the root, so nested
# 'neighbor' elements are found without spelling out the path to them.
for neighbor in root.iter('neighbor'):
    print(neighbor.attrib)

{'name': 'Austria', 'direction': 'E'}
{'name': 'Switzerland', 'direction': 'W'}
{'name': 'Malaysia', 'direction': 'N'}
{'name': 'Costa Rica', 'direction': 'W'}
{'name': 'Colombia', 'direction': 'E'}


In [22]:
# Top-level elements: "." selects the element the search starts from,
# so the result is a one-item list holding the root itself.
root.findall(".")

[<Element 'data' at 0x7fa31747bf90>]

In [36]:
# Keep the result of the "." query so it can be inspected piece by piece.
b = root.findall(".")
print(
    b,                     # the result list itself
    type(b),               # findall() returns a plain list
    b[0].tag,              # tag of the only match
    len(b[0]),             # how many children that match has
    b[0][0],               # its first child element
    b[0][0].get('name'),   # 'name' attribute of that first child
    sep='\n',
)


[<Element 'data' at 0x7fa31747bf90>]
<class 'list'>
data
3
<Element 'country' at 0x7fa31747f040>
Liechtenstein


In [23]:
# All 'neighbor' grand-children of 'country' children of the top-level elements.
# The path is matched step by step: root -> country -> neighbor.
root.findall("./country/neighbor")

[<Element 'neighbor' at 0x7fa31747f180>,
 <Element 'neighbor' at 0x7fa31747f1d0>,
 <Element 'neighbor' at 0x7fa31747f360>,
 <Element 'neighbor' at 0x7fa31747f4f0>,
 <Element 'neighbor' at 0x7fa31747f540>]

In [38]:
# Keep the neighbor query result so individual matches can be inspected.
c = root.findall("./country/neighbor")
print(
    c,             # every matched element
    type(c),       # findall() returns a plain list
    c[0].tag,      # tag of the first match
    len(c[0]),     # number of children of the first match
    c[0].attrib,   # attributes of the first match
    sep='\n',
)


[<Element 'neighbor' at 0x7fa31747f180>, <Element 'neighbor' at 0x7fa31747f1d0>, <Element 'neighbor' at 0x7fa31747f360>, <Element 'neighbor' at 0x7fa31747f4f0>, <Element 'neighbor' at 0x7fa31747f540>]
<class 'list'>
neighbor
0
{'name': 'Austria', 'direction': 'E'}


In [39]:
# Elements with name='Singapore' that have a 'year' child.
# Read the path left to right: find any 'year' (.//year), step up to its
# parent (..), and keep that parent only if its name attribute matches.
root.findall(".//year/..[@name='Singapore']")

[<Element 'country' at 0x7fa31747f220>]

In [55]:
# Keep the parent-of-year query result and inspect the single match.
d = root.findall(".//year/..[@name='Singapore']")
print(
    d,                        # the result list
    type(d),                  # findall() returns a plain list
    len(d),                   # number of matches
    d[0],                     # the matched element
    d[0].tag,                 # its tag
    d[0].attrib,              # its full attribute dict
    d[0].attrib['name'],      # a single attribute value
    d[0].find('year').text,   # text of its 'year' child
    sep='\n',
)

[<Element 'country' at 0x7fa31747f220>]
<class 'list'>
1
<Element 'country' at 0x7fa31747f220>
country
{'name': 'Singapore'}
Singapore
2011


In [47]:
# 'year' elements that are children of elements with name='Singapore'.
# Here the attribute filter is applied to the parent (any tag, "*") and the
# path then steps down to its 'year' child.
root.findall(".//*[@name='Singapore']/year")

[<Element 'year' at 0x7fa31747f2c0>]

In [50]:
# Keep the year query result and inspect the single match.
f = root.findall(".//*[@name='Singapore']/year")
print(
    f,             # the result list
    type(f),       # findall() returns a plain list
    len(f),        # number of matches
    f[0],          # the matched element
    f[0].tag,      # its tag
    f[0].attrib,   # its attribute dict
    f[0].text,     # its text content
    sep='\n',
)


[<Element 'year' at 0x7fa31747f2c0>]
<class 'list'>
1
<Element 'year' at 0x7fa31747f2c0>
year
{}
2011


In [51]:
# All 'neighbor' elements that are the second 'neighbor' child of their parent.
# The [2] position is 1-based and counts only siblings with the same tag, not
# all children: a match may sit at a later index among its parent's children.
root.findall(".//neighbor[2]")

[<Element 'neighbor' at 0x7fa31747f1d0>,
 <Element 'neighbor' at 0x7fa31747f540>]

In [53]:
# Keep the positional query result and inspect both matches.
g = root.findall(".//neighbor[2]")
print(
    g,             # the result list
    type(g),       # findall() returns a plain list
    len(g),        # number of matches
    g[0],          # first matched element
    g[0].tag,      # its tag
    g[0].attrib,   # attributes of the first match
    g[1].attrib,   # attributes of the second match
    sep='\n',
)

[<Element 'neighbor' at 0x7fa31747f1d0>, <Element 'neighbor' at 0x7fa31747f540>]
<class 'list'>
2
<Element 'neighbor' at 0x7fa31747f1d0>
neighbor
{'name': 'Switzerland', 'direction': 'W'}
{'name': 'Colombia', 'direction': 'E'}


In [54]:
""" Extract the name, rank, year and gdppc from the countries and create a Pandas DataFrame. 
       Try to do it alone before checking the solution here."""

# My solution

import pandas as pd

# One list per output column, filled in document order.
Name = []
Rank = []
Year = []
GDPPC = []


for country in root.findall('country'):
    # 'name' is an attribute of the country; the other three values are the
    # text of child elements, looked up by tag.
    Name.append(country.get('name'))
    Rank.append(country.find('rank').text)
    Year.append(country.find('year').text)
    GDPPC.append(country.find('gdppc').text)

# Text read from XML is always a string. Convert the numeric columns while
# building the frame so that sorting, comparisons and arithmetic work on
# numbers rather than on text.
df = pd.DataFrame({
    'Name': Name,
    'Rank': pd.to_numeric(Rank),
    'Year': pd.to_numeric(Year),
    'GDPPC': pd.to_numeric(GDPPC),
})
df

Unnamed: 0,Name,Rank,Year,GDPPC
0,Liechtenstein,1,2008,141100
1,Singapore,4,2011,59900
2,Panama,68,2011,13600


In [57]:
# Official Solution

import xml.etree.ElementTree as ET
import pandas as pd

tree = ET.parse('data.xml')  # Load from file
root = tree.getroot()

# Column name -> list of values, one entry per country.
my_dict = {'name': [],
           'rank': [],
           'year': [],
           'gdppc': []}


# Select 'country' children explicitly and look every field up by tag, so the
# result does not depend on the order of a country's child elements or on
# other kinds of elements appearing under the root.
for country in root.findall('country'):
    name_value = country.attrib['name']
    my_dict['name'].append(name_value)

    rank_value = country.find('rank').text
    my_dict['rank'].append(rank_value)

    year_value = country.find('year').text
    my_dict['year'].append(year_value)

    gdppc_value = country.find('gdppc').text
    my_dict['gdppc'].append(gdppc_value)

df = pd.DataFrame(my_dict)

# Text read from XML is always a string; convert the numeric columns so they
# sort and compute as numbers.
numeric_columns = ['rank', 'year', 'gdppc']
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric)
df

Unnamed: 0,name,rank,year,gdppc
0,Liechtenstein,1,2008,141100
1,Singapore,4,2011,59900
2,Panama,68,2011,13600
