# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [24]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [25]:
# print names of all countries
for child in document_tree.getroot():
    print(child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [26]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print('* ' + element.find('name').text + ':'),
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print(capitals_string[:-2])

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


In [36]:
for child in document_tree.getroot():
    c = child.find('city')
    print(c)

<Element 'city' at 0x10dd6c2c8>
None
<Element 'city' at 0x10dddf3b8>
<Element 'city' at 0x10dde4638>
<Element 'city' at 0x10ddc29a8>
<Element 'city' at 0x10be353b8>
<Element 'city' at 0x10be32228>


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [17]:
document = ET.parse( './data/mondial_database.xml' )

infant_mortality = []
for child in document.getroot():
    country = child.find('name').text
    try:
        mortality = child.find('infant_mortality').text
        infant_mortality.append((country, float(mortality)))
    except:
        pass
    

#print(infant_mortality)
top_ten = sorted(infant_mortality, key=lambda x: x[1])[:10]
for tup in top_ten:
    print(tup[0])

Monaco
Japan
Norway
Bermuda
Singapore
Sweden
Czech Republic
Hong Kong
Macao
Iceland


In [46]:
populations = []
for element in document.iterfind('country'):
    for subelement in element.getiterator('city'):
        city = subelement.find('name').text
        population = 0
        try:
            population = int(subelement.findall('population')[-1].text)
        except:
            pass
        populations.append((city, population))

top_ten = sorted(populations, key=lambda x: x[1])[-10:]
for tup in top_ten:
    print(tup[0])

Shenzhen
Delhi
Guangzhou
Tianjin
São Paulo
Beijing
Moskva
Mumbai
Istanbul
Shanghai


In [62]:
ethnicity = {}

for element in document.iterfind('country'):
    country = element.find('name').text
    population = 0
    try:
        population = int(element.findall('population')[-1].text)
        ethnicgroups = element.findall('ethnicgroup')
        for ethnicgroup in ethnicgroups:
            group = ethnicgroup.text
            percentage = int(ethnicgroup.get('percentage'))
            group_pop = int((percentage * population) / 100)
            if group in ethnicity:
                ethnicity[group] = ethnicity[group] + group_pop
            else:
                ethnicity[group] = group_pop
    except:
        pass

from pprint import pprint
import operator
top_ten = dict(sorted(ethnicity.items(), key=operator.itemgetter(1), reverse=True)[:10])
pprint(sorted(list(top_ten.items()), key=lambda x: x[1], reverse=True))
    

[('Indo-Aryan', 871815583),
 ('Dravidian', 302713744),
 ('African', 222345317),
 ('Mestizo', 156241854),
 ('Bengali', 146776916),
 ('European', 121052971),
 ('Javanese', 113456006),
 ('Arab', 90906516),
 ('Eastern Hamitic', 82830376),
 ('Amerindian', 53060111)]


In [69]:
airports = []
rivers = []
lakes = []

for element in document.iterfind('airport'):
    try:
        country = element.get('country')
        name = element.find('name').text
        elevation = int(element.find('elevation').text)
        airports.append((country, name, elevation))
    except:
        pass
for element in document.iterfind('river'):
    try:
        country = element.get('country')
        name = element.find('name').text
        length = int(element.find('length').text)
        rivers.append((country, name, length))
    except:
        pass

for element in document.iterfind('lake'):
    try:
        country = element.get('country')
        name = element.find('name').text
        area = int(element.find('area').text)
        lakes.append((country, name, area))
    except:
        pass

highest_airport = sorted(airports, key=lambda x: x[2], reverse=True)[0]
longest_river = sorted(rivers, key=lambda x: x[2], reverse=True)[0]
largest_lake = sorted(lakes, key=lambda x: x[2], reverse=True)[0]

print(highest_airport, longest_river, largest_lake)

('BOL', 'El Alto Intl', 4063) ('CO BR PE', 'Amazonas', 6448) ('R AZ KAZ IR TM', 'Caspian Sea', 386400)
