# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
# Load the XML file used by the example cells below into an ElementTree
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [3]:
# Walk the direct children of the root element and show each one's <name> text
root = document_tree.getroot()
for country in root:
    name_node = country.find('name')
    print(name_node.text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [4]:
# Print each country's name followed by a comma-separated list of its cities.
# Element.iter() replaces getiterator(), which was removed in Python 3.9;
# it visits every descendant <city>, however deeply it is nested.
for country in document_tree.iterfind('country'):
    print('* ' + country.find('name').text + ':')
    city_names = [city.find('name').text for city in country.iter('city')]
    print(', '.join(city_names))

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [5]:
import pandas as pd
from numpy import nan

# Parse the XML file that all of the exercise cells below read from
document = ET.parse('./data/mondial_database.xml')

### 1. Countries with the lowest infant mortality rates:

In [6]:
# Build a dataframe of countries and their infant mortality rates.
# Rows are collected first and the frame is built once: calling pd.concat for
# every row copies the whole frame each time (quadratic) and concatenating
# onto an empty DataFrame is deprecated in current pandas.
country_codes = []   # car_code values; used as the index so later cells can look names up by code
country_rows = []
for country in document.iterfind('country'):
    rate_text = country.findtext('infant_mortality')   # None if absent, '' if empty
    country_codes.append(country.get('car_code'))
    country_rows.append({'country': country.find('name').text,
                         'infant_mortality': float(rate_text) if rate_text else nan})

countries = pd.DataFrame(country_rows,
                         index=country_codes,
                         columns=['country', 'infant_mortality'])

# Sort and display (NaN rates sort last, so they never reach the top 10)
countries.sort_values('infant_mortality').head(10)

Unnamed: 0,country,infant_mortality
MC,Monaco,1.81
J,Japan,2.13
BERM,Bermuda,2.48
N,Norway,2.48
SGP,Singapore,2.53
S,Sweden,2.6
CZ,Czech Republic,2.63
HONX,Hong Kong,2.73
MACX,Macao,3.13
IS,Iceland,3.15


### 2.   Cities with the largest populations:

In [7]:
# Build a dataframe of cities and their populations.
# Element.iter() replaces getiterator(), which was removed in Python 3.9.
# Rows are gathered in a list and the frame is built once instead of growing
# it with pd.concat on every iteration.
city_rows = []
for country in document.iterfind('country'):
    country_name = country.find('name').text
    for city in country.iter('city'):
        # findall() always returns a list, so an emptiness check is sufficient.
        counts = city.findall('population')
        # The last <population> entry is taken as the latest figure
        # (assumes entries appear in chronological order in the file).
        latest = float(counts[-1].text) if counts else nan
        city_rows.append({'country': country_name,
                          'city': city.find('name').text,
                          'population': latest})

# The default RangeIndex reproduces the running integer row labels 0..n-1
cities = pd.DataFrame(city_rows, columns=['country', 'city', 'population'])

# Sort and display
cities.sort_values('population', ascending=False).head(10)

Unnamed: 0,city,country,population
1341,Shanghai,China,22315474.0
771,Istanbul,Turkey,13710512.0
1527,Mumbai,India,12442373.0
479,Moskva,Russia,11979529.0
1340,Beijing,China,11716620.0
2810,São Paulo,Brazil,11152344.0
1342,Tianjin,China,11090314.0
1064,Guangzhou,China,11071424.0
1582,Delhi,India,11034555.0
1067,Shenzhen,China,10358381.0


### 3.   Ethnic groups with the largest overall populations:

In [8]:
# Build a dataframe of ethnic groups by country, and their estimated populations.
# Element.iter() replaces getiterator(), which was removed in Python 3.9, and
# the frame is built once from a list of rows rather than with repeated pd.concat.
group_rows = []
for country in document.iterfind('country'):
    country_name = country.find('name').text
    counts = country.findall('population')
    # A country without a usable population figure cannot contribute an
    # estimate; skip it instead of failing on counts[-1].
    if not counts or not counts[-1].text:
        continue
    # Last <population> entry is taken as the latest total
    # (assumes entries appear in chronological order in the file).
    total = float(counts[-1].text)
    for group in country.iter('ethnicgroup'):
        # 'percentage' is the group's share of this country's population
        headcount = float(group.attrib['percentage']) / 100 * total
        group_rows.append({'country': country_name,
                           'ethnicgroup': group.text,
                           'population': headcount})

groups = pd.DataFrame(group_rows, columns=['country', 'ethnicgroup', 'population'])

# Sum across countries, sort, and display.
# Only 'population' is selected before summing: since pandas 2.0 a bare
# groupby(...).sum() would also "sum" (concatenate) the string 'country' column.
groups.groupby('ethnicgroup')[['population']].sum().sort_values('population', ascending=False).head(10)

Unnamed: 0_level_0,population
ethnicgroup,Unnamed: 1_level_1
Han Chinese,1245059000.0
Indo-Aryan,871815600.0
European,494872200.0
African,318325100.0
Dravidian,302713700.0
Mestizo,157734400.0
Bengali,146776900.0
Russian,131857000.0
Japanese,126534200.0
Malay,121993600.0


### 4(a). Name and country of longest river:

In [9]:
# Find the river with the greatest <length> and report where its source lies.
longest = 0.0
# Reset the result names: they are shared with other cells, so without this a
# run that finds no river would report a stale value (or raise NameError).
name = code = None
for river in document.iterfind('river'):
    # findtext() gives None for a missing element and '' for an empty one
    length_text = (river.findtext('length') or '').strip()
    if not length_text:
        continue
    length = float(length_text)
    if length > longest:
        longest = length
        name = river.find('name').text
        code = river.find('source').attrib['country']   # country code of the river's <source>

if name is None:
    print('Longest river: not found')
else:
    # `countries` is indexed by country code, so the code maps straight to a name
    print('Longest river: ' + name + '\nSource location: ' + countries.loc[code,'country'])

Longest river: Amazonas
Source location: Peru


### 4(b). Name and country of largest lake:

In [10]:
# Find the lake with the greatest <area> and report a country it is located in.
largest = 0.0
# Reset the result names: they are shared with other cells, so without this a
# run that finds no lake would report a stale value (or raise NameError).
name = code = None
for lake in document.iterfind('lake'):
    # findtext() gives None for a missing element and '' for an empty one
    area_text = (lake.findtext('area') or '').strip()
    if not area_text:
        continue
    area = float(area_text)
    if area > largest:
        largest = area
        name = lake.find('name').text
        # Only the first <located> child is consulted, so a lake spanning
        # several countries is reported under that first one.
        code = lake.find('located').attrib['country']

if name is None:
    print('Largest lake: not found')
else:
    # `countries` is indexed by country code, so the code maps straight to a name
    print('Largest lake: ' + name + '\nLocation: ' + countries.loc[code,'country'])

Largest lake: Caspian Sea
Location: Russia


### 4(c). Name and country of airport at highest elevation:

In [11]:
# Find the airport with the greatest <elevation> and report its country.
highest = 0.0
# Reset the result names: they are shared with other cells, so without this a
# run that finds no airport would report a stale value (or raise NameError).
name = code = None
for airport in document.iterfind('airport'):
    # findtext() gives None when <elevation> is missing and '' when it is
    # empty; both are skipped. (Reading .text directly, as before, raised
    # AttributeError for an airport with no <elevation> element.)
    elevation_text = (airport.findtext('elevation') or '').strip()
    if not elevation_text:
        continue
    elevation = float(elevation_text)
    if elevation > highest:
        highest = elevation
        name = airport.find('name').text
        code = airport.get('country')   # country code is an attribute of <airport> itself

if name is None:
    print('Highest airport: not found')
else:
    # `countries` is indexed by country code, so the code maps straight to a name
    print('Highest airport: ' + name + '\nLocation: ' + countries.loc[code,'country'])

Highest airport: El Alto Intl
Location: Bolivia
