# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [5]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [3]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [4]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [5]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [6]:
document = ET.parse( './data/mondial_database.xml' )

In [26]:
for child in document.findall('country'):
    if child.find('infant_mortality') is not None:
        print(child.find('infant_mortality').text)
    else :
        print(child.find('name').text)

13.19
4.78
7.9
6.16
Montenegro
Kosovo
3.69
3.31
3.33
4.16
2.63
3.46
5.09
3.31
4.33
5.35
4.04
3.73
3.64
7.91
6
6.19
8.1
7.08
4.18
4.28
3.66
5.84
5.87
15.08
10.16
21.43
4.1
6.7
5.71
3.36
2.48
2.6
1.81
6.29
3.47
Holy See
Ceuta
Melilla
3.15
3.74
4.52
3.86
3.59
4.17
12.93
4.48
Svalbard
4.44
117.23
14.79
39
57.48
35.03
38.13
19.84
13.97
16.68
26.67
9.68
45.67
44.91
43.19
37.89
10.48
13.69
54.53
9.86
51.36
18.99
21.61
24.5
28.71
2.73
3.13
23.15
40.43
Christmas Island
Cocos Islands
8.54
30.6
3.98
22.41
25.16
38.79
39.67
37.53
15.73
7.51
14.58
15.79
7.98
28.6
2.13
3.93
24.59
14
10.92
50.41
17.64
6.42
2.53
9.02
4.49
3.4
13.29
11.74
12.5
10.93
20.31
23.51
12.58
2.48
13.45
4.71
6.17
6.21
8.7
20.36
10.7
4.7
11.61
19.63
49.43
18.44
18.72
9.42
10.5
8.3
13.69
7.1
13.66
Curacao
9.05
Saint Martin
Saint Barthelemy
15.02
7.73
8.98
11.75
6.95
13.07
24.82
10.97
6.79
8.92
4.43
14.33
10.2
4.78
5.51
35.37
21.39
21.93
8.21
5.46
4.59
Niue
Norfolk Island
5.5
11.46
Pitcairn
16.17
12.36
31.69
Tokelau
16.41
4.49
20.

In [28]:
import pandas as pd

In [62]:
# lowest mortality rates
im = pd.DataFrame(columns = ['im'])
for child in document.findall('country'):
    if child.find('infant_mortality') is not None:
        im.ix[child.find('name').text] = float(child.find('infant_mortality').text)
    else :
        im.ix[child.find('name').text] = None
im.sort_values(by = 'im').head(10)

Unnamed: 0,im
Monaco,1.81
Japan,2.13
Bermuda,2.48
Norway,2.48
Singapore,2.53
Sweden,2.6
Czech Republic,2.63
Hong Kong,2.73
Macao,3.13
Iceland,3.15


In [85]:
# highest population cities
pop = pd.DataFrame(columns = ['pop'])
for country in document.findall('country'):
    for city in country.findall('city'):
        for child in city.findall('population') :
            # only using measurements from this decade
            if int(child.attrib['year']) >= 2010 :
                if child is not None:
                    pop.ix[city.find('name').text + " (" + child.attrib['year'] + ")"] = int(child.text)
                else :
                    pop.ix[city.find('name').text] = None
pop.sort_values(by = 'pop', ascending = False).head(10)

Unnamed: 0,pop
Seoul (2010),9708483.0
Singapore (2010),5076700.0
New Taipei (2012),3939305.0
Busan (2010),3403135.0
Kaohsiung (2012),2778659.0
Santo Domingo (2010),2749703.0
Taichung (2012),2684893.0
Taipei (2012),2673226.0
Incheon (2010),2637652.0
Daegu (2010),2444085.0


In [87]:
# best estimates for each country's population
cpop = pd.DataFrame(columns = ['pop'])
for country in document.findall('country') :
    best = None
    for p in country.findall('population') :
        best = p
    cpop.ix[country.find('name').text] = int(best.text) 

Unnamed: 0,pop
Albania,2800138.0
Greece,10816286.0
Macedonia,2059794.0
Serbia,7120666.0
Montenegro,620029.0


In [115]:
# highest population ethnic groups
eth = pd.DataFrame(columns = ['pop'])
for country in document.findall('country'):
    for eg in country.findall('ethnicgroup'):
        if eg.text in eth.index:
            eth.ix[eg.text] += float(eg.attrib['percentage']) * .01 * cpop.ix[country.find('name').text]
        else :
            eth.ix[eg.text] = float(eg.attrib['percentage']) * .01 * cpop.ix[country.find('name').text]
eth.sort_values(by = 'pop', ascending = False).head(10)

Unnamed: 0,pop
Han Chinese,1245059000.0
Indo-Aryan,871815600.0
European,494872200.0
African,318325100.0
Dravidian,302713700.0
Mestizo,157734400.0
Bengali,146776900.0
Russian,131857000.0
Japanese,126534200.0
Malay,121993600.0


In [122]:
# longest river
lrn = ""
lrl = 0
lrc = ""
for river in document.findall('river'):
    if (river.find('length') is not None):
        if(float(river.find('length').text) > lrl):
            lrn = river.find('name').text
            lrl = float(river.find('length').text)
            lrc = river.attrib['country']
print("Longest River: " + lrn + " of " + lrc)

Longest River: Amazonas of CO BR PE


In [124]:
# largest lake
lln = ""
lla = 0
llc = ""
for lake in document.findall('lake'):
    if (lake.find('area') is not None):
        if(float(lake.find('area').text) > lla):
            lln = lake.find('name').text
            lla = float(lake.find('area').text)
            llc = lake.attrib['country']
print("Largest Lake: " + lln + " of " + llc)

Largest Lake: Caspian Sea of R AZ KAZ IR TM


In [126]:
# highest airport
han = ""
hae = 0
hac = ""
for air in document.findall('airport'):
    if (air.find('elevation').text is not None):
        if(float(air.find('elevation').text) > hae):
            han = air.find('name').text
            hae = float(air.find('elevation').text)
            hac = air.attrib['country']
print("Highest Airport: " + han + " of " + hac)

Highest Airport: El Alto Intl of BOL
