# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [2]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [3]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [4]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [5]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [6]:
document = ET.parse( './data/mondial_database.xml' )

In [7]:
#1. 10 countries with the lowest infant mortality rates
country_im = []
for country in document.iterfind('country'):
    if country.find('infant_mortality') is not None:
        country_im.append([country.find('name').text,country.find('infant_mortality').text])
    

In [8]:
import pandas as pd
im = pd.DataFrame(country_im)
im.columns = ["country","infant_moratality"]

In [9]:
im.infant_moratality = im.infant_moratality.astype(float)
im = im.sort_values(by='infant_moratality')
im.head(10)

Unnamed: 0,country,infant_moratality
36,Monaco,1.81
90,Japan,2.13
109,Bermuda,2.48
34,Norway,2.48
98,Singapore,2.53
35,Sweden,2.6
8,Czech Republic,2.63
72,Hong Kong,2.73
73,Macao,3.13
39,Iceland,3.15


In [10]:
#2. 10 cities with the largest population
populations = []
for country in document.iterfind('country'):
    if country.find('population') is not None:
        populations.append([country.find('name').text,country.find('population').text])
    

In [11]:
pop = pd.DataFrame(populations)
pop.columns = ["country","population"]

In [12]:
pop.population = pop.population.astype(float)
pop = pop.sort_values(by = "population")
pop.head(10)

Unnamed: 0,country,population
166,Pitcairn,68.0
83,Cocos Islands,628.0
41,Holy See,840.0
121,Cayman Islands,933.0
138,Sint Maarten,1497.0
170,Tokelau,1570.0
39,Gibraltar,1816.0
186,Falkland Islands,2043.0
159,Nauru,2066.0
52,Svalbard,2116.0


In [13]:
#3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
data = [] 

for country in document.findall('country'):
    for node in list(country):
        if node.tag == 'name':
            co = node.text
        elif node.tag == 'population':
            # the last listed population statistic is used
            pop = int(node.text)
            #meas = node.attrib['measured'] --leads to an error, potentially unpopulated at times
            yr = int(node.attrib['year'])
        elif node.tag == 'ethnicgroup':
            eth = node.text
            perc = float(node.attrib['percentage'])
            epop = int(pop * perc / 100.)
            
            data.append({'countries':co, 'country_pop':pop, 'year':yr,
                        'ethnicity':eth, 'country_percentage':perc, 'population':epop})
    
df = pd.DataFrame(data)
df.head(10)

Unnamed: 0,countries,country_percentage,country_pop,ethnicity,population,year
0,Albania,95.0,2800138,Albanian,2660131,2011
1,Albania,3.0,2800138,Greek,84004,2011
2,Greece,93.0,10816286,Greek,10059145,2011
3,Macedonia,64.2,2059794,Macedonian,1322387,2011
4,Macedonia,25.2,2059794,Albanian,519068,2011
5,Macedonia,3.9,2059794,Turkish,80331,2011
6,Macedonia,2.7,2059794,Gypsy,55614,2011
7,Macedonia,1.8,2059794,Serb,37076,2011
8,Serbia,82.9,7120666,Serb,5903032,2011
9,Serbia,0.9,7120666,Montenegrin,64085,2011


In [15]:
df.groupby('ethnicity').population.sum().sort_values(ascending=False).head(10).reset_index()

Unnamed: 0,ethnicity,population
0,Han Chinese,1245058800
1,Indo-Aryan,871815583
2,European,494872201
3,African,318325104
4,Dravidian,302713744
5,Mestizo,157734349
6,Bengali,146776916
7,Russian,131856989
8,Japanese,126534212
9,Malay,121993548


In [14]:
#4. name and country of a) longest river 

In [17]:
rivers_list=[]
rivers_df = pd.DataFrame()
for rivers in document.iterfind('river'):
    try:
        rivers_list.append({'name':rivers.find('name').text, 'length':int(rivers.find('length').text), 'country':rivers.find('located').attrib['country']})
    except:
        next
rivers_df = pd.DataFrame(rivers_list)
rivers_df.sort_values(by = 'length', ascending=False).head(1)

Unnamed: 0,country,length,name
161,CO,6448,Amazonas


In [None]:
#b) largest lake

In [19]:
lake_list=[]
lake_df = pd.DataFrame()
for lakes in document.iterfind('lake'):
    try:
        lake_list.append({'name':lakes.find('name').text, 'area':int(lakes.find('area').text), 'country':lakes.find('located').attrib['country']})
    except:
        next
lakes_df = pd.DataFrame(lake_list)
lakes_df.sort_values(by = 'area', ascending=False).head(1)

Unnamed: 0,area,country,name
42,386400,R,Caspian Sea


In [None]:
#c) airport at highest elevation

In [20]:
ap_list=[]
ap_df = pd.DataFrame()
for ap in document.iterfind('airport'):
    try:
        ap_list.append({'name':ap.find('name').text, 'elevation':int(ap.find('elevation').text), 'country':ap.attrib['country']})
    except:
        next
ap_df = pd.DataFrame(ap_list)
ap_df.sort('elevation', ascending=False).head(1)




Unnamed: 0,country,elevation,name
80,BOL,4063,El Alto Intl
