# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [30]:
from xml.etree import ElementTree as ET
import numpy as np
import pandas as pd

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [3]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [4]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [7]:
document = ET.parse( './data/mondial_database.xml' )

In [60]:
root = document.getroot()
root.tag

'mondial'

In [152]:
def find_cities(e):
    for elem in e.findall('city'):
        yield elem
    for p in e.findall('province'):
        for elem in p.findall('city'):
            yield elem

country_dict = {}
city_dict = {}
ethnic_list = {}

for country in root.iter('country'):
    country_name = country.find('name').text
    country_dict[country_name] = {}
    country_dict[country_name]['name'] = country_name
    try:
        country_dict[country_name]['infant_mortality'] = country.find('infant_mortality').text
    except:
        country_dict[country_name]['infant_mortality'] = np.nan
    
    country_dict[country_name]['area'] = country.attrib.get('area', np.nan)
    country_dict[country_name]['capital'] = country.attrib.get('capital', np.nan)
    
    # Populate country information in country_dict
    for elem in country.findall('ethnicgroup'):
        ethn = {}
        ethn_name = elem.text
        ethn['name'] = ethn_name
        ethn['percentage'] = elem.attrib['percentage']
        ethnic_dict[ethn_name] = ethn
        ethnic_dict[country_name]
        
    # Populate ethnic group information in ethn_dict
    for elem in find_cities(country):
        city = {}
        city_name = elem.find('name').text
        city['name'] = city_name
        population_years = []
        
        for e in elem.findall('population'):
            population_years.append(int(e.attrib['year']))
            key = 'population_' + e.attrib['year']
            city[key] = e.text
        try:
            latest_year = max(population_years)
            latest_year_key = 'population_' + str(latest_year)
            city['population_latest'] = city[latest_year_key]
        except:
            city['population_latest'] = np.nan
        
        city_dict[city_name] = city

In [153]:
#  10 countries with the lowest infant mortality rates
df = pd.DataFrame.from_dict(country_dict, orient='index')
df.infant_mortality = df.infant_mortality.astype(float)
df.sort('infant_mortality', ascending=False).head(10).infant_mortality



Western Sahara              145.82
Afghanistan                 117.23
Mali                        104.34
Somalia                     100.14
Central African Republic     92.86
Guinea-Bissau                90.92
Chad                         90.30
Niger                        86.27
Angola                       79.99
Burkina Faso                 76.80
Name: infant_mortality, dtype: float64

In [154]:
# 10 cities with the largest population
dfc = pd.DataFrame.from_dict(city_dict, orient='index')
dfc.population_latest = dfc.population_latest.astype(float)
dfc.sort('population_latest', ascending=False).head(10).population_latest



Shanghai     22315474.0
Istanbul     13710512.0
Mumbai       12442373.0
Moskva       11979529.0
Beijing      11716620.0
São Paulo    11152344.0
Tianjin      11090314.0
Guangzhou    11071424.0
Delhi        11034555.0
Shenzhen     10358381.0
Name: population_latest, dtype: float64

In [156]:
#  10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
dfe = pd.DataFrame.from_dict(ethnic_dict, orient='index')
dfe.head()

Unnamed: 0,percentage,name
Acholi,4.0,Acholi
Afar,1.7,Afar
African,97.0,African
African descent,50.0,African descent
African-white-Indian,90.0,African-white-Indian
