# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [233]:
from xml.etree import ElementTree as ET

%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [14]:
document_tree = ET.parse( './data/mondial_database_less.xml' )
root = document_tree.getroot()

In [9]:
# print names of all countries
for child in document_tree.getroot():
    print (child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [251]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print ('* ' + element.find('name').text + ':'),
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print (capitals_string[:-2])

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [15]:
document = ET.parse( './data/mondial_database.xml' )
root= document.getroot()

In [257]:
#inspecting the data
for child in root.iter('country'):
    for grand in child:
        print(grand)

<Element 'name' at 0x1066df8b8>
<Element 'population' at 0x1066df868>
<Element 'population' at 0x1066df818>
<Element 'population' at 0x1066df7c8>
<Element 'population' at 0x1066df778>
<Element 'population' at 0x1066df728>
<Element 'population' at 0x1066df6d8>
<Element 'population' at 0x1066df688>
<Element 'population' at 0x1066df638>
<Element 'population' at 0x1066df5e8>
<Element 'population_growth' at 0x1066df598>
<Element 'infant_mortality' at 0x1066df548>
<Element 'gdp_total' at 0x1066df4f8>
<Element 'gdp_agri' at 0x1066df4a8>
<Element 'gdp_ind' at 0x1066df458>
<Element 'gdp_serv' at 0x1066df408>
<Element 'inflation' at 0x1066df3b8>
<Element 'unemployment' at 0x1066df368>
<Element 'indep_date' at 0x1066df318>
<Element 'government' at 0x1066df2c8>
<Element 'encompassed' at 0x1066df278>
<Element 'ethnicgroup' at 0x1066df228>
<Element 'ethnicgroup' at 0x1066df1d8>
<Element 'religion' at 0x1066df188>
<Element 'religion' at 0x1066df138>
<Element 'religion' at 0x1066df0e8>
<Element 'langu

# Find 10 countries with the lowest infant mortality rates

In [308]:
data = []
#search through all the countries
for child in root.findall('country'):
    #find the country name
    if child.find('name') != None:
        name = child.find('name')
    #find the country's infant mortality rate
    if child.find('infant_mortality') != None:
        infant_mort = child.find('infant_mortality')
    #find the population.  the highest population should correspond to most recent
    if child.findall('population') != None:
        poplist = []
        for pop in child.findall('population'):
            poplist.append(float(pop.text))
        population = max(poplist)
    data.append([name.text, float(infant_mort.text),population])

    
country_data= pd.DataFrame(data, columns = ['Name','Infant Mortality','Population'])
country_data.sort_values('Infant Mortality',ascending=False).head(10)

Unnamed: 0,Name,Infant Mortality,Population
194,Western Sahara,145.82,554795.0
54,Afghanistan,117.23,26023100.0
189,Mali,104.34,14517176.0
226,Somalia,100.14,9636173.0
213,Central African Republic,92.86,4349921.0
230,Guinea-Bissau,90.92,1586624.0
214,Chad,90.3,11720781.0
192,Niger,86.27,17138707.0
195,Angola,79.99,24383301.0
201,Burkina Faso,76.8,17322796.0


In [412]:
data = []
for country in root.findall('country'):
    for city in country.findall('city'):
        #find country name
        country_name= country.find('name').text
        #find the city name
        if city.find('name') != None:
            city_name = city.find('name').text
        #find the city population
        if city.findall('population') != None:
            poplist = []
            for pop in city.findall('population'):
                poplist.append(float(pop.text))
            if np.size(poplist) != 0:
                population = max(poplist)
        data.append([city_name,country_name,population])
city_data = pd.DataFrame(data, columns=['name','country','population'])
city_data.sort_values('population',ascending=False).head(10)
            

Unnamed: 0,name,country,population
176,Seoul,South Korea,10229262.0
164,Al Qahirah,Egypt,8471859.0
80,Bangkok,Thailand,7506700.0
129,Macao,Macao,7055071.0
128,Hong Kong,Hong Kong,7055071.0
92,Ho Chi Minh,Vietnam,5968384.0
212,Singapore,Singapore,5076700.0
163,Al Iskandariyah,Egypt,4123869.0
216,New Taipei,Taiwan,3939305.0
177,Busan,South Korea,3813814.0
