# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [3]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [7]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [60]:
# type(document_tree)

# countries = document_tree.findall('country')
# # help(countries[0].find('infant_mortality'))
# for country in countries:
#     print country.find('name').text
#     print (float(country.find('infant_mortality').text) if country.find('infant_mortality') is not None else 100.0)

In [67]:
# 1. 10 countries with the lowest infant mortality rates

document_tree = ET.parse( './data/mondial_database.xml' )

def getImmortalityAscendingkey(countryElem):
    return float(countryElem.find('infant_mortality').text) if countryElem.find('infant_mortality') is not None else 100.0

def sortCountriesByImmortality(root):
    root[:] = sorted(root, key=lambda country:getImmortalityAscendingkey(country))

root = document_tree.getroot()
sortCountriesByImmortality(root)

countries = root.findall('country')
# help(countries[0].find('infant_mortality'))
for country in countries[:10]:
    print country.find('name').text, ':', float(country.find('infant_mortality').text)


Monaco : 1.81
Japan : 2.13
Norway : 2.48
Bermuda : 2.48
Singapore : 2.53
Sweden : 2.6
Czech Republic : 2.63
Hong Kong : 2.73
Macao : 3.13
Iceland : 3.15


In [81]:
# 2. 10 cities with the largest population

def getPopulationByDescending(cityElem):
    return int(cityElem.find('population').text) if cityElem.find('population') is not None else 0

cities = root.findall('country/city')
# help(cities.sort)
cities.sort(key=lambda city: getPopulationByDescending(city), reverse=True)

# help(countries[0].find('infant_mortality'))
for city in cities[:10]:
    print city.find('name').text, ':', int(city.find('population').text if city.find('population') is not None else 0)


Seoul : 10229262
Hong Kong : 7055071
Al Qahirah : 6053000
Bangkok : 5876000
Ho Chi Minh : 3924435
Busan : 3813814
New Taipei : 3722082
Hanoi : 3056146
Al Iskandariyah : 2917000
Taipei : 2626138


In [20]:
# 3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
document_tree = ET.parse( './data/mondial_database.xml' )
root = document_tree.getroot()

def getPopulationByDescendingYear(population):
    return int(population.get('year'))

countries = root.findall('country')

ethnicGroupPopulationDict={}

for country in countries:
    
    populations = country.findall('population')
    populations.sort(key=lambda population: getPopulationByDescendingYear(population), reverse=True)
    # Use the latest available population as the current estimate     
    countryPopulation = int(populations[0].text)
    
#     print country.find('name').text, ':', ':', populations[0].text
    ethnicGroups = country.findall('ethnicgroup')
    for ethnicGroup in ethnicGroups:
        ethnicGroupPopulationInCountry = float(ethnicGroup.get('percentage')) * (countryPopulation/100)
#         print ethnicGroup.text, ':', ethnicGroup.get('percentage'), ':', ethnicGroupPopulationInCountry
        if ethnicGroup.text in ethnicGroupPopulationDict:
            ethnicGroupPopulationDict[ethnicGroup.text].append(ethnicGroupPopulationInCountry)
        else:
            ethnicGroupPopulationDict[ethnicGroup.text] = [ethnicGroupPopulationInCountry]

ethnicGroupPopulationDict

overallEthnicGroupPopulationDict = {}

for key,lis in ethnicGroupPopulationDict.items():
    overallEthnicGroupPopulationDict[key] = sum(lis)

overallEthnicGroupPopulationDict
sorted(overallEthnicGroupPopulationDict, key=overallEthnicGroupPopulationDict.get, reverse=True)[:10]


['Han Chinese',
 'Indo-Aryan',
 'European',
 'African',
 'Dravidian',
 'Mestizo',
 'Bengali',
 'Russian',
 'Japanese',
 'Malay']

In [29]:
# 4. name and country of a) longest river
def getRiverLength(river):
    return float(river.find('length').text) if river.find('length') is not None else 0.0
rivers = root.findall('river')
rivers.sort(key=lambda river: getRiverLength(river), reverse=True)
rivers[0].get('country')

'CO BR PE'

In [32]:
# 4. name and country of  b) largest lake
def getLakeArea(lake):
    return float(lake.find('area').text) if lake.find('area') is not None else 0.0
lakes = root.findall('lake')
lakes.sort(key=lambda river: getLakeArea(river), reverse=True)
lakes[0].get('country')



'R AZ KAZ IR TM'

In [37]:
# 4. name and country of c) airport at highest elevation
def getAirportElevation(airport):
#     print airport.find('name').text
    return float(airport.find('elevation').text) if airport.find('elevation').text is not None else 0.0

airports = root.findall('airport')
airports.sort(key=lambda airport: getAirportElevation(airport), reverse=True)
airports[0].get('country')


'BOL'