# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

XML exercise
Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find
1) 10 countries with the lowest infant mortality rates
2) 10 cities with the largest population
3) 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4) name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [46]:
from xml.etree import ElementTree as ET
# Parse the XML database file, then keep its root element: every later cell
# searches the document through `document_tree`.
parsed_document = ET.parse('mondial_database.xml')
document_tree = parsed_document.getroot()

In [98]:
#Answer Q1
# Print names and infant mortality rates 
# of countries with bottom 10 infant mortality rates 

# This removes the countries without infant mortality rate
def countriesWithInfantMortalityDataAvailable(document_tree):
    """Return the children of ``document_tree`` that report infant mortality.

    Parameters:
        document_tree: an iterable of ElementTree elements; the notebook
            passes the root element of the parsed XML document.

    Returns:
        A list, in iteration order, of the direct children that have an
        ``infant_mortality`` child element. Children without one are left out.
    """
    return [
        entry
        for entry in document_tree
        if entry.find('infant_mortality') is not None
    ]

# Keep only the countries that report an infant mortality rate
countries = countriesWithInfantMortalityDataAvailable(document_tree)

# Order a copy of that list from the lowest rate to the highest. The sort is
# stable, so countries with equal rates keep their original relative order.
sorted_countries = list(countries)
sorted_countries.sort(key=lambda entry: float(entry.find('infant_mortality').text))

# Report the ten lowest rates, one "<name> <rate>" line per country
print('The ten countries with lowest infant mortality rates are:')
for entry in sorted_countries[:10]:
    country_name = entry.find('name').text
    mortality_rate = entry.find('infant_mortality').text
    print(country_name + ' ' + mortality_rate)

The ten countries with lowest infant mortality rates are:
Monaco 1.81
Japan 2.13
Norway 2.48
Bermuda 2.48
Singapore 2.53
Sweden 2.6
Czech Republic 2.63
Hong Kong 2.73
Macao 3.13
Iceland 3.15


In [105]:
#Ans Q2
# Iterates through all the cities in the tree
# and gets the latest available population data for each city
# If the population data is not available, it is assumed 0
def findAllCities(document_tree):
    """Collect the name and most recent population figure of every city.

    Parameters:
        document_tree: ElementTree element whose descendants are searched for
            ``city`` elements.

    Returns:
        A list with one dict per city, in document order. Each dict has a
        ``'name'`` key (text of the city's ``name`` child) and a
        ``'population'`` key holding the text of the ``population`` child
        with the greatest ``year`` attribute, or the string ``'0'`` when the
        city has no ``population`` child.
    """
    def census_year(population_element):
        # Years are stored as attribute strings; compare them numerically
        return int(population_element.attrib['year'])

    cities = []
    for city_element in document_tree.iter('city'):
        city_name = city_element.find('name').text
        census_figures = city_element.findall('population')
        if census_figures:
            # max() keeps the first of several entries sharing the latest year
            population_text = max(census_figures, key=census_year).text
        else:
            # No figure recorded: use '0' so the city can still be sorted
            population_text = '0'
        cities.append({'name': city_name, 'population': population_text})
    return cities

# Collect every city together with its latest population figure
cities = findAllCities(document_tree)

# Order a copy from the most to the least populous city; populations are kept
# as text in the dicts, so they are converted to int only for comparison
sorted_cities = list(cities)
sorted_cities.sort(key=lambda entry: int(entry['population']), reverse=True)

# Report the ten most populous cities, one "<name> <population>" line each
print("The ten cities with the largest population are:")
for entry in sorted_cities[:10]:
    print(entry['name'] + ' ' + entry['population'])

The ten cities with the largest population are:
Shanghai 22315474
Istanbul 13710512
Mumbai 12442373
Moskva 11979529
Beijing 11716620
São Paulo 11152344
Tianjin 11090314
Guangzhou 11071424
Delhi 11034555
Shenzhen 10358381


In [100]:
#Ans Q3
# Iterates through all the ethnic groups in the countries in the tree
# and gets the latest available population data for each ethnic groups for that country
# by multiplying the population percentage by the latest population data available
# If the population data is not available, it is assumed 0
def getCountryWiseEthnicGroupPopulations(document_tree):
    """Estimate the size of each ethnic group, country by country.

    Parameters:
        document_tree: ElementTree element whose direct ``country`` children
            are processed.

    Returns:
        A list with one dict per country, in document order. Each dict maps
        ``'name'`` to the country's name and every ethnic group's text to its
        estimated head count: the group's ``percentage`` attribute applied to
        the ``population`` child with the greatest ``year`` attribute,
        truncated to an int. A country without a ``population`` child is
        treated as having population 0.
    """
    def census_year(population_element):
        # Years are stored as attribute strings; compare them numerically
        return int(population_element.attrib['year'])

    distributions = []
    for country_element in document_tree.iterfind('country'):
        distribution = {'name': country_element.find('name').text}

        census_figures = country_element.findall('population')
        if census_figures:
            # max() keeps the first of several entries sharing the latest year
            total_population = int(max(census_figures, key=census_year).text)
        else:
            total_population = 0

        for group_element in country_element.iterfind('ethnicgroup'):
            share = float(group_element.attrib['percentage'])
            # Percentage -> absolute count, truncated toward zero by int()
            distribution[group_element.text] = int(share * total_population * 0.01)

        distributions.append(distribution)

    return distributions

# Per-country estimates: one dict per country, keyed by ethnic group name
country_wise_ethnic_group_populations = getCountryWiseEthnicGroupPopulations(document_tree)

# List every distinct ethnic group name in order of first appearance
ethnic_group_names = []
for group_element in document_tree.iter('ethnicgroup'):
    if group_element.text not in ethnic_group_names:
        ethnic_group_names.append(group_element.text)

# Total each group's estimated population over all the countries listing it
ethnic_groups = [
    {
        'name': group_name,
        'population': sum(
            distribution[group_name]
            for distribution in country_wise_ethnic_group_populations
            if group_name in distribution
        ),
    }
    for group_name in ethnic_group_names
]

# Order a copy from the largest to the smallest overall population
sorted_ethnic_groups = list(ethnic_groups)
sorted_ethnic_groups.sort(key=lambda group: group['population'], reverse=True)

# Report the ten largest groups, one "<name> <population>" line each
print("The ten ethnic groups with the largest overall population are:")
for group in sorted_ethnic_groups[:10]:
    print(group['name'] + ' ' + str(group['population']))

The ten ethnic groups with the largest overall population are:
Han Chinese 1245058800
Indo-Aryan 871815583
European 494872201
African 318325104
Dravidian 302713744
Mestizo 157734349
Bengali 146776916
Russian 131856989
Japanese 126534212
Malay 121993548


In [97]:
def countryNameForCode(countrycode):
    """Look up a country's name from its ``car_code`` attribute.

    Searches the direct ``country`` children of the module-level
    ``document_tree``.

    Parameters:
        countrycode: value to compare against each country's ``car_code``
            attribute.

    Returns:
        The text of the ``name`` child of the first matching country, or
        ``None`` when no country has that code.
    """
    matching_names = (
        country_element.find('name').text
        for country_element in document_tree.iterfind('country')
        if country_element.attrib['car_code'] == countrycode
    )
    # Only the first match is needed; the generator stops searching after it
    return next(matching_names, None)

# Gather the name, length and country codes of every river. A river without
# a <length> child is recorded with length 0.
rivers = []
for river_element in document_tree.iter('river'):
    length_element = river_element.find('length')
    rivers.append({
        'name': river_element.find('name').text,
        'length': 0 if length_element is None else float(length_element.text),
        'country': river_element.attrib['country'],
    })

# Pick the river with the greatest length and report it
longest_river = max(rivers, key=lambda entry: entry['length'])
print("The longest river is named " + longest_river['name'] + ".")
print("Its length is " + str(longest_river['length']) + ".")
print("It is located in the following countries:")
# The 'country' attribute holds space-separated country codes
for car_code in longest_river['country'].split(' '):
    print(countryNameForCode(car_code))
print("\n")

# Gather the name, area and country codes of every lake. A lake without an
# <area> child is recorded with area 0.
lakes = []
for lake_element in document_tree.iter('lake'):
    lake_name = lake_element.find('name').text
    area_element = lake_element.find('area')
    if area_element is None:
        surface_area = 0
    else:
        surface_area = float(area_element.text)
    lakes.append({
        'name': lake_name,
        'area': surface_area,
        'country': lake_element.attrib['country'],
    })

# Pick the lake with the greatest area and report it
largest_lake = max(lakes, key=lambda entry: entry['area'])
print("The largest lake is named " + largest_lake['name'] + ".")
print("Its area is " + str(largest_lake['area']) + ".")
print("It is located in the following countries:")
# The 'country' attribute holds space-separated country codes
for car_code in largest_lake['country'].split(' '):
    print(countryNameForCode(car_code))
print("\n")

# Gather the name, elevation and country code of every airport. An airport
# whose <elevation> child is missing or has no text is recorded with
# elevation 0.
airports = []
for airport_element in document_tree.iter('airport'):
    airport = {}
    airport['name'] = airport_element.find('name').text
    airport_elevation = 0
    # find() returns None when the child is absent, so test the element
    # before reading .text (the river and lake cells guard the same way)
    airport_element_elevation = airport_element.find('elevation')
    if airport_element_elevation is not None and airport_element_elevation.text is not None:
        airport_elevation = float(airport_element_elevation.text)
    airport['elevation'] = airport_elevation
    airport['country'] = airport_element.attrib['country']
    airports.append(airport)

# Pick the airport with the greatest elevation and report it
airport_at_highest_elevation = max(airports, key=lambda airport: airport['elevation'])
print("The airport at highest elevation is named " + airport_at_highest_elevation['name'] + ".")
print("Its elevation is " + str(airport_at_highest_elevation['elevation']) + ".")
print("It is located in the following countries:")
# Splitting on spaces mirrors the river and lake cells, whose 'country'
# attribute may list several codes
for countrycode in airport_at_highest_elevation['country'].split(' '):
    print(countryNameForCode(countrycode))

The longest river is named Amazonas.
Its length is 6448.0.
It is located in the following countries:
Colombia
Brazil
Peru


The largest lake is named Caspian Sea.
Its area is 386400.0.
It is located in the following countries:
Russia
Azerbaijan
Kazakhstan
Iran
Turkmenistan


The airport at highest elevation is named El Alto Intl.
Its elevation is 4063.0.
It is located in the following countries:
Bolivia


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation