# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [21]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [22]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [23]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

SyntaxError: invalid syntax (<ipython-input-23-71a7702f86c3>, line 3)

In [None]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [24]:
import pandas as pd
from xml.etree import ElementTree as ET
import numpy as np


document = ET.parse( './data/mondial_database.xml' )
root = document.getroot()

In [25]:
# Temp Data Frame 
country = ""
inf_mor = ""
d_frame = pd.DataFrame(columns = ["country","inf_mor"])
d_frame['inf_mor'] = d_frame['inf_mor'].astype(float)
d_frame

Unnamed: 0,country,inf_mor


In [26]:
# Get 10 countries with lowest infant mortality rates

for country in document.findall( 'country' ):
    print (country.tag,country.attrib)

country {'memberships': 'org-BSEC org-CEI org-CD org-SELEC org-CE org-EAPC org-EBRD org-EITI org-FAO org-IPU org-IAEA org-IBRD org-ICC org-ICAO org-ICCt org-Interpol org-IDA org-IFRCS org-IFC org-IFAD org-ILO org-IMO org-IMF org-IOC org-IOM org-ISO org-OIF org-ITU org-ITUC org-IDB org-MIGA org-NATO org-OSCE org-OPCW org-OAS org-OIC org-PCA org-UN org-UNCTAD org-UNESCO org-UNIDO org-UPU org-WCO org-WFTU org-WHO org-WIPO org-WMO org-UNWTO org-WTO', 'car_code': 'AL', 'capital': 'cty-Albania-Tirane', 'area': '28750'}
country {'memberships': 'org-AG org-BIS org-BSEC org-CD org-SELEC org-CE org-EMU org-EAPC org-EBRD org-ECB org-EIB org-CERN org-ESA org-EU org-FATF org-FAO org-IGAD org-IPU org-IAEA org-IBRD org-ICC org-ICAO org-ICJ org-ICCt org-Interpol org-IDA org-IEA org-IFRCS org-IFC org-IFAD org-IHO org-ILO org-IMO org-IMSO org-IMF org-IOC org-IOM org-OIF org-ITSO org-ITU org-ITUC org-MIGA org-NATO org-NEA org-NSG org-OECD org-OSCE org-OPCW org-OAS org-PCA org-UN org-UNCTAD org-UNESCO org

In [27]:
# Creating a DataFrame for easier sorting later
country_name = ""
infant_mortality = ""
d_frame = pd.DataFrame(columns = ["country_name","infant_mortality"])
d_frame['infant_mortality'] = d_frame['infant_mortality'].astype(float)
d_frame

for country in document.findall( 'country' ):
    for node in country.getiterator(): 
        if node.tag == 'name': #find country name tag
            if country_name == "":
                country_name = (node.text)
        if node.tag == 'infant_mortality': #find mortality rate tag
            infant_mortality = float(node.text)
    d_frame.loc[len(d_frame)] = [country_name,infant_mortality] #add country name and mortality rate to data frame
    country_name = ""
    
    
#sort data frame and find top 10 countries with lowest mortalitiy rates
d_frame.sort_values(by = 'infant_mortality').head(10)

Unnamed: 0,country_name,infant_mortality
38,Monaco,1.81
98,Japan,2.13
36,Norway,2.48
117,Bermuda,2.48
106,Singapore,2.53
37,Sweden,2.6
10,Czech Republic,2.63
78,Hong Kong,2.73
79,Macao,3.13
44,Iceland,3.15


In [None]:
# top 10 Cities with largest population

In [28]:
cityname = ""
citypopulation = ""
document = ET.parse( './data/mondial_database.xml' )
df = pd.DataFrame(columns=['CityName','Population']) #create data frame to hold country name and its popuplation
df['Population'] = df['Population'].astype(float)

#loop through country element to find city name and its population
for country in document.iterfind( 'country' ):
    for city in country.iter('city'): #find all cities within each country element
        cityname = city.find('name').text
        year = int(0)
        for node in city.iterfind('population'): #find all population elements with each city
            year = node.attrib['year'] #there are multiple population elements with different 'year' attribute
            if node.attrib['year'] >= year: #store the population number of the latest year
                citypopulation = int(node.text)
        df.loc[len(df)] = [cityname,citypopulation] #add city name and its population to data frame
        cityname = ""
        

#sort data frame to find 10 cities with largest population
df.sort_values(by = 'Population', ascending=False).head(10)

Unnamed: 0,CityName,Population
1341,Shanghai,22315474.0
771,Istanbul,13710512.0
1527,Mumbai,12442373.0
479,Moskva,11979529.0
1340,Beijing,11716620.0
2810,São Paulo,11152344.0
1342,Tianjin,11090314.0
1064,Guangzhou,11071424.0
1582,Delhi,11034555.0
1067,Shenzhen,10358381.0


In [None]:
# 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)


In [29]:
# 3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
countryname = ""
countrypopulation = 0
ethnicpopulation = 0
document = ET.parse( './data/mondial_database.xml' )
df = pd.DataFrame(columns=['Country','EthnicGroup','Population'])
df['Population'] = df['Population'].astype(float)

#loop through country element to find ethnic groups and its population
for country in document.iterfind( 'country' ):
    countryname = country.find('name').text #find country name
    year = int(0)
    countrycpopulation = int(0)
    for node in country.iterfind('population'): #find population of the country
        year = node.attrib['year']
        if node.attrib['year'] >= year: #find population of the latest year
            countrypopulation = (node.text) 
    ethnicname = None
    ethnicpopulation = 0
    for ethnic in country.iter('ethnicgroup'): #find all ethnic groups within the same country
        ethnicname = ethnic.text
        #compute each ethnic population: country population * ethnic group percentage
        ethnicpopulation = round(float(ethnic.attrib['percentage']) * 0.01 * int(countrypopulation))
        if ethnicname == None:
            ethnicname = countryname
            ethnicpopulation = countrypopulation
        df.loc[len(df)] = [countryname,ethnicname,ethnicpopulation] #store ethnic group population to data frame
    countryname = ""

#group ethnic group across all countries and sum them up to find top 10 ethnic groups and its total population
df.groupby('EthnicGroup').sum().sort_values(by = 'Population', ascending=False).head(10)

Unnamed: 0_level_0,Population
EthnicGroup,Unnamed: 1_level_1
Han Chinese,1245059000.0
Indo-Aryan,871815600.0
European,494872200.0
African,318325100.0
Dravidian,302713700.0
Mestizo,157734400.0
Bengali,146776900.0
Russian,131857000.0
Japanese,126534200.0
Malay,121993600.0


In [None]:
# name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [30]:
def find_element_with_max_attribute_value(root, element_string, attribute_string, return_element=False):
    
    largest = (0,0)
    
    for element in root.findall(element_string):
        
        element_name = element.find('name').text
        
        if element.find(attribute_string) is not None:
            if element.find(attribute_string).text is not None:
                try:
                    attribute_value = int(element.find(attribute_string).text)
                except:
                    try:
                        attribute_value = float(element.find(attribute_string).text)
                    except:
                        print ["Error: attribute could not be converted to a number", element.find(attribute_string).text]
                #print " -", element_name, attribute_value

            if attribute_value > largest[1]:
                largest = (element_name, attribute_value)

                if return_element:
                    largest = (largest[0], largest[1], element)

    return largest

longest_river = find_element_with_max_attribute_value(root, "river", "length", return_element=True)
print ("Longest river:", longest_river[0])
print ("Length:", longest_river[1])
print ("Source:", longest_river[2].find('source').attrib['country'])

print ()
largest_lake = find_element_with_max_attribute_value(root, "lake", "area", return_element=True)
print ("Largest lake:", largest_lake[0])
print ("Area:", largest_lake[1])
print ("Located:", largest_lake[2].find('located').attrib['country'])

print()
airport_at_highest_elevation = find_element_with_max_attribute_value(root, "airport", "elevation", return_element=True)
print ("Airport at highest elevation:", airport_at_highest_elevation[0])
print ("Elevation:", airport_at_highest_elevation[1])
print ("Located:", airport_at_highest_elevation[2].attrib['country'])

Longest river: Amazonas
Length: 6448
Source: PE

Largest lake: Caspian Sea
Area: 386400
Located: R

Airport at highest elevation: El Alto Intl
Elevation: 4063
Located: BOL
