# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [111]:
from xml.etree import ElementTree as ET
import pandas as pd
import numpy as np

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [14]:
# Parse the sample XML file into an ElementTree; the example cells query it
# through `document_tree`.
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [19]:
# Print one line per direct child of the document root, using the text of
# its <name> sub-element (in this file those children are the countries).
country_names = (node.find('name').text for node in document_tree.getroot())
for country_name in country_names:
    print(country_name)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [40]:
# Print every country as a "* <name>:" header line, followed by a line with
# the comma-separated names of its cities.
for country_node in document_tree.iterfind('country'):
    print('* ' + country_node.find('name').text + ':')

    # Element.iter() replaces getiterator(), which was removed in Python 3.9.
    # It walks the whole subtree, so <city> elements nested at any depth
    # below the country are included.
    city_names = [city.find('name').text for city in country_node.iter('city')]

    # join() yields the same text as the manual "+= name + ', '" / "[:-2]"
    # approach, including an empty line for a country without cities.
    print(', '.join(city_names))

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [86]:
# Parse the exercise data set; the exercise cells search it through `tree`.
tree = ET.parse( './data/mondial_database.xml' )
# Root element of the parsed document.
root = tree.getroot()

### 10 countries with the lowest infant mortality rates

In [114]:
# Infant mortality rate of every country that reports one, keyed by
# country name.
country_mortality_dict = {}

for country_node in tree.iterfind('country'):
    rate_node = country_node.find('infant_mortality')

    # countries without an <infant_mortality> element are left out
    if rate_node is None:
        continue

    rate = float(rate_node.text)
    country_mortality_dict[country_node.find('name').text] = rate

# The DataFrame constructor is given the dict items as a list of
# (name, rate) pairs, one pair per row.
name_rate_pairs = list(country_mortality_dict.items())
df = pd.DataFrame(name_rate_pairs, columns=['name', 'infant_mortality'])

# ten lowest rates; left as the final expression so the notebook displays it
df.sort_values(by='infant_mortality').head(10)

Unnamed: 0,name,infant_mortality
36,Monaco,1.81
90,Japan,2.13
109,Bermuda,2.48
34,Norway,2.48
98,Singapore,2.53
35,Sweden,2.6
8,Czech Republic,2.63
72,Hong Kong,2.73
73,Macao,3.13
39,Iceland,3.15


In [115]:
# Alternative take on exercise 1: build the rows directly as a list of
# [name, rate] pairs instead of going through a dict.

# Pair each country's <name> node with its <infant_mortality> node
# (None when the country does not report a rate).
node_pairs = (
    (country_node.find('name'), country_node.find('infant_mortality'))
    for country_node in tree.iterfind('country')
)

# keep only the countries that report a rate
country_mortality_list = [
    [name_node.text, float(rate_node.text)]
    for name_node, rate_node in node_pairs
    if rate_node is not None
]

# one DataFrame row per [name, rate] pair
df2 = pd.DataFrame(country_mortality_list, columns=['name', 'infant_mortality'])

# ten lowest rates; left as the final expression so the notebook displays it
df2.sort_values(by='infant_mortality').head(10)

Unnamed: 0,name,infant_mortality
36,Monaco,1.81
90,Japan,2.13
109,Bermuda,2.48
34,Norway,2.48
98,Singapore,2.53
35,Sweden,2.6
8,Czech Republic,2.63
72,Hong Kong,2.73
73,Macao,3.13
39,Iceland,3.15


### 10 cities with the largest population

In [117]:
# One [name, population] row per <city> element found anywhere in the tree.
# NOTE(review): only the population figure tagged year="2011" is read; cities
# without such a figure get NaN and therefore cannot appear in the ranking --
# confirm this restriction is intended.
largest_pop_list = []

for city_node in tree.findall('.//city'):
    name_node = city_node.find('name')
    census_node = city_node.find('population[@year="2011"]')

    # fall back to NaN when the city has no <name> / no 2011 population
    city_name = np.nan if name_node is None else name_node.text
    census_2011 = np.nan if census_node is None else int(census_node.text)

    largest_pop_list.append([city_name, census_2011])

# tabulate the rows
df3 = pd.DataFrame(largest_pop_list, columns=['name', 'population_2011'])

# ten largest 2011 populations; final expression so the notebook displays it
df3.sort_values(by='population_2011', ascending=False).head(10)

Unnamed: 0,name,population_2011
1527,Mumbai,12442373.0
1582,Delhi,11034555.0
1515,Bangalore,8443675.0
1000,London,8250205.0
1382,Tehran,8154051.0
1470,Dhaka,7423137.0
1591,Hyderabad,6731790.0
1505,Ahmadabad,5577940.0
3056,Luanda,5000000.0
1556,Chennai,4646732.0


### 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [143]:
# Estimated head count per ethnic group, summed over all countries.
ethnicgroups = dict()

for country in tree.iterfind('country'):
    groups = country.findall('ethnicgroup')

    # a country that lists no ethnic groups contributes nothing, so there is
    # no need to look at its population figures at all
    if not groups:
        continue

    # (year, population) pairs for every population figure of the country
    populations = [
        (int(population.get('year')), int(population.text))
        for population in country.findall('population')
    ]

    # Without any population figure the percentages cannot be turned into
    # head counts; skip the country instead of letting max() raise ValueError
    # on an empty list.
    if not populations:
        continue

    # tuples compare by their first item, so max() selects the pair with the
    # latest year; [1] takes its population figure
    latest_population = max(populations)[1]

    for ethnicgroup in groups:
        # share of the latest population given by the 'percentage' attribute
        population_ethnic_group = (float(ethnicgroup.get('percentage')) *
                                   latest_population / 100)

        # accumulate across countries, starting from 0 for a new group
        ethnicgroups[ethnicgroup.text] = (
            ethnicgroups.get(ethnicgroup.text, 0) + population_ethnic_group
        )

# two unnamed columns: 0 = group name, 1 = summed population
ethnicdf = pd.DataFrame(list(ethnicgroups.items()))

# ten largest groups by summed population
sorted_df = ethnicdf.sort_values([1], ascending=False).head(10)

# add column names
sorted_df.columns = ['ethnic_group', 'population']

print(sorted_df)

    ethnic_group           population
80   Han Chinese     1,245,058,800.00
106   Indo-Aryan       871,815,583.44
128     European       494,872,219.72
16       African       318,325,120.37
105    Dravidian       302,713,744.25
150      Mestizo       157,734,354.94
98       Bengali       146,776,916.72
33       Russian       131,856,996.08
139     Japanese       126,534,212.00
110        Malay       121,993,550.37


### name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [184]:
def _feature_rows(document, tag, measure_tag, country_names, skip_missing=False):
    """Return ``[name, measure, country]`` rows for every ``<tag>`` element.

    One row is produced per country code listed in the element's ``country``
    attribute; the same measurement (the float value of the ``<measure_tag>``
    child) is repeated on each of those rows.

    Args:
        document: parsed ElementTree to search.
        tag: element name to look for, e.g. ``'river'``.
        measure_tag: child element holding the numeric value, e.g. ``'length'``.
        country_names: mapping from country code to country name.
        skip_missing: when True, elements without a measurement are dropped;
            otherwise they are kept with NaN as their measurement.

    Raises:
        KeyError: if a listed country code is not in ``country_names``.
    """
    rows = []
    for feature in document.iterfind(tag):
        measure_node = feature.find(measure_tag)

        # a missing child and an empty child (no text) both mean "no value"
        if measure_node is None or measure_node.text is None:
            if skip_missing:
                continue
            measure = np.nan
        else:
            measure = float(measure_node.text)

        for code in feature.get('country').split():
            rows.append([feature.find('name').text, measure, country_names[code]])
    return rows


# map each country's car_code attribute to the country name
code_to_country_dict = {
    country.get('car_code'): country.find('name').text
    for country in tree.iterfind('country')
}

# rivers: [name, total length, country name], one row per country of the river
rivers = _feature_rows(tree, 'river', 'length', code_to_country_dict)
river_df = pd.DataFrame(rivers, columns=['name', 'length', 'country'])

# lakes: [name, total area, country name], one row per country of the lake
lakes = _feature_rows(tree, 'lake', 'area', code_to_country_dict)
lake_df = pd.DataFrame(lakes, columns=['name', 'area', 'country'])

# airports: [name, elevation, country name]; airports without an elevation
# value are left out entirely
airports = _feature_rows(tree, 'airport', 'elevation', code_to_country_dict,
                         skip_missing=True)
airport_df = pd.DataFrame(airports, columns=['name', 'elevation', 'country'])

# NOTE: idxmax() returns the first row holding the maximum, so for a feature
# that spans several countries only the first listed country is reported.

# longest river
max_river = river_df.loc[river_df['length'].idxmax()]
print('Longest river: \nName: %s, Country: %s, Length: %d' % (max_river['name'], max_river['country'], max_river['length']))

# largest lake
max_lake = lake_df.loc[lake_df['area'].idxmax()]
print("\nLargest Lake: \nName: %s, Country: %s, Area: %d" % (max_lake['name'], max_lake['country'], max_lake['area']))

# airport at the highest elevation
max_airport = airport_df.loc[airport_df['elevation'].idxmax()]
print('\nHighest airport: \nName: %s, Country: %s, Elevation: %d' % (max_airport['name'], max_airport['country'], max_airport['elevation']))

Longest river: 
Name: Amazonas, Country: Colombia, Area: 6448

Largest Lake: 
Name: Caspian Sea, Country: Russia, Area: 386400

Highest airport: 
Name: El Alto Intl, Country: Bolivia, Elevation: 4063
