# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [2]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [3]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [12]:
# print names of all countries
for child in document_tree.getroot():
    print(child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [13]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

SyntaxError: invalid syntax (<ipython-input-13-6645883cfe43>, line 3)

****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [69]:
#1) 10 countries with the lowest infant mortality rates
import pandas as pd
document = ET.parse( 'mondial_database.xml' )
df = pd.DataFrame(columns=['Country','InfantMortality'])
for element in document.iterfind('country'):
    for subelement in element.getiterator('infant_mortality'):
        df.loc[-1] = [element.find('name').text, subelement.text]  # adding a row
        df.index = df.index + 1  # shifting index
df = df.sort('InfantMortality')
df.head(10)



Unnamed: 0,Country,InfantMortality
191,Monaco,1.81
199,Romania,10.16
85,Fiji,10.2
164,Brunei,10.48
103,Grenada,10.5
6,Mauritius,10.59
111,Panama,10.7
0,Seychelles,10.77
133,United Arab Emirates,10.92
122,Barbados,10.93


In [70]:
#2)10 cities with the largest population
import numpy as np
df = pd.DataFrame(columns=['City','Country','Population'])
for cityCountry in document.getiterator('city'):
    dict = {}
    countryName = cityCountry.get('country')
    populationCheck = cityCountry.find('population')
    if populationCheck is not None:
        for popul in cityCountry.getiterator('population'):
            measuredOrCensus = popul.get('measured')
            if measuredOrCensus == "census":
                dict[popul.get('year')] = popul.text
    if len(dict) != 0:
        df.loc[-1] = [cityCountry.find('name').text, countryName, dict[max(dict)]]  # adding a row
        df.index = df.index + 1  # shifting index  
df['Population'] = df['Population'].astype(str).astype(int)
df = df.sort('Population', ascending=False)
df.head(10)



Unnamed: 0,City,Country,Population
1585,Shanghai,CN,22315474
1457,Mumbai,IND,12442373
1586,Beijing,CN,11716620
1997,Moskva,R,11612885
393,São Paulo,BR,11152344
1584,Tianjin,CN,11090314
1629,Guangzhou,CN,11071424
1432,Delhi,IND,11034555
1627,Shenzhen,CN,10358381
1617,Wuhan,CN,9785388


In [71]:
# 3) 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
import numpy as np
df = pd.DataFrame(columns=['Ethnic Group','Population'])
dictEthnic = {}
for country in document.getiterator('country'):
    dictPopulation = {}
    populationCheck = country.find('population')
    if populationCheck is not None:
        for popul in country.getiterator('population'):
            measuredOrCensus = popul.get('measured')
            if measuredOrCensus == "census":
                dictPopulation[popul.get('year')] = popul.text
    if len(dictPopulation) != 0:
        ethnicCheck = country.find('ethnicgroup')
        if ethnicCheck is not None:
             for ethnic in country.getiterator('ethnicgroup'):
                    if ethnic.text in dictEthnic:
                        existingValue = dictEthnic[ethnic.text]
                        existingValue = float(existingValue)
                        percentage = ethnic.get('percentage')
                        percentage = float(percentage)
                        NewValue = existingValue + (float(dictPopulation[max(dictPopulation)]) *(percentage/100))
                        dict = {ethnic.text:NewValue}
                        dictEthnic.update(dict)
                    else:
                        percentage = ethnic.get('percentage')#.astype(str).astype(int)
                        percentage = float(percentage)
                        NewValue = (float(dictPopulation[max(dictPopulation)]) *(percentage/100))
                        dictEthnic[ethnic.text] = NewValue
for k,v in dictEthnic.items():
    df.loc[-1] = [k, v]  # adding a row
    df.index = df.index + 1  # shifting index  
df = df.sort('Population', ascending=False)
df.head(10)



Unnamed: 0,Ethnic Group,Population
221,Malay,89244340.0
260,Thai,49490040.0
173,Polish,37678960.0
225,Burman,34965210.0
13,African,30106140.0
182,Arab,28814750.0
262,Chinese,26693940.0
229,Taiwanese,18732780.0
25,Sinhalese,14995160.0
97,Arab-Berber,14024790.0


In [72]:
#4 a) name and country of a) longest river
df = pd.DataFrame(columns=['River Name','Country','Length'])
for river in document.getiterator('river'):
    countryName = river.get('country')
    length = river.find('length')
    if length is not None:
        lengthNumber = river.find('length').text
        riverName = river.find('name').text
        df.loc[-1] = [riverName,countryName, lengthNumber]
        df.index = df.index + 1
df.loc[df['Length'].idxmax()]

River Name    Selenge
Country         R MNG
Length            992
Name: 111, dtype: object

In [73]:
#$ b) largest lake
df = pd.DataFrame(columns=['Lake Name','Country','Area'])
for lake in document.getiterator('lake'):
    countryName = lake.get('country')
    area = lake.find('area')
    if area is not None:
        areaNumber = lake.find('area').text
        lakeName = lake.find('name').text
        df.loc[-1] = [lakeName,countryName, areaNumber]
        df.index = df.index + 1
        
df.loc[df['Area'].idxmax()]

Lake Name    Fort Peck Lake
Country                 USA
Area                    981
Name: 18, dtype: object

In [74]:
#4 c) airport at highest elevation
df = pd.DataFrame(columns=['Airport Name','Country','Elevation'])
for airport in document.getiterator('airport'):
    countryName = airport.get('country')
    elevation = airport.find('elevation')
    if elevation is not None:
        elevationNumber = airport.find('elevation').text
        airportName = airport.find('name').text
        df.loc[-1] = [airportName,countryName, elevationNumber]
        df.index = df.index + 1

df = df.dropna(axis=0, how='any')
df.loc[df['Elevation'].idxmax()]

Airport Name    Mashhad
Country              IR
Elevation           995
Name: 778, dtype: object