# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET
import pandas as pd

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [3]:
# print names of all countries
for child in document_tree.getroot():
    print(child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [4]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print('* ' + element.find('name').text + ':')
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print(capitals_string[:-2])

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [6]:
document = ET.parse( './data/mondial_database.xml' )

In [7]:
def infant_mort():
                z={}
                for element in document.iterfind('country'):
                    
                    for x in (element.findall('.//infant_mortality')):
                        y={element.find('name').text:float(x.text)}
                        z.update(y)
                return z
a=infant_mort()
b=sorted(a.items(), key=lambda x: x[1], reverse=True)
b[0:10]

[('Western Sahara', 145.82),
 ('Afghanistan', 117.23),
 ('Mali', 104.34),
 ('Somalia', 100.14),
 ('Central African Republic', 92.86),
 ('Guinea-Bissau', 90.92),
 ('Chad', 90.3),
 ('Niger', 86.27),
 ('Angola', 79.99),
 ('Burkina Faso', 76.8)]

In [8]:
def city_pop():
    
    
    cp={}
    for element in document.iterfind('country'):
        capitals_string = ''
        for subelement in element.getiterator('city'):
            for z in (subelement.findall('.//population')):
                y={subelement.find('name').text:int(z.text)}
                for item in y.keys():
                    if item in cp.keys():
                        if int(y[item]) > int(cp[item]):
                              cp.update(y)
                    else:
                            cp.update(y)
    return cp
            
a=city_pop()
b=sorted(a.items(), key=lambda x: x[1], reverse=True)
b[0:10]

[('Shanghai', 22315474),
 ('Istanbul', 13710512),
 ('Delhi', 12877470),
 ('Mumbai', 12442373),
 ('Moskva', 11979529),
 ('Beijing', 11716620),
 ('São Paulo', 11152344),
 ('Tianjin', 11090314),
 ('Guangzhou', 11071424),
 ('Shenzhen', 10358381)]

In [9]:
def ethnic_pop():
    
    ep={}
    for element in document.iterfind('country'):
        for subelement in (element.findall('./ethnicgroup')):
            for perc in subelement.attrib.values():
                percn=float(perc)
                for subpop in (element.findall('./population')):
                    pop={}
                    y={element.find('name').text:int(subpop.text)}
                    pop.update(y)
                percn=float(perc)*int(subpop.text)/100
            y={subelement.text:int(percn)}
            ep.update(y)
            
    return ep
            
a=ethnic_pop()
b=sorted(a.items(), key=lambda x: x[1], reverse=True)
b[0:10]

[('Han Chinese', 1245058800),
 ('Indo-Aryan', 871815583),
 ('Dravidian', 302713744),
 ('Bengali', 146776916),
 ('Japanese', 126534212),
 ('Eastern Hamitic', 82830376),
 ('Mulatto', 78065896),
 ('Viet/Kinh', 76078375),
 ('English', 53592326),
 ('Mediterranean Nordic', 46815916)]

In [10]:
def river_len():
    
    
    cp={}
    for element in document.iterfind('river'):
        for subelement in element.getiterator('length'):
            y={element.find('name').text:float(element.find('length').text)}
            #print(y)
            cp.update(y)
    return cp

def river_coun():
    rc={}
    for element in document.iterfind('river'):
        for subelement in element.getiterator('length'):
            y={element.attrib['country']:element.find('name').text}
            #print(y)
            rc.update(y)
    return rc 
      
a=river_len()
b=river_coun()
df_a=pd.DataFrame(list(a.items()),columns=['River','Length'])
df_b=pd.DataFrame(list(b.items()),columns=['Country','River'])
df=df_a.merge(df_b,how='inner',on='River')
df.sort_values(by='Length').tail(1)

Unnamed: 0,River,Length,Country
43,Amazonas,6448.0,CO BR PE


In [11]:
def lake_area():
    
    
    la={}
    for element in document.iterfind('lake'):
        for subelement in element.getiterator('area'):
            y={element.find('name').text:float(element.find('area').text)}
            #print(y)
            la.update(y)
    return la

def lake_coun():
    lc={}
    for element in document.iterfind('lake'):
        for subelement in element.getiterator('area'):
            y={element.attrib['country']:element.find('name').text}
            #print(y)
            lc.update(y)
    return lc 
      
a=lake_area()
b=lake_coun()
df_a=pd.DataFrame(list(a.items()),columns=['Lake','Area'])
df_b=pd.DataFrame(list(b.items()),columns=['Country','Lake'])
df=df_a.merge(df_b,how='inner',on='Lake')
df.sort_values(by='Area').tail(1)



Unnamed: 0,Lake,Area,Country
24,Caspian Sea,386400.0,R AZ KAZ IR TM


In [12]:
def airport_elevation():
    
    
    ae={}
    for element in document.iterfind('airport'):
        for subelement in element.getiterator('elevation'):
            try:
                y={element.find('name').text:float(element.find('elevation').text)}
                #print(y)
                ae.update(y)
            except:
                    pass
                    #print('No elevation data')
    return ae

def airport_coun():
    ac={}
    for element in document.iterfind('airport'):
        for subelement in element.getiterator('elevation'):
            y={element.attrib['country']:element.find('name').text}
            #print(y)
            ac.update(y)
    return ac 
      
a=airport_elevation()
b=airport_coun()
df_a=pd.DataFrame(list(a.items()),columns=['Airport','Elevation'])
df_b=pd.DataFrame(list(b.items()),columns=['Country','Airport'])
df=df_a.merge(df_b,how='inner',on='Airport')
df.sort_values(by='Elevation').tail(1)




Unnamed: 0,Airport,Elevation,Country
47,Jorge Wilsterman,2549.0,BOL
