# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [2]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [56]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [40]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [165]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text 
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality s
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [78]:
tree = ET.parse( './data/mondial_database.xml' )
root = tree.getroot()

 10 countries with the lowest infant mortality

In [79]:
country_infmort=[]
for i in range(len(root)):
   try:
    country_infmort.append((root[i].find('name').text, root[i].find('infant_mortality').text))
   except:
    continue

In [80]:
country_infmort.sort(key=lambda x: x[1])
print "10 countries with lowest infant mortality rates are \n"
print map(lambda x: x[0],country_infmort[:10])

10 countries with lowest infant mortality rates are 

['Monaco', 'Romania', 'Fiji', 'Brunei', 'Grenada', 'Mauritius', 'Panama', 'Seychelles', 'United Arab Emirates', 'Barbados']


10 cities with largest population

In [81]:
city_pop=[]
for i in range(len(root)):
    for subelement in root[i].getiterator('city'):
        city=subelement.find('name').text 
        year_pop = {}
        for pop in root[i].iterfind('./city/population'):
            year_pop.update({pop.attrib['year']: int(pop.text)})
            try:
                latest_pop = year_pop[str(max(map(int,year_pop.keys())))]
                city_pop.append((city,latest_pop))
            except:
                continue
city_pop.sort(key=lambda x: x[1])
print "Top 10 cities with highest population in 2011 are \n"
print map(lambda x: x[0],city_pop[-10:])

Top 10 cities with highest population in 2011 are 

['Mokpo', 'Jeju', 'Gunsan', 'Chuncheon', 'Yeosu', 'Goyang', 'Yongin', 'Bucheon', 'Ansan', 'Anyang']


In [83]:
country_pop = []
ethnic_groups = {}
for i in range(len(root)):
    country = root[i].find('name').text
    year_pop = {}
    for subelement in root[i].iterfind('population'):
        year_pop.update({subelement.attrib['year']: int(subelement.text)})
    try:
        latest_pop = year_pop[str(max(map(int,year_pop.keys())))]
        country_pop.append((country,latest_pop))
        for ethnic_element in root[i].iterfind('ethnicgroup'):
            new_pop = float(ethnic_element.attrib['percentage'])*latest_pop
            if ethnic_element in ethnic_groups.keys():
                ethnic_groups[ethnic_element.text] = ethnic_groups[ethnic_element.text] + new_pop
            else:
                ethnic_groups.update({ethnic_element.text: new_pop})
    except:
        continue
print "The top 10 ethnicities with largest overall populations\n"
print sorted(ethnic_groups, key=ethnic_groups.get)[-10:]

The top 10 ethnicities with largest overall populations

['Mediterranean Nordic', 'English', 'Viet/Kinh', 'Mulatto', 'Eastern Hamitic', 'Japanese', 'Bengali', 'Dravidian', 'Indo-Aryan', 'Han Chinese']


In [84]:
country_riv = []

for i in range(len(root)): 
        for subelement in root[i].getiterator('river'):
                try:
                    river=subelement.find('name').text
                    country=subelement.attrib['country']
                    length=int(subelement.find('length').text)
                    country_riv.append((country,river,length))
                except:
                    continue 
            
country_riv.sort(key=lambda x: x[2])
print "country with longest river"
print map(lambda x: x,country_riv[-1:])


country with longest river
[('CO BR PE', 'Amazonas', 6448)]


In [16]:
country_air = []
for i in range(len(root)):
    for subelement in root[i].getiterator('airport'):
        try:
                lake=subelement.find('name').text
                country=subelement.attrib['country']
                length=int(subelement.find('elevation').text)
                country_air.append((country,lake,length))
        except:
            continue 
            
country_air.sort(key=lambda x: x[2])

print "airport at highest elevation"
print map(lambda x: x[:2],country_air[-1:])


airport at highest elevation
[('BOL', 'El Alto Intl')]


In [15]:
country_lake = []
for i in range(len(root)):
    for subelement in root[i].getiterator('lake'):
        try:
                lake=subelement.find('name').text
                country=subelement.attrib['country']
                length=int(subelement.find('area').text)
                country_lake.append((country,lake,length))
        except:
            continue 
            
country_lake.sort(key=lambda x: x[2])

print "country with largest lake"
print map(lambda x: x[:2],country_lake[-1:])


country with largest lake
[('R AZ KAZ IR TM', 'Caspian Sea')]


In [33]:
import pandas as pd
cc=pd.DataFrame(country_code)
cc = cc.rename(columns={0: 'country', 1: 'code'})
cc.head()

Unnamed: 0,country,code
0,Albania,AL
1,Greece,GR
2,Macedonia,MK
3,Serbia,SRB
4,Montenegro,MNE


In [99]:
cr=pd.DataFrame(country_riv[-1:])
cr = cr.rename(columns={0: 'code', 1: 'river'})
cr=cr[['code','river']]
t=cr.code
t1 = t.str.split().str[0]
t2 = t.str.split().str[1]
t3 = t.str.split().str[2]
t4 = t.str.split().str[3]
t5 = t.str.split().str[4]
t=pd.concat([t,t1,t2,t3,t4,t5])
t=pd.DataFrame(t)
t= t.rename(columns={0: 'code'})
cr = pd.merge(cr, t, how='right')
cr.river=cr.river.ffill()
cr

Unnamed: 0,code,river
0,CO BR PE,Amazonas
1,CO,Amazonas
2,BR,Amazonas
3,PE,Amazonas
4,,Amazonas
5,,Amazonas


In [77]:
cl=pd.DataFrame(country_lake[-1:])
cl = cl.rename(columns={0: 'code', 1: 'lake'})
cl=cl[['code','lake']]
t=cl.code
t1 = t.str.split().str[0]
t2 = t.str.split().str[1]
t3 = t.str.split().str[2]
t4 = t.str.split().str[3]
t5 = t.str.split().str[4]
t=pd.concat([t,t1,t2,t3,t4,t5])
t=pd.DataFrame(t)
t= t.rename(columns={0: 'code'})
cl = pd.merge(cl, t, how='right')
cl.lake=cl.lake.ffill()

NameError: name 'pd' is not defined

In [45]:
ca=pd.DataFrame(country_air[-1:])
ca = ca.rename(columns={0: 'code', 1: 'airport'})
ca=ca[['code','airport']]

In [49]:
dfMerged = pd.merge(cc, ca,
              left_on=['code'],
              right_on=['code'],
              how='inner')
dfMerged[['country','airport']]

Unnamed: 0,country,airport
0,Bolivia,El Alto Intl


In [94]:
dfMerged = pd.merge(cc, cl,
              left_on=['code'],
              right_on=['code'],
              how='inner')
dfMerged[['country','lake']]

Unnamed: 0,country,lake
0,Russia,Caspian Sea
1,Iran,Caspian Sea
2,Turkmenistan,Caspian Sea
3,Azerbaijan,Caspian Sea
4,Kazakhstan,Caspian Sea


In [100]:
dfMerged = pd.merge(cc, cr,
              left_on=['code'],
              right_on=['code'],
              how='inner')
dfMerged[['country','river']]

Unnamed: 0,country,river
0,Colombia,Amazonas
1,Brazil,Amazonas
2,Peru,Amazonas
