# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
import pandas as pd
import numpy as np
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [4]:
# print names of all countries
for child in document_tree.getroot():
    print (child.find('name').text)
    

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [3]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [5]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print ('* ' + element.find('name').text + ':'),
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print (capitals_string[:-2])

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [6]:
ori_document = ET.parse( './data/mondial_database.xml' )

In [7]:
ori_document

<xml.etree.ElementTree.ElementTree at 0x92e5ba8>

### Answer 1

In [8]:
columns1 = ['Country','IM']

df1 = pd.DataFrame( columns=columns1)



In [9]:

i = 0
for country in ori_document.iterfind('country'):
    if country.find('infant_mortality') is None:
        pass
    else:
        name = country.find('name').text
        im = country.find('infant_mortality').text

        df1.set_value(i, 'Country', name)
        df1.set_value(i, 'IM', float(im))
        i = i + 1
df1.sort_values(df1.columns[1]).reset_index(drop=True).head(10)

Unnamed: 0,Country,IM
0,Monaco,1.81
1,Japan,2.13
2,Bermuda,2.48
3,Norway,2.48
4,Singapore,2.53
5,Sweden,2.6
6,Czech Republic,2.63
7,Hong Kong,2.73
8,Macao,3.13
9,Iceland,3.15


### Answer 2

In [43]:
j = 0
columns2 = ['cities','population']
df2 = pd.DataFrame()

for country in ori_document.iterfind('country'):
    for city in country.getiterator('city'):
        city.find('population')
        if city.find('population') is None:
            pass
    
        else:
            name = city.find('name').text
            pop = int(city.find('population').text)
        
            df2.set_value(j, 'Country', name)
            df2.set_value(j, 'population', pop)
            j = j + 1
df2.sort_values(df2.columns[1],ascending = False).reset_index(drop=True).head(10)


Unnamed: 0,Country,population
0,Seoul,10229262.0
1,Mumbai,9925891.0
2,São Paulo,9412894.0
3,Jakarta,8259266.0
4,Shanghai,8205598.0
5,Ciudad de México,8092449.0
6,Moskva,8010954.0
7,Tokyo,7843000.0
8,Beijing,7362426.0
9,Delhi,7206704.0


### Answer 3

In [68]:
ori_set = []
for name in ori_document.iterfind('country'):
    country = name.find('name').text
    for ethnic in name.iter('ethnicgroup'):
        eth =[]
        
        if  pd.isnull(ethnic):    
            continue
        else:
            eth.append(country)
            eth.append(ethnic.text)
            if pd.isnull(ethnic.attrib['percentage']): 
                eth.append('nan')
            else:
                eth.append(ethnic.attrib['percentage'])

        ori_set.append(eth)
       
ethnic = pd.DataFrame(ori_set, columns=['Country', 'Ethnic_Group', 'Percent_Pop'] )


In [69]:
j = 0
columns2 = ['country','population']
df3 = pd.DataFrame()

for country in ori_document.iterfind('country'):
    
        
        if country.find('population') is None:
            pass
    
        else:
            name = country.find('name').text
            pop = int(country.find('population').text)
        
            df3.set_value(j, 'Country', name)
            df3.set_value(j, 'population', pop)
            j = j + 1


In [70]:
ethnic = ethnic.merge(df3, how='left', on='Country')
ethnic['population_grp'] = (ethnic['Percent_Pop'].astype(float)/100) * ethnic['population']
ethnic.sort_values(ethnic.columns[4],ascending = False).reset_index(drop=True).head(10)

Unnamed: 0,Country,Ethnic_Group,Percent_Pop,population,population_grp
0,China,Han Chinese,91.5,543776080.0,497555100.0
1,India,Indo-Aryan,72.0,238396327.0,171645400.0
2,United States,European,79.96,157813040.0,126187300.0
3,Russia,Russian,79.8,102798657.0,82033330.0
4,Japan,Japanese,99.4,82199470.0,81706270.0
5,Germany,German,91.5,68230796.0,62431180.0
6,India,Dravidian,25.0,238396327.0,59599080.0
7,United Kingdom,English,83.6,50616012.0,42314990.0
8,Nigeria,African,99.0,37859744.0,37481150.0
9,Indonesia,Javanese,45.0,72592192.0,32666490.0
