# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [5]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [6]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [8]:
# print names of all countries
for child in document_tree.getroot():
    print (child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [11]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print ('* ' + element.find('name').text + ':',)
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print (capitals_string[:-2])

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [15]:
tree = ET.parse( './data/mondial_database.xml' )

In [240]:
# 10 countries with the lowest infant mortality rates

#population dictionary with country as key and rate as value
import operator
dict_with_strs={}
for country in root.iterfind('country'):
    name = country.find('name').text
    for i in country.iterfind('infant_mortality'):
        infantmortality = i.text
        dict_with_strs[name] = infantmortality
        

#cast strings to floats for the values
dict_with_numbers = dict((k,float(v)) for k,v in dict_with_strs.items())
dict_with_numbers

#sort dictionary by value in ascending order
srted= sorted(dict_with_numbers.items(), key=operator.itemgetter(1))

#print the first 10 records
count=0
while count < 10:
    print(srted[count])
    count+=1


('Monaco', 1.81)
('Japan', 2.13)
('Norway', 2.48)
('Bermuda', 2.48)
('Singapore', 2.53)
('Sweden', 2.6)
('Czech Republic', 2.63)
('Hong Kong', 2.73)
('Macao', 3.13)
('Iceland', 3.15)


In [242]:
#10 cities with the largest population

#iterate through city and get the population number
city_dict={}
for country in root.iterfind('country'):
    for city in country.iter('city'):
        city_name = city.find('name').text    
        for population in city.iter('population'):
            population_num = population.text
            city_dict[city_name] = population_num

#cast the strings into ints 
dict_with_ints = dict((k,int(v)) for k,v in city_dict.items())
dict_with_ints

#sort the dictionary values
final_sorted= sorted(dict_with_ints.items(), key=lambda kv: kv[1], reverse=True)
final_sorted[:10]


            #print(population['year'])

[('Shanghai', 22315474),
 ('Istanbul', 13710512),
 ('Mumbai', 12442373),
 ('Moskva', 11979529),
 ('Beijing', 11716620),
 ('São Paulo', 11152344),
 ('Tianjin', 11090314),
 ('Guangzhou', 11071424),
 ('Delhi', 11034555),
 ('Shenzhen', 10358381)]

In [241]:
#10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
population_dict={}

#append the population with the country name
for country in root.iterfind('country'): 
    country_name = country.find('name').text
    #print(country_name)
    population_year = []
    
    #grab the year and get the max year for each country
    for population in country.iter('population'):
        population_year.append(population.attrib['year'])
    max_pop=max(population_year)
    
    #grab the population number from the max year
    for population in country.iter('population'):
        if max_pop == population.attrib['year']:
            pop_number = population.text
        else:
            continue
    population_dict[country_name] = pop_number

#convert strings to ints
population_ints = dict((k,int(v)) for k,v in population_dict.items())

        
    
    
#getting the max percentage ethnic group

ethnic_nested={}
for country in root.iterfind('country'):
    country_name = country.find('name').text
    x = []
    for ethnic in country.iter('ethnicgroup'):
        ethnic_dict={}
        ethnic_dict[ethnic.text] = float(ethnic.attrib['percentage'])
        x.append(ethnic_dict)
    ethnic_nested[country_name] = x



#iterating through the nested ethnic group dictionary and multiplying the percentages by the population
ethnic_pop_dict={}
for country, list in ethnic_nested.items():
    for ethnicity in list:
        for ethnicity_name, percentage in ethnicity.items():
            if ethnicity_name in ethnic_pop_dict:
                ethnic_pop_dict[ethnicity_name] += int(percentage*population_ints[country]/100)
            else:
                ethnic_pop_dict[ethnicity_name] = int(percentage*population_ints[country]/100)

ethnic_pop_dict
final = sorted(ethnic_pop_dict.items(), key=lambda kv: kv[1], reverse=True)
final[:10]    
        
    
        
    
        
    
    
    

[('Malay', 89414169),
 ('Eastern Hamitic', 82830376),
 ('Viet/Kinh', 76078375),
 ('Thai', 51084156),
 ('Arab-Berber', 50583950),
 ('Arab', 42402733),
 ('African', 40986968),
 ('Mangbetu-Azande', 27986022),
 ('Han Chinese', 27175500),
 ('Chinese', 22357554)]