# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [411]:
from xml.etree import ElementTree as ET

%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [412]:
# Load the reduced Mondial dataset and keep a handle on its root element.
document_tree = ET.parse('./data/mondial_database_less.xml')
root = document_tree.getroot()

In [413]:
# List the name of every child of the root (the countries), one per line.
for country in document_tree.getroot():
    country_name = country.find('name').text
    print(country_name)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [414]:
# Print each country on its own line, followed by a line listing its cities.
for country in document_tree.iterfind('country'):
    # The Python 2 style trailing comma after print() only built a throwaway
    # tuple under Python 3, so it is dropped; the printed text is unchanged.
    print('* ' + country.find('name').text + ':')
    # Element.iter() walks all descendants, so cities nested deeper in the
    # country element are included. It replaces getiterator(), which was
    # removed in Python 3.9.
    city_names = [city.find('name').text for city in country.iter('city')]
    # join() yields an empty string for a country without cities, matching
    # the previous "build with += then slice off the last separator" logic.
    print(', '.join(city_names))

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [415]:
# Load the full Mondial dataset; `root` is the element all exercises query.
document = ET.parse('./data/mondial_database.xml')
root = document.getroot()

# #1 Find 10 countries with the lowest infant mortality rates

In [425]:
# Build one row per country: name, infant mortality rate and population.
data = []
for country in root.findall('country'):
    # Every field is recomputed for each country, so a missing element gives
    # None/NaN instead of silently reusing the previous country's value.
    name_el = country.find('name')
    name = name_el.text if name_el is not None else None

    # Countries without an <infant_mortality> element get NaN.
    mortality_el = country.find('infant_mortality')
    infant_mort = float(mortality_el.text) if mortality_el is not None else np.nan

    # A country may carry several <population> entries. The largest one is
    # used on the assumption that it is also the most recent.
    # findall() returns an empty list (never None) when nothing matches, so
    # max() has to be guarded against an empty sequence.
    pop_values = [float(pop.text) for pop in country.findall('population')]
    population = max(pop_values) if pop_values else np.nan

    data.append([name, infant_mort, population])

country_data = pd.DataFrame(data, columns=['country', 'infant mortality', 'country population'])
# sort_values() places NaN last, so countries without a rate cannot show up
# among the ten lowest.
country_data.sort_values('infant mortality').head(10)

Unnamed: 0,country,infant mortality,country population
38,Monaco,1.81,36845.0
98,Japan,2.13,128057352.0
36,Norway,2.48,5051275.0
117,Bermuda,2.48,64237.0
106,Singapore,2.53,5076700.0
37,Sweden,2.6,9555893.0
10,Czech Republic,2.63,10562214.0
78,Hong Kong,2.73,7071576.0
79,Macao,3.13,552503.0
44,Iceland,3.15,318452.0


# #2 Find the 10 cities with the largest population

In [612]:
# Build one row per city: city name, owning country and population.
data = []
for country in root.findall('country'):
    # The country name is identical for all of its cities: look it up once.
    country_name_el = country.find('name')
    country_name = country_name_el.text if country_name_el is not None else None

    # iter() reaches every <city> descendant of the country element.
    for city in country.iter('city'):
        city_name_el = city.find('name')
        city_name = city_name_el.text if city_name_el is not None else None

        # Use the largest recorded figure. A city without any <population>
        # entry gets NaN rather than inheriting the previous city's number.
        pop_values = [float(pop.text) for pop in city.findall('population')]
        population = max(pop_values) if pop_values else np.nan

        data.append([city_name, country_name, population])

# make a dataframe and sort it (NaN populations are placed last)
city_data = pd.DataFrame(data, columns=['city', 'country', 'population'])
city_data = city_data.sort_values('population', ascending=False)
# Format only after sorting: this turns the column into strings.
city_data['population'] = city_data['population'].map('{:,.0f}'.format)
city_data.head(10)
           

Unnamed: 0,city,country,population
1341,Shanghai,China,22315474
771,Istanbul,Turkey,13710512
1582,Delhi,India,12877470
1527,Mumbai,India,12442373
479,Moskva,Russia,11979529
1340,Beijing,China,11716620
2810,São Paulo,Brazil,11152344
1342,Tianjin,China,11090314
1064,Guangzhou,China,11071424
1067,Shenzhen,China,10358381


# #3 Find 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [426]:
# One row per (country, ethnic group): the group's name is the element text
# and its share of the country is the 'percentage' attribute.
ethnic_rows = [
    [nation.find('name').text, group.text, float(group.attrib['percentage'])]
    for nation in root.findall('country')
    for group in nation.findall('ethnicgroup')
]

# Attach each country's population from the Q1 dataframe, then drop the
# infant mortality column, which is not needed here.
df = pd.DataFrame(ethnic_rows, columns=['country', 'ethnic group', 'percentage'])
df = df.merge(country_data, on='country')
df = df.drop(columns='infant mortality')

# Head count of the group inside the country = share of country population.
df['ethnic population'] = df['percentage'] / 100 * df['country population']

# Largest groups first.
df = df.sort_values(by='ethnic population', ascending=False)
df.head()



Unnamed: 0,country,ethnic group,percentage,country population,ethnic population
176,China,Han Chinese,91.5,1360720000.0,1245059000.0
221,India,Indo-Aryan,72.0,1210855000.0,871815600.0
220,India,Dravidian,25.0,1210855000.0,302713700.0
345,United States,European,79.96,318857100.0,254958100.0
520,Nigeria,African,99.0,164294500.0,162651600.0


In [427]:
# Total each ethnic group's population across all countries.
ethnic_pops = df.groupby('ethnic group')['ethnic population'].sum().to_frame()

# Largest totals first; sort before formatting, because formatting turns the
# column into strings.
ethnic_pops = ethnic_pops.sort_values(by='ethnic population', ascending=False)
ethnic_pops['ethnic population'] = ethnic_pops['ethnic population'].map('{:,.0f}'.format)

# Turn the group name back into a column and number the rows from 1 so the
# index reads as a rank.
ethnic_pops = ethnic_pops.reset_index()
ethnic_pops.index = np.arange(1, len(ethnic_pops) + 1)
ethnic_pops.head(10)


Unnamed: 0,ethnic group,ethnic population
1,Han Chinese,1245058800
2,Indo-Aryan,871815583
3,European,494939516
4,African,318359698
5,Dravidian,302713744
6,Mestizo,157855273
7,Bengali,146776917
8,Russian,136866551
9,Japanese,127289008
10,Malay,121993620


# #4 Find the name and country of:
## a) longest river

In [528]:
# One row per (river, country): rivers without a <length> are skipped.
river_rows = []
for river in root.findall('river'):
    if river.find('length') is None:
        continue
    river_name = river.find('name').text
    located_in = river.findall('located')
    length = float(river.find('length').text)
    # A river can touch several countries; emit one row for each <located>.
    for location in located_in:
        river_rows.append([river_name, location.attrib['country'], length])

all_rivers = pd.DataFrame(river_rows, columns=['river', 'country_abbrev', 'length'])
all_rivers.head()

Unnamed: 0,river,country_abbrev,length
0,Glomma,N,604.0
1,Lagen,N,322.0
2,Goetaaelv,S,93.0
3,Klaraelv,S,460.0
4,Klaraelv,N,460.0


In [615]:
# Lookup table mapping each country's 'car_code' attribute to its name.
country_df = pd.DataFrame(
    [
        [country.find('name').text, country.attrib['car_code']]
        for country in root.findall('country')
    ],
    columns=['country_name', 'country_abbrev'],
)
country_df.head(5)

Unnamed: 0,country_name,country_abbrev
0,Albania,AL
1,Greece,GR
2,Macedonia,MK
3,Serbia,SRB
4,Montenegro,MNE


In [616]:
# Resolve country codes to names, then show the longest rivers first.
river_data = all_rivers.merge(country_df, on='country_abbrev')
river_data.sort_values(by='length', ascending=False).head(5)

Unnamed: 0,river,country_abbrev,length,country_name
258,Amazonas,PE,6448.0,Peru
250,Amazonas,BR,6448.0,Brazil
247,Amazonas,CO,6448.0,Colombia
184,Jangtse,CN,6380.0,China
183,Hwangho,CN,4845.0,China


## b) largest lake

In [638]:
# One row per (lake, country) with the lake's area and depth; a missing
# <area> or <depth> element is recorded as 0.
lake_rows = []
for lake in root.findall('lake'):
    lake_name = lake.find('name').text
    area_el = lake.find('area')
    area = float(area_el.text) if area_el is not None else 0
    depth_el = lake.find('depth')
    depth = float(depth_el.text) if depth_el is not None else 0
    for location in lake.findall('located'):
        lake_rows.append([lake_name, location.attrib['country'], area, depth])

all_lakes = pd.DataFrame(lake_rows, columns=['lake', 'country_abbrev', 'area', 'depth'])

# With no `on` given, merge joins on the columns both frames share, which
# here is only 'country_abbrev'.
all_lakes = all_lakes.merge(country_df)

# Rank lakes by area * depth (stored as 'volume'); lakes missing either
# value end up with 0.
all_lakes['volume'] = all_lakes['area'] * all_lakes['depth']
all_lakes = all_lakes.sort_values(by='volume', ascending=False)

# Format after sorting: this turns the column into strings.
all_lakes['volume'] = all_lakes['volume'].map('{:,.0f}'.format)
all_lakes.head(10)

Unnamed: 0,lake,country_abbrev,area,depth,country_name,volume
67,Caspian Sea,TM,386400.0,995.0,Turkmenistan,384468000
64,Caspian Sea,KAZ,386400.0,995.0,Kazakhstan,384468000
63,Caspian Sea,IR,386400.0,995.0,Iran,384468000
50,Caspian Sea,R,386400.0,995.0,Russia,384468000
46,Ozero Baikal,R,31492.0,1637.0,Russia,51552404
89,Lake Tanganjika,ZRE,32893.0,1470.0,Zaire,48352710
99,Lake Tanganjika,Z,32893.0,1470.0,Zambia,48352710
94,Lake Tanganjika,EAT,32893.0,1470.0,Tanzania,48352710
117,Lake Superior,CDN,82103.0,405.0,Canada,33251715
123,Lake Superior,USA,82103.0,405.0,United States,33251715


## c) airport at highest elevation

In [670]:
# Show the attribute dict of the root's child at position 1510.
# NOTE(review): presumably an <airport> element, chosen to see which
# attributes airports carry; the index is tied to this exact data file.
root[1510].attrib

{'city': 'cty-Azerbaijan-Baku', 'country': 'AZ', 'iatacode': 'GYD'}

In [677]:
# Build one row per airport: name, country code, city reference, elevation.
data = []
for airport in root.findall('airport'):
    name = airport.find('name').text
    country_code = airport.attrib['country']
    # Airports without a 'city' attribute keep the 'None' placeholder string.
    city = airport.attrib.get('city', 'None')
    # findtext() returns None when <elevation> is absent, and an element with
    # no text has nothing to parse either; both cases count as elevation 0.
    # Previously a missing element raised AttributeError on `.text`.
    elevation_text = (airport.findtext('elevation') or '').strip()
    elevation = float(elevation_text) if elevation_text else 0
    data.append([name, country_code, city, elevation])

airport_df = pd.DataFrame(data, columns=['name', 'country_abbrev', 'city', 'elevation'])
# merge in country names; the join key is spelled out so an accidental extra
# shared column can never change how the frames are matched.
airport_df = pd.merge(airport_df, country_df, on='country_abbrev')
airport_df.sort_values('elevation', ascending=False).head()
    

Unnamed: 0,name,country_abbrev,city,elevation,country_name
80,El Alto Intl,BOL,cty-BOL-1,4063.0,Bolivia
219,Lhasa-Gonggar,CN,cty-China-324,4005.0,China
241,Yushu Batang,CN,cty-China-281,3963.0,China
813,Juliaca,PE,cty-Peru-15,3827.0,Peru
815,Teniente Alejandro Velasco Astete Intl,PE,cty-Peru-10,3311.0,Peru
