# XML exercise

Using data from the [**mondial database**](https://drive.google.com/file/d/14lFT4nWHgwN36ij4XZh6OUuup-K9qLgR/view?usp=sharing), find the answers to the following questions:

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [1]:
import xml.etree.ElementTree as ET
import pandas as pd

# Load the XML document and keep a handle on its root element
tree = ET.parse('data/mondial.xml')
root = tree.getroot()

# Quick look at the root: its tag name, its attributes, and its child count
for summary in (root.tag, root.attrib, len(root)):
    print(summary)

mondial
{}
3403


In [2]:
# Q1 - 10 countries with the lowest infant mortality rates
countries = root.findall('country')

# Both <name> and <infant_mortality> are looked up as direct children of
# each <country> element.
inf_mort = {
    'Country': [entry.findtext('name') for entry in countries],
    'infant_mortality': [
        entry.findtext('infant_mortality') for entry in countries
    ],
}

# The rates are read as text; cast them to float so the sort is numeric
df = pd.DataFrame(inf_mort).astype({'infant_mortality': float})

df.sort_values(by='infant_mortality').head(10)

Unnamed: 0,Country,infant_mortality
38,Monaco,1.81
98,Japan,2.13
117,Bermuda,2.48
36,Norway,2.48
106,Singapore,2.53
37,Sweden,2.6
10,Czech Republic,2.63
8,Spain,2.7
78,Hong Kong,2.73
79,Macao,3.13


In [3]:
# Q2 - 10 cities with the largest population
#
# city_data maps a city name to {"population": int}; the inner dict stays
# empty when the city has no usable population figure. The mapping is then
# turned into a DataFrame indexed by city name and ranked by population.
city_data = {}

for country in root.iterfind('country'):
    # Cities are looked up both directly under <country> and inside its
    # <province> children, so a country that mixes the two layouts does not
    # lose its province-level cities.
    cities = country.findall('city') + country.findall('province/city')
    for city in cities:
        city_name = city.findtext('name')
        record = {}

        populations = city.findall('population')
        if populations:
            # The last <population> entry is used - presumably the most
            # recent figure; verify against the data set.
            try:
                record["population"] = int(populations[-1].text)
            except (TypeError, ValueError):
                # Empty or non-integer text: keep the city, without a figure
                pass

        # Different countries can contain cities with the same name. Since
        # the mapping is keyed by name, keep the most populous entry instead
        # of letting a later (possibly smaller or figure-less) namesake
        # overwrite it.
        previous = city_data.get(city_name)
        if previous is None or (
            record.get("population", -1) > previous.get("population", -1)
        ):
            city_data[city_name] = record

city_data_pd = pd.DataFrame(city_data).transpose()
city_data_pd.sort_values("population", ascending=False).head(10)

Unnamed: 0,population
Shanghai,22315474.0
Karachi,14916456.0
Lagos,13745000.0
Istanbul,13710512.0
Mumbai,12442373.0
Moskva,11979529.0
Beijing,11716620.0
Kinshasa,11575000.0
São Paulo,11152344.0
Lahore,11126285.0


In [4]:
# Q3 - name and country of a) longest river

rivers = list(root.iterfind('river'))

river_data = {
    'river_name': [entry.findtext('name') for entry in rivers],
    # the country is read from the element's "country" attribute
    'Country': [entry.get('country') for entry in rivers],
    'length': [entry.findtext('length') for entry in rivers],
}

# Lengths are read as text; cast them to float so the sort is numeric
df = pd.DataFrame(river_data).astype({'length': float})
df.sort_values('length', ascending=False).head(1)

Unnamed: 0,river_name,Country,length
214,Yangtze,CN,6380.0


In [5]:
# Q3 - name and country of b) largest lake
lakes = list(root.iterfind('lake'))

lake_data = {
    'lake_name': [entry.findtext('name') for entry in lakes],
    # the "country" attribute may hold several space-separated country codes
    'Country': [entry.get('country') for entry in lakes],
    'area': [entry.findtext('area') for entry in lakes],
}

# Areas are read as text; cast them to float so the sort is numeric
df = pd.DataFrame(lake_data).astype({'area': float})
# Show the top two entries by area
df.sort_values('area', ascending=False).head(2)

Unnamed: 0,lake_name,Country,area
59,Caspian Sea,R AZ KAZ IR TM,386400.0
142,Lake Superior,CDN USA,82103.0


In [6]:
# Q3 - name and country of c) airport at highest elevation
airports = list(root.iterfind('airport'))

airport_data = {
    'airport_name': [entry.findtext('name') for entry in airports],
    # the country is read from the element's "country" attribute
    'Country': [entry.get('country') for entry in airports],
    'elevation': [entry.findtext('elevation') for entry in airports],
}

# Elevations are read as text; cast them to float so the sort is numeric
df = pd.DataFrame(airport_data).astype({'elevation': float})
df.sort_values('elevation', ascending=False).head(1)

Unnamed: 0,airport_name,Country,elevation
81,El Alto Intl,BOL,4063.0
