In [15]:
import os
import urllib.request
import pandas as pd
from functools import reduce

from utils import *

In [16]:
urls = [
    "https://data.un.org/_Docs/SYB/CSV/SYB67_1_202411_Population,%20Surface%20Area%20and%20Density.csv",
    "https://data.un.org/_Docs/SYB/CSV/SYB67_327_202411_International%20Migrants%20and%20Refugees.csv",
    "https://data.un.org/_Docs/SYB/CSV/SYB67_246_202411_Population%20Growth,%20Fertility%20and%20Mortality%20Indicators.csv",
    "https://data.un.org/_Docs/SYB/CSV/SYB61_253_Population%20Growth%20Rates%20in%20Urban%20areas%20and%20Capital%20cities.csv",
    "https://data.un.org/_Docs/SYB/CSV/SYB67_230_202411_GDP%20and%20GDP%20Per%20Capita.csv",
]

In [17]:
names_transforms_array = [
    ['Population mid-year estimates (millions)', 'Population (millions)', string_with_commas_to_float],
    ['Sex ratio (males per 100 females)', 'Sex ratio (males to females)', percentage_str_to_prop_float],
    ['Population aged 0 to 14 years old (percentage)', 'Population aged 0 to 14 years old (proportion)', percentage_str_to_prop_float],
    ['Population aged 60+ years old (percentage)', 'Population aged 60+ years old (proportion)', percentage_str_to_prop_float],
    ['Population density', 'Population density (per km2)', string_with_commas_to_float],

    ['International migrant stock: Both sexes (number)', 'Number of international migrants', string_with_commas_to_int],
    ['International migrant stock: Both sexes (% total population)', 'International migrants in proportion to population', percentage_str_to_prop_float],

    ['Total fertility rate (children per women)', None, string_with_commas_to_float],
    ['Under five mortality rate for both sexes (per 1,000 live births)', 'Under five mortality rate for both sexes (proportion to live births)', perthousand_str_to_prop_float],
    ['Life expectancy at birth for both sexes (years)', 'Life expectancy at birth (years)', string_with_commas_to_float],
    ['Population annual rate of increase (percent)', 'Population annual rate of increase (proportion)', percentage_str_to_prop_float],

    ['Urban population (percent)', 'Urban population (proportion)', string_with_commas_to_float],
    ['Urban population (percent growth rate per annum)', 'Urban population (growth rate per annum)', percentage_str_to_prop_float],
    ['Rural population (percent growth rate per annum)', 'Rural population (growth rate per annum)', percentage_str_to_prop_float],
    
    ['GDP in current prices (millions of US dollars)', 'GDP (millions of US dollars)', string_with_commas_to_float],
    ['GDP per capita (US dollars)', None, string_with_commas_to_float],
    ['GDP real rates of growth (percent)', 'GDP real rates of growth', percentage_str_to_prop_float],
]

In [18]:
names_transforms = make_names_transforms_df(names_transforms_array)

names_transforms.tail(5)

Unnamed: 0,name,newname,transform
12,Urban population (percent growth rate per annum),Urban population (growth rate per annum),<function percentage_str_to_prop_float at 0x7f...
13,Rural population (percent growth rate per annum),Rural population (growth rate per annum),<function percentage_str_to_prop_float at 0x7f...
14,GDP in current prices (millions of US dollars),GDP (millions of US dollars),<function string_with_commas_to_float at 0x7fe...
15,GDP per capita (US dollars),,<function string_with_commas_to_float at 0x7fe...
16,GDP real rates of growth (percent),GDP real rates of growth,<function percentage_str_to_prop_float at 0x7f...


In [19]:
filedir = os.getcwd()

dfs = []

for url in urls:
    df_in = prepare_un_dataset(filedir, url)
    new_dfs = dataset_from_names_and_transforms(df_in, names_transforms)
    dfs = dfs + new_dfs

    # delete tmp file
    os.remove(filedir + '/tmp_data.csv') 

In [20]:
# merge all in one single dataset
df = reduce(lambda  left,right: pd.merge(left,right,on=['Country'], how='outer'), dfs)

# save to csv
df.to_csv(filedir + '/un_SCMdataset.csv', index = False)

In [21]:
df.head()

Unnamed: 0,Country,Population (millions),Sex ratio (males to females),Population aged 0 to 14 years old (proportion),Population aged 60+ years old (proportion),Population density (per km2),Number of international migrants,International migrants in proportion to population,Total fertility rate (children per women),Under five mortality rate for both sexes (proportion to live births),Life expectancy at birth (years),Population annual rate of increase (proportion),Urban population (proportion),Urban population (growth rate per annum),Rural population (growth rate per annum),GDP (millions of US dollars),GDP per capita (US dollars),GDP real rates of growth
0,Afghanistan,42.65,1.02,0.429,0.039,65.7,144098.0,0.004,4.8,0.545,66.3,0.028,25.5,0.04,0.029,15145.0,537.0,0.025
1,Albania,2.79,0.978,0.168,0.235,101.9,48810.0,0.017,1.3,0.083,79.8,-0.007,60.3,0.018,-0.025,11927.0,4094.0,0.037
2,Algeria,46.81,1.041,0.303,0.1,19.7,250378.0,0.006,2.7,0.205,76.5,0.014,72.6,0.029,-0.002,161207.0,4496.0,0.036
3,American Samoa,0.05,1.019,0.27,0.129,233.8,23608.0,0.428,2.3,0.104,73.0,-0.016,87.2,-0.001,0.005,,,
4,Andorra,0.08,1.046,0.119,0.232,174.3,45574.0,0.59,1.1,0.06,84.2,0.013,88.1,-0.017,-0.008,3447.0,48191.0,-0.02
