In [1]:
from un_wpp import *

In [2]:
"""Population table."""
import pandas as pd
from typing import Dict, Tuple, List, Any

# Something

In [19]:
# rename columns
COLUMNS_ID: Dict[str, str] = {
    "location": "location",
    "time": "year",
    "variant": "variant",
    "agegrp": "age",
}
COLUMNS_METRICS: Dict[str, Dict[str, Any]] = {
    "sex_ratio": {
        "name": "sex_ratio",
        "sex": "none",
    },
    "popmale": {
        "name": "population",
        "sex": "male",
        "operation": lambda x: (x * 1000),
    },
    "popfemale": {
        "name": "population",
        "sex": "female",
        "operation": lambda x: (x * 1000),
    },
    "poptotal": {
        "name": "population",
        "sex": "all",
        "operation": lambda x: (x * 1000),
    },
}
COLUMNS_ORDER: List[str] = [
    "location",
    "year",
    "metric",
    "sex",
    "age",
    "variant",
    "value",
]

In [3]:
meadow_path = base_path / "data/meadow/un/2022/un_wpp"
ds = catalog.Dataset(meadow_path)
# country rename
country_std = load_country_mapping()

In [10]:
x = ds["population"].reset_index()

In [12]:
x[x.time < 2022].variant.unique()

['Medium']
Categories (3, object): ['Medium', 'High', 'Low']

In [4]:
df_population = process_population(ds["population"], country_std)

In [8]:
df_population[
    (df_population.location == "South Africa")
    & (df_population.year == 1963)
    & (df_population.sex == "female")
    & (df_population.age == "40-49")
    & (df_population.metric == "population")
]

Unnamed: 0,location,year,metric,sex,age,variant,value
4462866,South Africa,1963,population,female,40-49,medium,756076.0
4462867,South Africa,1963,population,female,40-49,high,0.0
4462868,South Africa,1963,population,female,40-49,low,0.0


In [13]:
df = ds["population"]

In [14]:
df = pd.DataFrame(df)
df = df.reset_index()
df = df.assign(location=df.location.map(country_std))
df = df.dropna(subset=["location"])

In [20]:
# Estimate sex_ratio
df = df.assign(sex_ratio=(100 * df.popmale / df.popfemale).round(2))
# Unpivot
df = df.melt(COLUMNS_ID.keys(), COLUMNS_METRICS.keys(), "metric", "value")
# Scale units
ops = {k: v.get("operation", lambda x: x) for k, v in COLUMNS_METRICS.items()}

In [21]:
for m in df.metric.unique():
    df.loc[df.metric == m, "value"] = ops[m](df.loc[df.metric == m, "value"])
# Rename columns, metric names
df = df.rename(columns=COLUMNS_ID)
df = df.assign(
    metric=df.metric.map({k: v["name"] for k, v in COLUMNS_METRICS.items()}),
    sex=df.metric.map({k: v["sex"] for k, v in COLUMNS_METRICS.items()}),
    variant=df.variant.apply(lambda x: x.lower()),
)

In [26]:
# Column order
df = df[COLUMNS_ORDER]
# Discard unmapped regions
df = df.dropna(subset=["location"])

In [22]:
df[
    (df.location == "South Africa")
    & (df.year == 1963)
    & (df.sex == "female")
    & (df.age == "40-49")
    & (df.metric == "population")
]

Unnamed: 0,location,year,variant,age,metric,value,sex


In [27]:
df[df.year < 2022].variant.unique()

['medium']
Categories (3, object): ['medium', 'high', 'low']

In [36]:
from population import (
    add_metrics,
    _add_metric_sexratio,
    _add_metric_population,
    _add_metric_population_change,
)

In [37]:
df_sr = _add_metric_sexratio(df)

In [38]:
df_p_granular, df_p_broad = _add_metric_population(df)

In [39]:
df_p_diff = _add_metric_population_change(df)

In [31]:
dfc = add_metrics(df.copy())

In [43]:
df_p_diff[df_p_diff.year < 2022].variant.unique()

['medium']
Categories (3, object): ['medium', 'high', 'low']

In [51]:
df_p = df[df.metric == "population"]
# Basic age groups
age_map = {
    **{str(i): f"{i - i%5}-{i + 4 - i%5}" for i in range(0, 20)},
    **{str(i): f"{i - i%10}-{i + 9 - i%10}" for i in range(20, 100)},
    **{"100+": "100-"},
}
df_p_granular = df_p.assign(age=df_p.age.map(age_map))
df_p_granular = df_p_granular.groupby(
    ["location", "year", "metric", "sex", "age", "variant"],
    as_index=False,  # , observed=True
).sum()

In [52]:
df_p_granular[df_p_granular.year < 2022].variant.unique()

['medium', 'high', 'low']
Categories (3, object): ['medium', 'high', 'low']

---

# regular

In [3]:
meadow_path = base_path / "data/meadow/un/2022/un_wpp"
ds = catalog.Dataset(meadow_path)
# country rename
country_std = load_country_mapping()

In [5]:
df_population = process_population(ds["population"], country_std)

In [10]:
df_population[
    (df_population.location == "Afghanistan")
    & (df_population.year == 1950)
    & (df_population.metric == "population")
    & (df_population.sex == "all")
    & (df_population.age == "0-4")
]

Unnamed: 0,location,year,metric,sex,age,variant,value
839553,Afghanistan,1950,population,all,0-4,medium,1248282.0


In [11]:
df_fertility = process_fertility(ds["fertility"], country_std)

In [12]:
df_demographics = process_demographics(ds["demographics"], country_std)

In [13]:
df_depratio = process_depratio(ds["dependency_ratio"], country_std)

In [14]:
df_deaths = process_deaths(ds["deaths"], country_std)

In [16]:
df = merge_dfs([df_population, df_fertility, df_demographics, df_depratio, df_deaths])

In [18]:
x = df.reset_index()

In [19]:
x.iloc[:359]

Unnamed: 0,location,year,metric,sex,age,variant,value
0,Afghanistan,1950,birth_rate,all,all,estimates,4.886600e+01
1,Afghanistan,1950,births,all,10-14,estimates,4.312000e+03
2,Afghanistan,1950,births,all,15-19,estimates,4.981800e+04
3,Afghanistan,1950,births,all,20-24,estimates,9.218900e+04
4,Afghanistan,1950,births,all,25-29,estimates,8.598000e+04
...,...,...,...,...,...,...,...
354,Afghanistan,1950,net_migration,all,all,estimates,6.161000e+03
355,Afghanistan,1950,net_migration_rate,all,all,estimates,8.240000e-01
356,Afghanistan,1950,population,all,0,estimates,3.017350e+05
357,Afghanistan,1950,population,all,0-4,estimates,1.248282e+06


In [20]:
x[
    (x.location == "Afghanistan")
    & (x.year == 1950)
    & (x.metric == "population")
    & (x.sex == "all")
    & (x.age == "0-4")
]

Unnamed: 0,location,year,metric,sex,age,variant,value
357,Afghanistan,1950,population,all,0-4,estimates,1248282.0


In [22]:
df_wide = x.pivot(
    index=["location", "year", "sex", "age", "variant"],
    columns="metric",
    values="value",
)

In [70]:
xx = x.groupby(
    ["location", "year", "sex", "age", "variant", "metric"],
)

In [71]:
y = xx.metric.nunique()

In [72]:
y.sort_values()

location                  year  sex     age       variant         metric           
Afghanistan               1950  all     0         estimates       deaths               1
Northern Mariana Islands  2073  all     15-24     medium          population_broad     1
                                        16        high            deaths               1
                                                                  population_change    1
                                                  low             deaths               1
                                                                                      ..
Gibraltar                 2036  female  32        high            population_change    1
                                                  low             deaths               1
                                                                  population_change    1
                                        30        medium          deaths               1
Zimbabwe                  

In [16]:
x = df.reset_index()

In [14]:
x = x.set_index(["location", "year", "sex", "age", "variant"])

In [35]:
xx = df_population.groupby(["location", "year", "sex", "age", "variant", "metric"])

In [None]:
xx.value.count()

In [24]:
ans = _

In [25]:
ans.sort_values()

location                   year  sex     age    variant    metric           
Afghanistan                1950  all     0      estimates  deaths               1
Northern Mariana Islands   1978  none    70     estimates  population_change    1
                                                           sex_ratio            1
                                         71     estimates  population_change    1
                                         72     estimates  population_change    1
                                                                               ..
Tokelau                    2013  all     0-4    estimates  population_broad     3
                                                           population           3
South Africa               1963  female  40-49  estimates  population           3
                                         25-64  estimates  population_broad     3
Sint Maarten (Dutch part)  1991  male    10-14  estimates  population           3
Name: value, Length: 

In [34]:
set(i[-1] for i in ans[ans > 1].index)

{'population', 'population_broad'}

In [28]:
ds["population"].loc["South Africa", 1991]

  ds["population"].loc["South Africa", 1991]


Unnamed: 0_level_0,Unnamed: 1_level_0,sortorder,locid,notes,iso3_code,iso2_code,sdmx_code,loctypeid,loctypename,parentid,varid,midperiod,agegrpstart,agegrpspan,popmale,popfemale,poptotal
variant,agegrp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Medium,0,72,710,,ZAF,ZA,710,4,Country/Area,913,2,1991,0,1,604.456970,586.245972,1190.703003
Medium,1,72,710,,ZAF,ZA,710,4,Country/Area,913,2,1991,1,1,611.776978,594.033997,1205.811035
Medium,2,72,710,,ZAF,ZA,710,4,Country/Area,913,2,1991,2,1,629.908020,611.869995,1241.777954
Medium,3,72,710,,ZAF,ZA,710,4,Country/Area,913,2,1991,3,1,639.642029,621.487000,1261.130005
Medium,4,72,710,,ZAF,ZA,710,4,Country/Area,913,2,1991,4,1,636.085022,618.302979,1254.387939
Medium,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Medium,96,72,710,,ZAF,ZA,710,4,Country/Area,913,2,1991,96,1,0.313000,0.479000,0.792000
Medium,97,72,710,,ZAF,ZA,710,4,Country/Area,913,2,1991,97,1,0.208000,0.312000,0.519000
Medium,98,72,710,,ZAF,ZA,710,4,Country/Area,913,2,1991,98,1,0.136000,0.197000,0.333000
Medium,99,72,710,,ZAF,ZA,710,4,Country/Area,913,2,1991,99,1,0.088000,0.121000,0.209000


In [73]:
from owid.catalog.meta import DatasetMeta
from owid.catalog import Table

In [6]:
ds.metadata

DatasetMeta(namespace='un', short_name='un_wpp', title='World Population Prospects 2022, Online Edition', description='World Population Prospects 2022 is the 27th edition of the official estimates and projections of the global population that have been published by the United Nations since 1951. The estimates are based on all available sources of data on population size and levels of fertility, mortality and international migration for 237 countries or areas.', sources=[Source(name='United Nations, Department of Economic and Social Affairs, Population Division (2022)', description=None, url='https://population.un.org/wpp/Download/', source_data_url=None, owid_data_url='https://walden.nyc3.digitaloceanspaces.com/un/2022/un_wpp.zip', date_accessed='2022-07-11', publication_date=None, publication_year=2022, published_by=None, publisher_source=None)], licenses=[License(name='CC BY 3.0 IGO', url='http://creativecommons.org/licenses/by/3.0/igo/')], is_public=True, additional_info=None, versi

In [None]:
metadata = DatasetMeta()

In [None]:
ds_garden = catalog.Dataset.create_empty(dest_dir)

In [11]:
ds_garden = catalog.Dataset.create_empty("./here")

In [12]:
ds_garden.metadata = ds.metadata

In [13]:
ds_garden.save()

In [16]:
t = Table(df)

In [22]:
t = Table(df, metadata={"short_name": "long"})

In [24]:
t.metadata

{'short_name': 'long'}

In [26]:
from owid.catalog.meta import TableMeta

In [28]:
t = Table(df, metadata=TableMeta(short_name="long"))

In [29]:
t.metadata

TableMeta(short_name='long', title=None, description=None, dataset=None, primary_key=[])

In [21]:
ds["fertility"].metadata

TableMeta(short_name='fertility', title=None, description=None, dataset=DatasetMeta(namespace='un', short_name='un_wpp', title='World Population Prospects 2022, Online Edition', description='World Population Prospects 2022 is the 27th edition of the official estimates and projections of the global population that have been published by the United Nations since 1951. The estimates are based on all available sources of data on population size and levels of fertility, mortality and international migration for 237 countries or areas.', sources=[Source(name='United Nations, Department of Economic and Social Affairs, Population Division (2022)', description=None, url='https://population.un.org/wpp/Download/', source_data_url=None, owid_data_url='https://walden.nyc3.digitaloceanspaces.com/un/2022/un_wpp.zip', date_accessed='2022-07-11', publication_date=None, publication_year=2022, published_by=None, publisher_source=None)], licenses=[License(name='CC BY 3.0 IGO', url='http://creativecommons.

In [71]:
df_demographics.variant.isna().sum()

0

In [36]:
df.isna().sum()

location    0
year        0
metric      0
sex         0
age         0
variant     0
value       0
dtype: int64

In [75]:
x = df
a = (x.loc[:, "variant"] == "nan").sum() / len(x)
b = (x.loc[:, "variant"].isna()).sum() / len(x)
print(a)
print(b)

0.0
0.0


In [None]:
548576 / len(df)

0.007536089053379234

In [84]:
df.location.unique()

array(['Afghanistan', 'Africa', 'Albania', 'Algeria', 'American Samoa',
       'Andorra', 'Angola', 'Anguilla', 'Antigua and Barbuda',
       'Argentina', 'Armenia', 'Aruba', 'Asia', 'Australia',
       'Australia/New Zealand', 'Austria', 'Azerbaijan', 'Bahamas',
       'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
       'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Bolivia',
       'Bolivia (Plurinational State of)',
       'Bonaire Sint Eustatius and Saba',
       'Bonaire, Sint Eustatius and Saba', 'Bosnia and Herzegovina',
       'Botswana', 'Brazil', 'British Virgin Islands', 'Brunei',
       'Brunei Darussalam', 'Bulgaria', 'Burkina Faso', 'Burundi',
       'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde',
       'Caribbean', 'Cayman Islands', 'Central African Republic',
       'Central America', 'Central Asia', 'Central and Southern Asia',
       'Chad', 'Chile', 'China', 'China, Hong Kong SAR',
       'China, Macao SAR', 'China, Taiwan Province of China', '

In [None]:
df

In [82]:
x = df.groupby(["location", "year", "sex", "age", "variant", "metric"]).value.count()

In [83]:
x.sort_values()

location                       year  sex     age    variant    metric           
Afghanistan                    1950  all     0      estimates  deaths               1
Northern America               2076  all     87     high       population_change    1
                                                    low        deaths               1
                                                               population_change    1
                                                    medium     deaths               1
                                                                                   ..
United Republic of Tanzania    2013  all     all    estimates  population           3
Thailand                       1953  male    100-   estimates  population           3
Slovenia                       1968  male    5-9    estimates  population           3
Kosovo (under UNSC res. 1244)  1962  all     60-69  estimates  population           3
Western Europe                 1972  female  15-19  estimat

In [76]:
df_wide = df.pivot(
    index=["location", "year", "sex", "age", "variant"],
    columns="metric",
    values="value",
)

ValueError: Index contains duplicate entries, cannot reshape