In [3]:
from typing import List, Any


import pandas as pd
from pathlib import Path

from etl.paths import BASE_DIR as base_path
from owid import catalog
from owid.catalog import Table
from owid.catalog.meta import TableMeta

from population import process as process_population

# from fertility import process as process_fertility
# from demographics import process as process_demographics
# from dep_ratio import process as process_depratio
# from deaths import process as process_deaths

In [12]:
import pandas as pd
from typing import Dict, Tuple, List, Any

In [13]:
# Identifier columns: source column name -> output column name
COLUMNS_ID: Dict[str, str] = {"location": "location", "time": "year", "variant": "variant", "agegrp": "age"}
# Metric columns: source column name -> output metric name, sex label and,
# optionally, the transform applied to the raw values.
COLUMNS_METRICS: Dict[str, Dict[str, Any]] = {
    "sex_ratio": {"name": "sex_ratio", "sex": "none"},
    **{
        column: {
            "name": "population",
            "sex": sex,
            "operation": lambda x: (x * 1000),
        }
        for column, sex in [
            ("popmale", "male"),
            ("popfemale", "female"),
            ("poptotal", "all"),
        ]
    },
}
# Column layout of the final long-format table
COLUMNS_ORDER: List[str] = ["location", "year", "metric", "sex", "age", "variant", "value"]

In [14]:
# Open the meadow dataset stored under the `2022-07-11` folder, relative to the
# ETL base directory.
meadow_path = base_path / "data/meadow/un/2022-07-11/un_wpp"
ds = catalog.Dataset(meadow_path)

In [15]:
# Lookup keyed by the CSV's `Country` column; the cells below use it to map
# `location` values (unmapped locations are dropped afterwards).
# `squeeze("columns")` collapses only the column axis, so the result stays a
# Series (and `.to_dict()` keeps working) even if the CSV has a single row; a
# bare `squeeze()` would return a scalar in that case.
country_std = (
    pd.read_csv(
        "un_wpp.country_std.csv",
        index_col="Country",
    )
    .squeeze("columns")
    .to_dict()
)

In [16]:
# Pick the `population` table out of the dataset
df = ds["population"]

In [17]:
# Plain DataFrame with the index exposed as regular columns
df = pd.DataFrame(df).reset_index()
# Standardise location names, then discard the locations without a mapping
location_std = df["location"].map(country_std).astype("category")
df = df.assign(location=location_std).dropna(subset=["location"])
# Estimate sex_ratio: 100 * male / female, rounded to two decimals
df = df.assign(sex_ratio=(100 * df["popmale"] / df["popfemale"]).round(2))
# Unpivot (wide -> long): one row per identifier combination and metric column
df = df.melt(
    id_vars=COLUMNS_ID.keys(),
    value_vars=COLUMNS_METRICS.keys(),
    var_name="metric",
    value_name="value",
)
# Rename identifier columns and set compact dtypes
df = df.rename(columns=COLUMNS_ID).astype({"metric": "category", "year": "uint16"})
# Scale units: apply each metric's transform (identity when none is configured)
ops = {
    code: spec.get("operation", lambda x: x) for code, spec in COLUMNS_METRICS.items()
}
for metric_code in df["metric"].unique():
    is_metric = df["metric"] == metric_code
    df.loc[is_metric, "value"] = ops[metric_code](df.loc[is_metric, "value"])
# Column value mappings: both lookups read the raw metric codes, so they are
# built before the `metric` column is overwritten.
metric_names = {code: spec["name"] for code, spec in COLUMNS_METRICS.items()}
metric_sexes = {code: spec["sex"] for code, spec in COLUMNS_METRICS.items()}
df = df.assign(
    metric=df["metric"].map(metric_names).astype("category"),
    sex=df["metric"].map(metric_sexes).astype("category"),
    variant=df["variant"].apply(lambda v: v.lower()).astype("category"),
)
# Column order
df = df[COLUMNS_ORDER]

In [45]:
from population import (
    _add_metric_sexratio,
    _add_metric_population,
    _add_metric_population_change,
)

In [46]:
# Derive metrics with the helpers imported from `population`.
# `_add_metric_population` returns a pair; its first element (the "granular"
# frame) is the input of `_add_metric_population_change`.
df_sr = _add_metric_sexratio(df)
df_p_granular, df_p_broad = _add_metric_population(df)
df_p_diff = _add_metric_population_change(df_p_granular)

In [55]:
def _add_metric_sexratio_all(df_p_granular):
    # Check
    (df_p_granular.metric.unique() == ["population"]).all()
    # Get M/F values
    df_male = df_p_granular[
        (df_p_granular.age == "all") & (df_p_granular.sex == "male")
    ].rename(columns={"value": "value_male"})
    df_female = df_p_granular[
        (df_p_granular.age == "all") & (df_p_granular.sex == "female")
    ].rename(columns={"value": "value_female"})
    # Check
    assert len(df_male) == len(df_female)
    # Build df
    cols_merge = ["location", "year", "variant"]
    df_ = df_male.merge(df_female[cols_merge + ["value_female"]], on=cols_merge)
    df_ = df_.assign(value=df_.value_male / df_.value_female).drop(
        columns=["value_male", "value_female"]
    )
    return df_

True

In [51]:
# NOTE(review): in the visible cells `df_` is only assigned inside
# `_add_metric_sexratio_all`, so this cell presumably relies on leftover kernel
# state — confirm or remove.
df_

Unnamed: 0,location,year,metric,sex,age,variant,value
0,World,1950,population,male,all,medium,0.993145
1,World,1951,population,male,all,medium,0.993584
2,World,1952,population,male,all,medium,0.994248
3,World,1953,population,male,all,medium,0.994741
4,World,1954,population,male,all,medium,0.995454
...,...,...,...,...,...,...,...
76318,Wallis and Futuna,2099,population,male,all,high,1.010352
76319,Wallis and Futuna,2099,population,male,all,medium,0.990233
76320,Wallis and Futuna,2100,population,male,all,low,0.961562
76321,Wallis and Futuna,2100,population,male,all,high,1.010870


In [32]:
# NOTE(review): in the visible cells `df_male` is only assigned inside
# `_add_metric_sexratio_all`, so this cell presumably relies on leftover kernel
# state — confirm or remove.
df_male

Unnamed: 0,location,year,metric,sex,age,variant,value
3434535,World,1950,population,male,all,medium,1.245363e+09
3434538,World,1951,population,male,all,medium,1.267473e+09
3434541,World,1952,population,male,all,medium,1.291400e+09
3434544,World,1953,population,male,all,medium,1.316659e+09
3434547,World,1954,population,male,all,medium,1.342923e+09
...,...,...,...,...,...,...,...
3663487,Wallis and Futuna,2099,population,male,all,high,7.320000e+03
3663488,Wallis and Futuna,2099,population,male,all,medium,4.968000e+03
3663495,Wallis and Futuna,2100,population,male,all,low,3.177000e+03
3663496,Wallis and Futuna,2100,population,male,all,high,7.347000e+03


In [3]:
from owid import catalog
import json
from pathlib import Path

In [4]:
import pandas as pd
from etl.paths import BASE_DIR as base_path

# Load

In [5]:
# Open the meadow dataset relative to the ETL base directory.
# NOTE(review): this path uses the folder `2022`, while the earlier load cell
# uses `2022-07-11` — confirm which one is intended.
meadow_path = base_path / "data/meadow/un/2022/un_wpp"
ds = catalog.Dataset(meadow_path)

In [29]:
# country rename: lookup keyed by the CSV's `Country` column, used below to map
# `location` values.
# `squeeze("columns")` collapses only the column axis, so the result stays a
# Series (and `.to_dict()` keeps working) even if the CSV has a single row; a
# bare `squeeze()` would return a scalar in that case.
country_std = (
    pd.read_csv("un_wpp.country_std.csv", index_col="Country")
    .squeeze("columns")
    .to_dict()
)

## Population

## Fertility

In [30]:
# Pick the `fertility` table out of the dataset
df = ds["fertility"]

In [31]:
# Column settings for the fertility table.
# Identifier columns: source column name -> output column name
columns_id = {"location": "location", "time": "year", "variant": "variant", "agegrp": "age"}
# Metric columns: source column name -> output name, sex label and transform
columns_metrics = {
    # Values are passed through unchanged (rounding to 2 decimals is disabled)
    "asfr": {"name": "fertility_rate", "sex": "all", "operation": lambda x: x},
    "births": {"name": "births", "sex": "all", "operation": lambda x: (x * 1000)},
}
# Column layout of the final long-format table
columns_order = [
    "location",
    "year",
    "metric",
    "sex",
    "age",
    "variant",
    "value",
]

In [32]:
# Unpivot (wide -> long): expose the index as columns, then emit one row per
# identifier combination and metric column.
df = df.reset_index().melt(
    id_vars=columns_id.keys(),
    value_vars=columns_metrics.keys(),
    var_name="metric",
    value_name="value",
)

In [33]:
# Lookups from raw metric code to output name / sex label
metric_names = {code: spec["name"] for code, spec in columns_metrics.items()}
metric_sexes = {code: spec["sex"] for code, spec in columns_metrics.items()}
# Rename identifier columns, then map the column values
df = df.rename(columns=columns_id)
df = df.assign(
    metric=df["metric"].map(metric_names),
    sex=df["metric"].map(metric_sexes),
    variant=df["variant"].apply(lambda v: v.lower()),
    location=df["location"].map(country_std),
)
# Column order, then discard unmapped regions (location is NaN after mapping)
df = df[columns_order].dropna(subset=["location"])

In [34]:
# Scale units. The `metric` column already holds the output names at this
# point, so the transforms are keyed by name rather than by source column.
ops = {
    spec["name"]: spec.get("operation", lambda x: x)
    for spec in columns_metrics.values()
}
for metric_name in df["metric"].unique():
    is_metric = df["metric"] == metric_name
    df.loc[is_metric, "value"] = ops[metric_name](df.loc[is_metric, "value"])

## Demographics

In [36]:
# Pick the `demographics` table out of the dataset
df = ds["demographics"]

In [40]:
# Column settings for the demographics table.
# Identifier columns: source column name -> output column name
columns_id = {"location": "location", "time": "year", "variant": "variant"}


def _metric(name, sex="all", age="all", operation=None):
    """Build one metric entry; the "operation" key is left out when no transform is given."""
    entry = {"name": name, "sex": sex, "age": age}
    if operation is not None:
        entry["operation"] = operation
    return entry


def _round_to(decimals):
    """Return a transform that rounds the values to `decimals` places."""
    return lambda x: (x).round(decimals)


def _add_age(age):
    """Return a transform that adds `age` to the values and rounds to one decimal."""
    return lambda x: (age + x).round(1)


def _thousands(x):
    """Multiply the values by 1000."""
    return x * 1000


def _thousands_rounded(x):
    """Multiply the values by 1000 and round to whole numbers."""
    return (x * 1000).round(0)


def _tenth(x):
    """Divide the values by 10."""
    return x / 10


# Metric columns: source column name -> output name, sex, age and transform.
# The key order matters: it drives the order of the melted rows.
columns_metrics = {
    "popdensity": _metric("population_density"),
    "popgrowthrate": _metric("growth_rate", operation=_round_to(2)),
    "natchangert": _metric("growth_natural_rate", operation=lambda x: (x / 10).round(2)),
    "births": _metric("births", operation=_thousands),
    "cbr": _metric("birth_rate"),
    "tfr": _metric("fertility_rate"),
    "deaths": _metric("deaths", operation=_thousands_rounded),
    "deathsfemale": _metric("deaths", sex="female", operation=_thousands_rounded),
    "deathsmale": _metric("deaths", sex="male", operation=_thousands_rounded),
    "cdr": _metric("death_rate"),
    "medianagepop": _metric("median_age", operation=_round_to(1)),
    "lex": _metric("life_expectancy", age="at birth", operation=_round_to(1)),
    "lexfemale": _metric("life_expectancy", sex="female", age="at birth", operation=_round_to(1)),
    "lexmale": _metric("life_expectancy", sex="male", age="at birth", operation=_round_to(1)),
    "le15": _metric("life_expectancy", age="15", operation=_add_age(15)),
    "le15female": _metric("life_expectancy", sex="female", age="15", operation=_add_age(15)),
    "le15male": _metric("life_expectancy", sex="male", age="15", operation=_add_age(15)),
    "le65": _metric("life_expectancy", age="65", operation=_add_age(65)),
    "le65female": _metric("life_expectancy", sex="female", age="65", operation=_add_age(65)),
    "le65male": _metric("life_expectancy", sex="male", age="65", operation=_add_age(65)),
    "le80": _metric("life_expectancy", age="80", operation=_add_age(80)),
    "le80female": _metric("life_expectancy", sex="female", age="80", operation=_add_age(80)),
    "le80male": _metric("life_expectancy", sex="male", age="80", operation=_add_age(80)),
    "srb": _metric("sex_ratio", sex="none", age="at birth", operation=_round_to(2)),
    "netmigrations": _metric("net_migration", operation=_thousands),
    "cnmr": _metric("net_migration_rate"),
    "imr": _metric("infant_mortality_rate", operation=_tenth),
    "q5": _metric("child_mortality_rate", operation=_tenth),
}
# Column layout of the final long-format table
columns_order = ["location", "year", "metric", "sex", "age", "variant", "value"]

In [41]:
# Unpivot (wide -> long): expose the index as columns, then emit one row per
# identifier combination and metric column.
df = df.reset_index().melt(
    id_vars=columns_id.keys(),
    value_vars=columns_metrics.keys(),
    var_name="metric",
    value_name="value",
)

In [42]:
# Lookups from raw metric code to sex label, age label and output name
metric_sexes = {code: spec["sex"] for code, spec in columns_metrics.items()}
metric_ages = {code: spec["age"] for code, spec in columns_metrics.items()}
metric_names = {code: spec["name"] for code, spec in columns_metrics.items()}
# Rename identifier columns, then add sex/age and normalise variant/location.
# `metric` keeps the raw codes for now: the unit scaling below is keyed by them.
df = df.rename(columns=columns_id)
df = df.assign(
    sex=df["metric"].map(metric_sexes),
    age=df["metric"].map(metric_ages),
    variant=df["variant"].apply(lambda v: v.lower()),
    location=df["location"].map(country_std),
)
# Scale units: apply each metric's transform (identity when none is configured)
ops = {
    code: spec.get("operation", lambda x: x) for code, spec in columns_metrics.items()
}
for metric_code in df["metric"].unique():
    is_metric = df["metric"] == metric_code
    df.loc[is_metric, "value"] = ops[metric_code](df.loc[is_metric, "value"])
# Metric name: replace the raw codes by the output names
df = df.assign(metric=df["metric"].map(metric_names))
# Column order, then discard unmapped regions (location is NaN after mapping)
df = df[columns_order].dropna(subset=["location"])

## Dependency ratio

In [110]:
# Pick the `dependency_ratio` table and expose its index as regular columns
df = ds["dependency_ratio"].reset_index()

In [111]:
# Column settings for the dependency-ratio table.
# Identifier columns: source column name -> output column name
columns_id = {"location": "location", "time": "year", "variant": "variant", "sex": "sex"}
# Metric columns: source column name -> output metric name
columns_metrics = {
    "annual_total_dep__ratio__0_14__and__65plus__15_64__pct": {"name": "dependency_ratio_total"},
    "annual_child_dep__ratio__0_14__15_64__pct": {"name": "dependency_ratio_child"},
    "annual_old_age_dep__ratio__65plus__15_64__pct": {"name": "dependency_ratio_old"},
}
# Source sex labels -> output sex labels
mapping_sex = {"Both": "all", "Female": "female", "Male": "male"}
# Column layout of the final long-format table
columns_order = [
    "location",
    "year",
    "metric",
    "sex",
    "age",
    "variant",
    "value",
]

In [112]:
# Unpivot (wide -> long): one row per identifier combination and metric column
df = df.melt(
    id_vars=columns_id.keys(),
    value_vars=columns_metrics.keys(),
    var_name="metric",
    value_name="value",
)

In [113]:
# Lookup from raw metric column to output metric name
metric_names = {code: spec["name"] for code, spec in columns_metrics.items()}
# Rename identifier columns, then map the column values; `age` is the constant
# "none" for every row and `value` is cast to float and rounded to 2 decimals.
df = df.rename(columns=columns_id)
df = df.assign(
    age="none",
    sex=df["sex"].map(mapping_sex),
    metric=df["metric"].map(metric_names),
    variant=df["variant"].apply(lambda v: v.lower()),
    location=df["location"].map(country_std),
    value=df["value"].astype(float).round(2),
)

In [114]:
# Apply the final column order, then drop unmapped regions (location is NaN)
df = df[columns_order].dropna(subset=["location"])

In [115]:
# Preview the processed dependency-ratio table
df

Unnamed: 0,location,year,metric,sex,age,variant,value
0,World,1950,dependency_ratio_total,female,none,estimates,65.40
1,World,1951,dependency_ratio_total,female,none,estimates,65.82
2,World,1952,dependency_ratio_total,female,none,estimates,66.37
3,World,1953,dependency_ratio_total,female,none,estimates,67.05
4,World,1954,dependency_ratio_total,female,none,estimates,67.84
...,...,...,...,...,...,...,...
692464,Wallis and Futuna,2096,dependency_ratio_old,all,none,low,93.66
692465,Wallis and Futuna,2097,dependency_ratio_old,all,none,low,94.40
692466,Wallis and Futuna,2098,dependency_ratio_old,all,none,low,95.13
692467,Wallis and Futuna,2099,dependency_ratio_old,all,none,low,96.02


## Deaths

In [88]:
# Pick the `deaths` table out of the dataset
df = ds["deaths"]

In [89]:
# Expose the index as regular columns so it can be used when melting
df = df.reset_index()

In [90]:
# Column settings for the deaths table.
# Identifier columns: source column name -> output column name
columns_id = {"location": "location", "time": "year", "variant": "variant", "sex": "sex"}
# One source column per single year of age ("_0" ... "_99"), plus an open-ended
# "_100plus" column that is labelled "100+".
columns_metrics = {
    f"_{age}": {"name": "deaths", "age": str(age)} for age in range(100)
}
columns_metrics["_100plus"] = {"name": "deaths", "age": "100+"}
# Source sex labels -> output sex labels
mapping_sex = {"Both": "all", "Female": "female", "Male": "male"}
# Column layout of the final long-format table
columns_order = [
    "location",
    "year",
    "metric",
    "sex",
    "age",
    "variant",
    "value",
]

In [91]:
# Melt (wide -> long): one row per identifier combination and age column
df = df.melt(
    id_vars=columns_id.keys(),
    value_vars=columns_metrics.keys(),
    var_name="metric",
    value_name="value",
)

In [92]:
# Add columns, rename columns.
# After the melt, `metric` holds the raw age column names; they are translated
# into age labels, and `metric` itself becomes the constant "deaths".
df = df.rename(columns=columns_id)
df = df.assign(
    sex=df.sex.map(mapping_sex),
    age=df.metric.map({k: v["age"] for k, v in columns_metrics.items()}),
    variant=df.variant.apply(lambda x: x.lower()),
    location=df.location.map(country_std),
    metric="deaths",
    # Round before the integer cast: `astype(int)` truncates, so floating-point
    # error in the multiplication (e.g. 1.001 * 1000 == 1000.9999999999999)
    # would otherwise lose a unit.
    value=(df.value * 1000).round().astype(int),
)

In [93]:
# Apply the final column order, then drop unmapped regions (location is NaN)
df = df[columns_order].dropna(subset=["location"])

In [94]:
# Preview the processed deaths table
df

Unnamed: 0,location,year,metric,sex,age,variant,value
0,World,1950,deaths,all,0,estimates,13109145
1,World,1951,deaths,all,0,estimates,13086528
2,World,1952,deaths,all,0,estimates,13215704
3,World,1953,deaths,all,0,estimates,13117346
4,World,1954,deaths,all,0,estimates,13122414
...,...,...,...,...,...,...,...
23313118,Wallis and Futuna,2096,deaths,female,100+,low,9
23313119,Wallis and Futuna,2097,deaths,female,100+,low,9
23313120,Wallis and Futuna,2098,deaths,female,100+,low,8
23313121,Wallis and Futuna,2099,deaths,female,100+,low,8
