In [37]:
from typing import List


import pandas as pd
from etl.paths import BASE_DIR as base_path

from owid import catalog
from owid.catalog import Table
from owid.catalog.meta import TableMeta

YEAR_SPLIT = 2022

In [38]:
from un_wpp import *

In [39]:
meadow_path = base_path / "data/meadow/un/2022/un_wpp"
ds = catalog.Dataset(meadow_path)

In [40]:
country_std = load_country_mapping()

In [41]:
from population import process as process_population
from fertility import process as process_fertility
from demographics import process as process_demographics
from dep_ratio import process as process_depratio
from deaths import process as process_deaths

## deaths

In [42]:
from deaths import *
from dtypes import optimize_dtypes

In [43]:
df = ds["deaths"]

In [44]:
df = df.reset_index()
# Melt
df = df.melt(COLUMNS_ID.keys(), COLUMNS_METRICS.keys(), "metric", "value")
# Add columns, rename columns
df = df.rename(columns=COLUMNS_ID)
df = df.assign(
    sex=df.sex.map(MAPPING_SEX),
    age=df.metric.map({k: v["age"] for k, v in COLUMNS_METRICS.items()}),
    variant=df.variant.apply(lambda x: x.lower()),
    location=df.location.map(country_std),
    metric="deaths",
    value=(df.value * 1000).astype(int),
)

In [45]:
df.isna().sum()

location    187254
year             0
variant          0
sex              0
metric           0
value            0
age              0
dtype: int64

In [29]:
df = optimize_dtypes(df, simple=True)

In [31]:
df.memory_usage(deep=True) / 1e6

Index         0.000128
location     46.650980
year         46.626246
variant      23.313545
sex          23.313415
metric       23.313294
value       186.504984
age          23.323242
dtype: float64

In [25]:
df.dtypes

location      object
year          uint64
variant     category
sex         category
metric        object
value          int64
age           object
dtype: object

In [26]:
df_0 = df[df.age == "0"].copy()

In [27]:
df_0.memory_usage(deep=True) / 1e6

Index        1.846584
location    15.297354
year         1.846584
variant      0.231245
sex          0.231115
metric      14.541849
value        1.846584
age         13.387734
dtype: float64

In [32]:
df = add_age_groups(df)

In [36]:
df[df.location.isna()]

Unnamed: 0,location,year,metric,sex,age,variant,value
2747772,,1950,deaths,all,0,estimates,10915307
2747773,,1951,deaths,all,0,estimates,10993599
2747774,,1952,deaths,all,0,estimates,11209735
2747775,,1953,deaths,all,0,estimates,11158563
2747776,,1954,deaths,all,0,estimates,11206141
...,...,...,...,...,...,...,...
2959328,,2096,deaths,female,0,low,176
2959329,,2097,deaths,female,0,low,171
2959330,,2098,deaths,female,0,low,166
2959331,,2099,deaths,female,0,low,161


In [14]:
df.dtypes

location      object
year          uint64
metric        object
sex         category
age           object
variant     category
value          int64
dtype: object

In [8]:
df_deaths.isna().sum()

location    0
year        0
metric      0
sex         0
age         0
variant     0
value       0
dtype: int64

In [9]:
df_deaths.variant.unique()

['estimates', 'medium', 'high', 'low']
Categories (11, object): ['estimates', 'medium', 'high', 'low', ..., 'constant mortality', 'no change', 'momentum', 'instant replacement zero migration']

## demographics

In [10]:
df_demographics = process_demographics(ds["demographics"], country_std)

In [11]:
df_demographics.memory_usage(deep=True).sum() / 1e6

127.198849

In [12]:
df_demographics.isna().sum()

location        0
year            0
metric          0
sex             0
age             0
variant         0
value       69440
dtype: int64

## fertility

In [13]:
df_fertility = process_fertility(ds["fertility"], country_std)

In [14]:
df_fertility.memory_usage(deep=True).sum() / 1e6

73.145899

In [15]:
df_fertility.isna().sum()

location    0
year        0
metric      0
sex         0
age         0
variant     0
value       0
dtype: int64

## dep ratio

In [16]:
df_depratio = process_depratio(ds["dependency_ratio"], country_std)

In [17]:
df_depratio.memory_usage(deep=True).sum() / 1e6

17.212996

In [18]:
df_depratio.isna().sum()

location    0
year        0
metric      0
sex         0
age         0
variant     0
value       0
dtype: int64

## population

In [19]:
df_population = process_population(ds["population"], country_std)

In [20]:
df_population.memory_usage(deep=True).sum() / 1e6

120.626343

In [21]:
df_population.isna().sum()

location       0
year           0
metric         0
sex            0
age            0
variant        0
value       7202
dtype: int64

## merge

In [22]:
# merge
df = merge_dfs([df_population, df_fertility, df_demographics, df_depratio, df_deaths])

41
42
46


In [39]:
table_long = df_to_table(
    df,
    short_name="long",
    description=(
        "Main UN WPP dataset by OWID. It comes in 'long' format, i.e. column"
        " 'metric' gives the metric name and column 'value' its corresponding"
        " value."
    ),
)

In [41]:
metric_categories = {
    "migration": [
        "net_migration",
        "net_migration_rate",
    ],
    "fertility": [
        "fertility_rate",
        "births",
        "birth_rate",
    ],
    "population": [
        "population",
        "population_density",
        "population_change",
        "population_broad",
    ],
    "mortality": [
        "deaths",
        "death_rate",
        "life_expectancy",
        "child_mortality_rate",
        "infant_mortality_rate",
    ],
    "demographic": [
        "median_age",
        "growth_natural_rate",
        "growth_rate",
        "sex_ratio",
    ],
}

In [42]:
tables = []
for category, metrics in metric_categories.items():
    print(category)
    df_c = df.query(f"metric in {metrics}")
    tables.append(
        df_to_table(
            df_c,
            short_name=category,
            description=(
                "UN WPP dataset by OWID. Contains only metrics corresponding to"
                f" sub-group {category}"
            ),
        )
    )

migration
fertility
population
mortality
demographic


In [43]:
tables += [table_long]

In [45]:
dataset_to_garden(tables, ds.metadata, "./test")

In [31]:
c = [
    "net_migration",
    "net_migration_rate",
]

In [37]:
f"metric in {c}"

"metric in ['net_migration', 'net_migration_rate']"

In [38]:
df.query(f"metric in {c}")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,value
location,year,metric,sex,age,variant,Unnamed: 6_level_1
World,1950,net_migration,all,all,estimates,0.0
World,1951,net_migration,all,all,estimates,0.0
World,1952,net_migration,all,all,estimates,0.0
World,1953,net_migration,all,all,estimates,0.0
World,1954,net_migration,all,all,estimates,0.0
...,...,...,...,...,...,...
Wallis and Futuna,2096,net_migration_rate,all,all,instant replacement zero migration,0.0
Wallis and Futuna,2097,net_migration_rate,all,all,instant replacement zero migration,0.0
Wallis and Futuna,2098,net_migration_rate,all,all,instant replacement zero migration,0.0
Wallis and Futuna,2099,net_migration_rate,all,all,instant replacement zero migration,0.0


In [23]:
df_wide = get_wide_df(df)

In [89]:
dfs = [df_population, df_fertility, df_demographics, df_depratio, df_deaths]

In [90]:
df = pd.concat(dfs, ignore_index=True)

In [91]:
df.memory_usage(deep=True).sum() / 1e6

724.665449

In [29]:
df.reset_index().metric.unique().tolist()

['sex_ratio',
 'population',
 'population_broad',
 'population_change',
 'fertility_rate',
 'births',
 'population_density',
 'growth_rate',
 'growth_natural_rate',
 'birth_rate',
 'deaths',
 'death_rate',
 'median_age',
 'life_expectancy',
 'net_migration',
 'net_migration_rate',
 'infant_mortality_rate',
 'child_mortality_rate',
 'dependency_ratio_total',
 'dependency_ratio_child',
 'dependency_ratio_old']

In [92]:
df.loc[df.year < YEAR_SPLIT, "variant"] = "estimates"

In [87]:
df = df.set_index(["location", "year", "metric", "sex", "age", "variant"])

In [72]:
df.memory_usage(deep=True).sum() / 1e6

724.671824

In [73]:
df = df.dropna(subset=["value"])

In [74]:
df.memory_usage(deep=True).sum() / 1e6

723.36891

In [66]:
df.memory_usage(deep=True).sum() / 1e6

723.368902

---

In [15]:
from population import *
from population import (
    _add_metric_sexratio,
    _add_metric_population,
    _add_metric_population_change,
)

In [16]:
df = pd.DataFrame(ds["population"])
df = df.reset_index()
df = df.assign(location=df.location.map(country_std).astype("category"))
# Discard unmapped regions
df = df.dropna(subset=["location"])
# Estimate sex_ratio
df = df.assign(sex_ratio=(100 * df.popmale / df.popfemale).round(2))
# Unpivot
df = df.melt(COLUMNS_ID.keys(), COLUMNS_METRICS.keys(), "metric", "value")
# Rename columns
df = df.rename(columns=COLUMNS_ID)
# dtypes
df = df.astype({"metric": "category", "year": "uint16"})
# Scale units
ops = {k: v.get("operation", lambda x: x) for k, v in COLUMNS_METRICS.items()}
for m in df.metric.unique():
    df.loc[df.metric == m, "value"] = ops[m](df.loc[df.metric == m, "value"])
# Column value mappings
df = df.assign(
    metric=df.metric.map({k: v["name"] for k, v in COLUMNS_METRICS.items()}).astype(
        "category"
    ),
    sex=df.metric.map({k: v["sex"] for k, v in COLUMNS_METRICS.items()}).astype(
        "category"
    ),
    variant=df.variant.apply(lambda x: x.lower()).astype("category"),
)
# Column order
df = df[COLUMNS_ORDER]

In [17]:
df.memory_usage(deep=True).sum() / 1e6

370.049835

In [18]:
df.dtypes

location    category
year          uint16
metric      category
sex         category
age         category
variant     category
value        float32
dtype: object

In [19]:
from dtypes import optimize_dtypes

In [20]:
df_sr = _add_metric_sexratio(df)
df_p_granular, df_p_broad = _add_metric_population(df)
df_p_diff = _add_metric_population_change(df_p_granular)

In [23]:
df_sr = optimize_dtypes(df_sr)
df_p_granular = optimize_dtypes(df_p_granular)
df_p_broad = optimize_dtypes(df_p_broad)
df_p_diff = optimize_dtypes(df_p_diff)

In [25]:
df_sr.memory_usage(deep=True).sum() / 1e6

17.670934

In [26]:
df_p_granular.memory_usage(deep=True).sum() / 1e6

47.666001

In [27]:
df_p_broad.memory_usage(deep=True).sum() / 1e6

14.923434

In [28]:
df_p_diff.memory_usage(deep=True).sum() / 1e6

76.226977

In [29]:
x = pd.concat([df_sr, df_p_granular, df_p_broad, df_p_diff], ignore_index=True)

In [31]:
x.memory_usage(deep=True).sum() / 1e6

120.626343

In [37]:
df_p_granular.shape

(3663504, 7)

In [42]:
pop_diff = (
    df_p_granular.sort_values("year")
    .groupby(["location", "sex", "age", "variant"])[["value"]]
    .diff()
    .assign(metric="population_change")
    .astype({"metric": "category"})
)

In [46]:
pop_diff.shape

(3663504, 2)

In [47]:
df_p_granular.shape

(3663504, 7)

In [48]:
df_p_diff = pd.concat(
    [
        df_p_granular[
            [col for col in df_p_granular.columns if col not in ["value", "metric"]]
        ],
        pop_diff,
    ],
    axis=1,
).dropna(subset="value")

In [49]:
df_p_diff.shape

(3627936, 7)

In [50]:
df_p_diff.memory_usage(deep=True).sum() / 1e6

72.58588

---

In [6]:
from population import *
from population import (
    _add_metric_sexratio,
    _add_metric_population,
    _add_metric_population_change,
)

In [7]:
df = ds["population"]

In [8]:
df.memory_usage(deep=True).sum() / 1e6

287.701873

In [9]:
df = pd.DataFrame(df)
df = df.reset_index()
df = df.assign(location=df.location.map(country_std).astype("category"))
df = df.dropna(subset=["location"]).reset_index(drop=True)

In [10]:
df.memory_usage(deep=True).sum() / 1e6

331.63476

In [11]:
# Estimate sex_ratio
df = df.assign(sex_ratio=(100 * df.popmale / df.popfemale).round(2))
# Unpivot
df = df.melt(COLUMNS_ID.keys(), COLUMNS_METRICS.keys(), "metric", "value")

In [12]:
df = df.rename(columns=COLUMNS_ID)

In [13]:
df = df.astype({"metric": "category", "year": "uint16"})

In [14]:
df.memory_usage(deep=True).sum() / 1e6

339.215118

In [15]:
ops = {k: v.get("operation", lambda x: x) for k, v in COLUMNS_METRICS.items()}
for m in df.metric.unique():
    df.loc[df.metric == m, "value"] = ops[m](df.loc[df.metric == m, "value"])

In [16]:
df.memory_usage(deep=True).sum() / 1e6

339.215118

In [17]:
df = df.assign(
    metric=df.metric.map({k: v["name"] for k, v in COLUMNS_METRICS.items()}).astype(
        "category"
    ),
    sex=df.metric.map({k: v["sex"] for k, v in COLUMNS_METRICS.items()}).astype(
        "category"
    ),
    variant=df.variant.apply(lambda x: x.lower()).astype("category"),
)

In [18]:
df.memory_usage(deep=True).sum() / 1e6

370.049835

In [19]:
df = df[COLUMNS_ORDER]

In [20]:
df_sr = _add_metric_sexratio(df)

In [21]:
df_sr.memory_usage(deep=True).sum() / 1e6

16.826863

In [67]:
df_p = df[df.metric == "population"]
# Basic age groups
age_map = {
    **{str(i): f"{i - i%5}-{i + 4 - i%5}" for i in range(0, 20)},
    **{str(i): f"{i - i%10}-{i + 9 - i%10}" for i in range(20, 100)},
    **{"100+": "100-"},
}
df_p_granular = df_p.assign(age=df_p.age.map(age_map).astype("category"))
df_p_granular = df_p_granular.groupby(
    ["location", "year", "metric", "sex", "age", "variant"],
    as_index=False,
    observed=True,
).sum()
df_p_granular = optimize_dtypes(df_p_granular, simple=True)
# Additional age groups
# <1
df_p_0 = df_p[df_p.age == "0"].copy()
df_p_0 = optimize_dtypes(df_p_0, simple=True)
# 1-4
df_p_1_4 = df_p[df_p.age.isin(["1", "2", "3", "4"])].copy()
df_p_1_4 = (
    df_p_1_4.groupby(
        ["location", "year", "metric", "sex", "variant"],
        as_index=False,
        observed=True,
    )
    .sum()
    .assign(age="1-4")
)
df_p_1_4 = optimize_dtypes(df_p_1_4, simple=True)
# all
df_p_all = (
    df_p.groupby(
        ["location", "year", "metric", "sex", "variant"],
        as_index=False,
        observed=True,
    )
    .value.sum()
    .assign(age="all")
)
df_p_all = optimize_dtypes(df_p_all, simple=True)

In [23]:
df_p_0.memory_usage(deep=True).sum() / 1e6

4.615183

In [24]:
df_p_0.dtypes

location    category
year          uint16
metric      category
sex         category
age         category
variant     category
value        float32
dtype: object

In [25]:
df_p_1_4.memory_usage(deep=True).sum() / 1e6

2.773608

In [26]:
df_p_1_4.dtypes

location    category
year          uint16
metric      category
sex         category
variant     category
value        float32
age         category
dtype: object

In [28]:
df_p_all.memory_usage(deep=True).sum() / 1e6

2.773608

In [29]:
df_p_all.dtypes

location    category
year          uint16
metric      category
sex         category
variant     category
value        float32
age         category
dtype: object

In [30]:
df_p_granular.memory_usage(deep=True).sum() / 1e6

35.746333

In [31]:
df_p_granular.dtypes

location    category
year          uint16
metric      category
sex         category
age         category
variant     category
value        float32
dtype: object

In [68]:
x = pd.concat([df_p_granular, df_p_0, df_p_1_4, df_p_all], ignore_index=True).astype(
    {"age": "category"}
)

In [37]:
x.memory_usage(deep=True).sum() / 1e6

43.989395

In [34]:
x.dtypes

location    category
year          uint16
metric      category
sex         category
age           object
variant     category
value        float32
dtype: object

In [69]:
df_p_broad = df_p.assign(age=df_p.age.map(map_broad_age).astype("category"))

In [70]:
df_p_broad = df_p_broad.groupby(
    ["location", "year", "metric", "sex", "age", "variant"],
    as_index=False,
    observed=True,
).sum()

In [71]:
df_p_broad = df_p.assign(age=df_p.age.map(map_broad_age).astype("category"))
df_p_broad = df_p_broad.groupby(
    ["location", "year", "metric", "sex", "age", "variant"],
    as_index=False,
    observed=True,
).sum()
df_p_broad = df_p_broad.assign(metric="population_broad").astype({"metric": "category"})

In [72]:
df_p_broad.memory_usage(deep=True).sum() / 1e6

13.764369

In [82]:
pop_diff = (
    df_p_granular.sort_values("year")
    .groupby(["location", "sex", "age", "variant"], observed=True)[["value"]]
    .diff()
    .assign(metric="population_change")
    .astype({"metric": "category"})
)
# pop_diff = optimize_dtypes(pop_diff, simple=True)

In [84]:
df_p_diff = pd.concat(
    [
        df_p_granular[
            [col for col in df_p_granular.columns if col not in ["value", "metric"]]
        ],
        pop_diff,
    ],
    axis=1,
).dropna(subset="value")

In [85]:
df_p_diff.memory_usage(deep=True).sum() / 1e6

58.980942

In [61]:
df_p_diff = df_p_diff.astype({"metric": "category"})

In [63]:
df_p_diff.dtypes

location    category
year          uint16
sex         category
age         category
variant     category
value        float32
metric      category
dtype: object

In [59]:
df_p_granular.shape

(2976597, 7)

In [169]:
df_p_granular, df_p_broad = _add_metric_population(df)

['0', '1', '2', '3', '4', ..., '96', '97', '98', '99', '100+']
Length: 101
Categories (101, object): ['0', '1', '10', '100+', ..., '96', '97', '98', '99']
0
1
10
100+
11
12
13
14
15
16
17
18
19
2
20
21
22
23
24
25
26
27
28
29
3
30
31
32
33
34
35
36
37
38
39
4
40
41
42
43
44
45
46
47
48
49
5
50
51
52
53
54
55
56
57
58
59
6
60
61
62
63
64
65
66
67
68
69
7
70
71
72
73
74
75
76
77
78
79
8
80
81
82
83
84
85
86
87
88
89
9
90
91
92
93
94
95
96
97
98
99


In [173]:
df_p_granular = df_p_granular.astype({"age": "category"})

In [174]:
df_p_granular.memory_usage(deep=True).sum() / 1e6

43.989395

In [170]:
df_p_granular.memory_usage(deep=True).sum() / 1e6

164.883251

In [14]:
# Scale units
ops = {k: v.get("operation", lambda x: x) for k, v in COLUMNS_METRICS.items()}
for m in df.metric.unique():
    df.loc[df.metric == m, "value"] = ops[m](df.loc[df.metric == m, "value"])

In [15]:
df = df.rename(columns=COLUMNS_ID)
df = df.assign(
    metric=df.metric.map({k: v["name"] for k, v in COLUMNS_METRICS.items()}),
    sex=df.metric.map({k: v["sex"] for k, v in COLUMNS_METRICS.items()}),
    variant=df.variant.apply(lambda x: x.lower()),
)

In [18]:
df = df[COLUMNS_ORDER]

In [19]:
df = df.dropna(subset=["location"])

In [21]:
df_sr = _add_metric_sexratio(df)

In [23]:
df_p_granular, df_p_broad = _add_metric_population(df)

['0', '1', '2', '3', '4', ..., '96', '97', '98', '99', '100+']
Length: 101
Categories (101, object): ['0', '1', '10', '100+', ..., '96', '97', '98', '99']
0
1
10
100+
11
12
13
14
15
16
17
18
19
2
20
21
22
23
24
25
26
27
28
29
3
30
31
32
33
34
35
36
37
38
39
4
40
41
42
43
44
45
46
47
48
49
5
50
51
52
53
54
55
56
57
58
59
6
60
61
62
63
64
65
66
67
68
69
7
70
71
72
73
74
75
76
77
78
79
8
80
81
82
83
84
85
86
87
88
89
9
90
91
92
93
94
95
96
97
98
99


In [25]:
df_p_broad.age.unique()

array(['0-4', '15-24', '25-64', '5-14', '65-'], dtype=object)

In [None]:
df_population = process_population(ds["population"], country_std)

1


In [None]:
df_population.memory_usage(deep=True).sum() / 1e6

In [8]:
df_population.dtypes

location    category
year        category
metric        object
sex         category
age           object
variant     category
value        float32
dtype: object

In [9]:
df_deaths[df_deaths.value == 0].age.value_counts()

9     66871
10    66544
8     66290
11    65561
7     64867
      ...  
79     2692
83     2690
80     2598
82     2581
81     2545
Name: age, Length: 101, dtype: int64

---

In [1]:
from typing import List


import pandas as pd
from etl.paths import BASE_DIR as base_path

from owid import catalog
from owid.catalog import Table
from owid.catalog.meta import TableMeta

YEAR_SPLIT = 2022

In [2]:
from un_wpp import *

In [3]:
print(1)
meadow_path = base_path / "data/meadow/un/2022/un_wpp"
ds = catalog.Dataset(meadow_path)

1


In [42]:
CategoricalDtype(categories=range(1950, 2102))

CategoricalDtype(categories=range(1950, 2101), ordered=False)

In [4]:
country_std = load_country_mapping()

In [12]:
for t in ds.table_names:
    print(t, ds[t].memory_usage(deep=True).sum() / 1e6)

deaths 107.438333
demographics 51.612832
dependency_ratio 32.651681
fertility 68.598587
population 287.701873


## deaths

In [15]:
from deaths import process as process_deaths

In [19]:
ds["deaths"].memory_usage(deep=True).sum() / 1e6

107.438333

In [21]:
df_deaths = process_deaths(ds["deaths"], country_std)

In [22]:
df_deaths.memory_usage(deep=True).sum() / 1e6

4959.8382

In [23]:
ds["deaths"].shape

(230823, 108)

In [24]:
df_deaths.shape

(23125869, 7)

In [33]:
df_deaths = df_deaths.astype(
    {
        "location": "category",
        "metric": "category",
        "age": "category",
        "year": "category",
    }
)

In [34]:
df_deaths.memory_usage(deep=True).sum() / 1e6

555.06197

In [36]:
df_deaths.value

0           13109145
1           13086528
2           13215704
3           13117346
4           13122414
              ...   
23313118           9
23313119           9
23313120           8
23313121           8
23313122           8
Name: value, Length: 23125869, dtype: int64

## fertility

In [None]:
# df_fertility = process_fertility(ds["fertility"], country_std)

In [8]:
import pandas as pd
from typing import Dict, List, Any


# rename columns
COLUMNS_ID: Dict[str, str] = {
    "location": "location",
    "time": "year",
    "variant": "variant",
    "agegrp": "age",
}
COLUMNS_METRICS: Dict[str, Dict[str, Any]] = {
    "asfr": {
        "name": "fertility_rate",
        "sex": "all",
        "operation": lambda x: x,  # (x).round(2),
    },
    "births": {
        "name": "births",
        "sex": "all",
        "operation": lambda x: (x * 1000),
    },
}
COLUMNS_ORDER: List[str] = [
    "location",
    "year",
    "metric",
    "sex",
    "age",
    "variant",
    "value",
]

In [9]:
df = ds["fertility"]

In [63]:
ds["deaths"]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,notes,locid,iso3_code,iso2_code,sdmx_code,loctypename,parentid,_0,_1,_2,...,_91,_92,_93,_94,_95,_96,_97,_98,_99,_100plus
location,time,variant,sex,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
WORLD,1950,Estimates,Both,,900,,,1.0,World,0,13109.144531,3167.165039,1712.212036,...,80.033997,63.948002,48.561001,33.875000,23.332001,18.841000,14.687,10.589,6.988,8.169
WORLD,1951,Estimates,Both,,900,,,1.0,World,0,13086.528320,3165.274902,1755.954956,...,80.968002,61.313000,46.824001,34.838001,23.205999,15.894000,12.341,9.258,6.472,8.814
WORLD,1952,Estimates,Both,,900,,,1.0,World,0,13215.704102,3110.709961,1723.560059,...,78.932999,60.748001,44.159000,32.875999,23.612000,15.530000,10.346,7.746,5.634,8.841
WORLD,1953,Estimates,Both,,900,,,1.0,World,0,13117.345703,3111.235107,1718.802002,...,78.752998,61.493000,45.933998,32.474998,23.461000,16.488001,10.610,6.896,4.991,8.757
WORLD,1954,Estimates,Both,,900,,,1.0,World,0,13122.414062,3110.542969,1723.839966,...,77.755997,60.405998,45.574001,33.095001,22.931999,16.193001,11.073,6.991,4.446,8.318
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wallis and Futuna Islands,2096,Low,Female,2,876,WLF,WF,876.0,Country/Area,957,0.000000,0.000000,0.000000,...,0.003000,0.003000,0.003000,0.003000,0.002000,0.002000,0.001,0.001,0.002,0.009
Wallis and Futuna Islands,2097,Low,Female,2,876,WLF,WF,876.0,Country/Area,957,0.000000,0.000000,0.000000,...,0.003000,0.003000,0.003000,0.003000,0.003000,0.002000,0.002,0.001,0.001,0.009
Wallis and Futuna Islands,2098,Low,Female,2,876,WLF,WF,876.0,Country/Area,957,0.000000,0.000000,0.000000,...,0.003000,0.003000,0.003000,0.003000,0.003000,0.002000,0.002,0.002,0.001,0.008
Wallis and Futuna Islands,2099,Low,Female,2,876,WLF,WF,876.0,Country/Area,957,0.000000,0.000000,0.000000,...,0.003000,0.003000,0.003000,0.003000,0.003000,0.003000,0.002,0.002,0.002,0.008


In [None]:
df_deaths = process_deaths(ds["deaths"], country_std)

In [45]:
df_population = process_population(ds["population"], country_std)
df_fertility = process_fertility(ds["fertility"], country_std)
df_demographics = process_demographics(ds["demographics"], country_std)
df_depratio = process_depratio(ds["dependency_ratio"], country_std)
df_deaths = process_deaths(ds["deaths"], country_std)

In [47]:
dfs = [df_population, df_fertility, df_demographics, df_deaths, df_depratio]

In [53]:
v = []
for x in dfs:
    # print(x.dtypes)
    vv = x.age.unique()
    v.extend(vv)

In [70]:
df_fertility

Unnamed: 0,location,year,metric,sex,age,variant,value
0,World,1950,fertility_rate,all,10-14,medium,4.539000
1,World,1950,fertility_rate,all,15-19,medium,91.083000
2,World,1950,fertility_rate,all,20-24,medium,229.070007
3,World,1950,fertility_rate,all,25-29,medium,240.572998
4,World,1950,fertility_rate,all,30-34,medium,195.835999
...,...,...,...,...,...,...,...
3509401,Wallis and Futuna,2100,births,all,30-34,momentum,54.000000
3509402,Wallis and Futuna,2100,births,all,35-39,momentum,30.000000
3509403,Wallis and Futuna,2100,births,all,40-44,momentum,8.000000
3509404,Wallis and Futuna,2100,births,all,45-49,momentum,0.000000


In [56]:
{
    **{str(v) for v in range(100)},
    **{
        "10-14",
        "15-19",
        "20-24",
        "25-29",
        "30-34",
        "35-39",
        "40-44",
        "45-49",
        "50-54",
    },
    **{
        "0-4",
        "1-4",
        "5-14",
        "15-24",
        "20-29",
        "25-64",
        "30-39",
        "40-49",
        "50-59",
        "60-69",
        "65-",
        "70-79",
        "80-89",
        "90-99",
        "100-",
        "100+",  # borrar
    },
    **{"all", "at birth", "15", "65", "80"},
    **{
        "none",
    },
}

{'0',
 '1',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '2',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '3',
 '30',
 '31',
 '32',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '4',
 '40',
 '41',
 '42',
 '43',
 '44',
 '45',
 '46',
 '47',
 '48',
 '49',
 '5',
 '50',
 '51',
 '52',
 '53',
 '54',
 '55',
 '56',
 '57',
 '58',
 '59',
 '6',
 '60',
 '61',
 '62',
 '63',
 '64',
 '65',
 '66',
 '67',
 '68',
 '69',
 '7',
 '70',
 '71',
 '72',
 '73',
 '74',
 '75',
 '76',
 '77',
 '78',
 '79',
 '8',
 '80',
 '81',
 '82',
 '83',
 '84',
 '85',
 '86',
 '87',
 '88',
 '89',
 '9',
 '90',
 '91',
 '92',
 '93',
 '94',
 '95',
 '96',
 '97',
 '98',
 '99'}

In [54]:
set(v)

{'0',
 '0-4',
 '1',
 '1-4',
 '10',
 '10-14',
 '100+',
 '100-',
 '11',
 '12',
 '13',
 '14',
 '15',
 '15-19',
 '15-24',
 '16',
 '17',
 '18',
 '19',
 '2',
 '20',
 '20-24',
 '20-29',
 '21',
 '22',
 '23',
 '24',
 '25',
 '25-29',
 '25-64',
 '26',
 '27',
 '28',
 '29',
 '3',
 '30',
 '30-34',
 '30-39',
 '31',
 '32',
 '33',
 '34',
 '35',
 '35-39',
 '36',
 '37',
 '38',
 '39',
 '4',
 '40',
 '40-44',
 '40-49',
 '41',
 '42',
 '43',
 '44',
 '45',
 '45-49',
 '46',
 '47',
 '48',
 '49',
 '5',
 '5-14',
 '5-9',
 '50',
 '50-54',
 '50-59',
 '51',
 '52',
 '53',
 '54',
 '55',
 '56',
 '57',
 '58',
 '59',
 '6',
 '60',
 '60-69',
 '61',
 '62',
 '63',
 '64',
 '65',
 '65-',
 '66',
 '67',
 '68',
 '69',
 '7',
 '70',
 '70-79',
 '71',
 '72',
 '73',
 '74',
 '75',
 '76',
 '77',
 '78',
 '79',
 '8',
 '80',
 '80-89',
 '81',
 '82',
 '83',
 '84',
 '85',
 '86',
 '87',
 '88',
 '89',
 '9',
 '90',
 '90-99',
 '91',
 '92',
 '93',
 '94',
 '95',
 '96',
 '97',
 '98',
 '99',
 'all',
 'at birth',
 'none'}

In [3]:
print(1)
meadow_path = base_path / "data/meadow/un/2022/un_wpp"
ds = catalog.Dataset(meadow_path)
# country rename
country_std = load_country_mapping()
print(2)
# pocess
df_population = process_population(ds["population"], country_std)
df_fertility = process_fertility(ds["fertility"], country_std)
df_demographics = process_demographics(ds["demographics"], country_std)
df_depratio = process_depratio(ds["dependency_ratio"], country_std)
df_deaths = process_deaths(ds["deaths"], country_std)
print(3)
# merge
df = merge_dfs([df_population, df_fertility, df_demographics, df_depratio, df_deaths])

1
2
3
41
42
43
44
45
46


In [None]:
print(4)
# wide format
df_wide = get_wide_df(df)

4


In [7]:
df_depratio.dtypes

location      object
year          uint64
metric        object
sex         category
age           object
variant     category
value        float64
dtype: object

In [44]:
df_fertility

NameError: name 'df_fertility' is not defined