# Births and deaths in China

### Import Python tools and Jupyter configuration

In [2]:
# Load the lab_black notebook extension (presumably auto-formats each cell with
# Black when it runs — confirm the package is installed in this environment).
%load_ext lab_black

In [3]:
import pandas as pd
import geopandas as gpd
import altair as alt
from datetime import timedelta
import numpy as np

In [4]:
# Widen pandas' display limits so wide/long frames are shown in full.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
# None removes the per-cell truncation of long values.
pd.set_option("display.max_colwidth", None)

---

### Grab table from Wikipedia

In [135]:
# Parse every HTML table on the article and keep the one at position 5.
# NOTE(review): the table is selected purely by position, so this index
# presumably needs re-checking whenever the article's layout changes.
demographics_url = "https://en.wikipedia.org/wiki/Demographics_of_China"
page_tables = pd.read_html(demographics_url)
src = page_tables[5]

In [136]:
# Map the scraped column headers to short snake_case names used in the rest
# of the notebook. Headers not present in the frame are silently ignored.
column_names = {
    "Unnamed: 0": "year",
    "Midyear population": "population",
    "Live births1": "births",
    "Deaths1": "deaths",
    "Natural change1": "change",
    "Crude birth rate (per 1000)": "birth_rate",
    "Crude death rate (per 1000)": "death_rate",
    "Natural change (per 1000)": "change_rate",
    "Total fertility rate": "fertility_rate",
}
src = src.rename(columns=column_names)

In [137]:
# Some scraped cells carry bracketed footnote markers such as "[16]".
# Match any "[<digits>]" marker instead of one hard-coded number so the
# cleanup keeps working if the article's footnotes are renumbered.
footnote_marker = r"\[\d+\]"

# Cast to str *before* using the .str accessor: .str.replace leaves NaN for
# non-string values in an object column and is unavailable on numeric columns,
# so casting afterwards (as was done previously) could not protect against either.
src["year"] = pd.to_datetime(
    src["year"].astype(str).str.replace(footnote_marker, "", regex=True)
)

# Values that cannot be parsed as numbers become NaN rather than raising.
src["change"] = pd.to_numeric(src["change"], errors="coerce").astype(float)
src["change_rate"] = pd.to_numeric(src["change_rate"], errors="coerce").astype(float)

# These two columns are expected to be clean; a non-numeric value raises here.
src["birth_rate"] = src["birth_rate"].astype(float)
src["death_rate"] = src["death_rate"].astype(float)

# Strip footnote markers before converting; missing values stay NaN.
src["fertility_rate"] = (
    src["fertility_rate"].str.replace(footnote_marker, "", regex=True).astype(float)
)

In [138]:
# Work on an independent copy ordered from the latest year to the earliest.
df = src.sort_values(by=["year"], ascending=[False]).copy()

In [139]:
# Preview the first five rows after sorting.
df.head(n=5)

Unnamed: 0,year,population,births,deaths,change,birth_rate,death_rate,change_rate,fertility_rate
71,2020-01-01,1411100000,12050000,10010000,2040000.0,8.54,7.09,1.45,1.3
70,2019-01-01,1407745000,14650000,9980000,4670000.0,10.41,7.08,3.33,
69,2018-01-01,1402760000,15230000,9930000,5300000.0,10.86,7.07,3.79,
68,2017-01-01,1396215000,17230000,9860000,7370000.0,12.34,7.04,5.3,
67,2016-01-01,1387790000,17860000,9770000,8090000.0,12.87,7.02,5.85,


In [140]:
# List the column labels to confirm the rename took effect.
df.columns

Index(['year', 'population', 'births', 'deaths', 'change', 'birth_rate',
       'death_rate', 'change_rate', 'fertility_rate'],
      dtype='object')

---

### Melt dataframe for charting

In [141]:
# Reshape from wide to long: one row per (year, measure) pair, so a chart can
# select or colour by the "measure" column.
measure_columns = [
    "births",
    "deaths",
    "change",
    "birth_rate",
    "death_rate",
    "change_rate",
    "fertility_rate",
]
df_melt = df.melt(
    id_vars="year",
    value_vars=measure_columns,
    var_name="measure",
    value_name="value",
)

In [142]:
# Count the rows available for each measure in the long-format frame.
df_melt["measure"].value_counts()

change            72
births            72
deaths            72
change_rate       72
fertility_rate    72
birth_rate        72
death_rate        72
Name: measure, dtype: int64

In [143]:
# Measure groups used to filter the long-format frame when charting:
# per-population rates and absolute counts, respectively.
rates, raw = ["birth_rate", "death_rate"], ["births", "deaths"]

In [147]:
# Bar chart of the fertility rate by year, using only the rows for that measure.
fertility_rows = df_melt[df_melt["measure"] == "fertility_rate"]
alt.Chart(fertility_rows).mark_bar(size=10).encode(
    x="year:T",
    y="value",
    color="measure",
).properties(width=650, title="Fertility rate in China")

In [150]:
# Line chart comparing the crude birth and death rates over time.
# Both series come from the source table's "(per 1000)" columns, so the
# y-axis is labelled per 1,000 (it previously read "Per 100,000").
alt.Chart(df_melt[df_melt["measure"].isin(rates)]).mark_line(size=3).encode(
    x=alt.X("year:T", axis=alt.Axis(tickCount=9), title="Year"),
    y=alt.Y("value", axis=alt.Axis(tickCount=5, title="Per 1,000")),
    color="measure",
).properties(width=650, title="Birth rate vs. death rate in China")

---

### Export

In [70]:
# Write the wide (un-melted) frame to disk without the index column.
export_path = "data/processed/china_births_deaths_rates.csv"
df.to_csv(export_path, index=False)