# Births and deaths in China

### Import Python tools and Jupyter configuration

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import geopandas as gpd
import altair as alt
from datetime import timedelta
import numpy as np

In [3]:
# Loosen pandas' display limits so wide/long frames are shown without truncation.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)  # None = never truncate cell text

---

### Grab table from Wikipedia

In [4]:
# Scrape every HTML table on the Wikipedia page and keep the one at position 5.
# The result is bound to `src` because all of the cleaning cells below operate
# on `src`; `df` is only created later from the cleaned frame.
# NOTE(review): the positional index depends on the article's current layout —
# confirm the selected table still has the expected columns if the page changes.
src = pd.read_html("https://en.wikipedia.org/wiki/Demographics_of_China")[5]

In [5]:
# Map the scraped table headers to short snake_case names.
# NOTE(review): the trailing "1" in some headers is presumably a footnote
# marker fused into the header text by the scrape — confirm against the page.
column_names = {
    "Unnamed: 0": "year",
    "Midyear population": "population",
    "Live births1": "births",
    "Deaths1": "deaths",
    "Natural change1": "change",
    "Crude birth rate (per 1000)": "birth_rate",
    "Crude death rate (per 1000)": "death_rate",
    "Natural change (per 1000)": "change_rate",
    "Total fertility rate": "fertility_rate",
}
src.rename(columns=column_names, inplace=True)

In [6]:
# Preview the first five rows to check the renamed columns.
src.head(n=5)

Unnamed: 0,year,population,births,deaths,change,birth_rate,death_rate,change_rate,fertility_rate
0,1949,537371000,19345000,10747000,8598000,36.0,20.0,16.0,
1,1950,546815000,20232000,9843000,10389000,37.0,18.0,19.0,5.29
2,1951,557480000,21073000,9923000,11150000,37.8,17.8,20.0,
3,1952,568910000,21050000,9671000,11379000,37.0,17.0,20.0,
4,1953,581390000,21511000,8139000,13372000,37.0,14.0,23.0,


In [7]:
# Remove Wikipedia footnote markers such as "[16]" from the year labels.
# Any "[<digits>]" marker is stripped rather than one hard-coded number, so the
# cell keeps working when the article's references are renumbered.
src["year"] = src["year"].astype(str).str.replace(r"\[\d+\]", "", regex=True)

In [8]:
# Strip "[<digits>]" footnote markers from the fertility rate, then convert the
# column back to numbers. Casting to str turns missing values into the literal
# text "nan"; pd.to_numeric restores them as real NaN so the column is float
# (not text) when it is melted, charted and exported.
src["fertility_rate"] = pd.to_numeric(
    src["fertility_rate"].astype(str).str.replace(r"\[\d+\]", "", regex=True),
    errors="coerce",
)

In [9]:
# Fill missing values with NaN, in place.
# NOTE(review): replacing missing values with NaN looks like a no-op for
# numeric columns — confirm whether this cell is still needed.
src.fillna(value=np.nan, inplace=True)

In [10]:
# Year labels become timestamps.
src["year"] = pd.to_datetime(src["year"])

# These columns may hold non-numeric text; anything unparseable becomes NaN.
for column in ("change", "change_rate"):
    src[column] = pd.to_numeric(src[column], errors="coerce").astype(float)

# The crude rates are cast straight to float.
for column in ("birth_rate", "death_rate"):
    src[column] = src[column].astype(float)

In [11]:
# Work on an independent copy ordered from the most recent year to the oldest.
newest_first = src.sort_values(by="year", ascending=False)
df = newest_first.copy()

In [12]:
# Preview the five most recent years.
df.head(n=5)

Unnamed: 0,year,population,births,deaths,change,birth_rate,death_rate,change_rate,fertility_rate
72,2021-01-01,1412600000,10620000,10140000,480000.0,7.52,7.18,0.36,
71,2020-01-01,1411100000,12020000,9970000,2050000.0,8.52,7.07,1.45,1.3
70,2019-01-01,1407745000,14650000,9980000,4670000.0,10.41,7.09,3.32,
69,2018-01-01,1402760000,15230000,9930000,5300000.0,10.86,7.08,3.78,
68,2017-01-01,1396215000,17650000,9860000,7790000.0,12.64,7.06,5.58,


In [13]:
# List the column names available for melting and charting.
df.columns

Index(['year', 'population', 'births', 'deaths', 'change', 'birth_rate',
       'death_rate', 'change_rate', 'fertility_rate'],
      dtype='object')

---

### Melt dataframe for charting

In [14]:
# Reshape wide -> long: one row per (year, measure) pair, which is the layout
# the Altair charts below filter on.
measures = [
    "births",
    "deaths",
    "change",
    "birth_rate",
    "death_rate",
    "change_rate",
    "fertility_rate",
]
df_melt = df.melt(
    id_vars="year",
    value_vars=measures,
    var_name="measure",
    value_name="value",
)

In [15]:
# Sanity check: every measure should have the same number of rows.
df_melt["measure"].value_counts()

births            73
deaths            73
change            73
birth_rate        73
death_rate        73
change_rate       73
fertility_rate    73
Name: measure, dtype: int64

In [16]:
# Measure groups used to filter the melted frame: per-capita rates vs. raw counts.
rates, raw = ["birth_rate", "death_rate"], ["births", "deaths"]

In [17]:
# Bar chart of the fertility rate over time.
fertility_points = df_melt[df_melt["measure"] == "fertility_rate"]
(
    alt.Chart(fertility_points)
    .mark_bar(size=10)
    .encode(x="year:T", y="value:Q", color="measure")
    .properties(width=650, title="Fertility rate in China")
)

In [18]:
# Line chart comparing the crude birth and death rates over time.
# The source columns are "Crude birth/death rate (per 1000)", so the y-axis is
# labelled per 1,000 (it previously read "Per 100,000"). The value field is
# typed explicitly as quantitative (:Q) to match the fertility chart rather
# than relying on type inference.
alt.Chart(df_melt[df_melt["measure"].isin(rates)]).mark_line(size=3).encode(
    x=alt.X("year:T", axis=alt.Axis(tickCount=9), title="Year"),
    y=alt.Y("value:Q", axis=alt.Axis(tickCount=5, title="Per 1,000")),
    color="measure",
).properties(width=650, title="Birth rate vs. death rate in China")

---

### Export

In [19]:
# Write the rows whose year is later than "1969" to the processed-data folder.
# NOTE(review): the comparison is strict, so a row stamped exactly 1969-01-01
# would presumably be excluded — confirm the intended cutoff.
recent_mask = df["year"] > "1969"
df[recent_mask].to_csv("data/processed/china_births_deaths_rates.csv", index=False)

In [20]:
# Switch to human-readable headers for the Datawrapper export.
# This renames `df` in place, so cells above that reference the old lowercase
# names will fail if re-run after this one.
display_names = {
    "year": "Year",
    "birth_rate": "Birth rate",
    "death_rate": "Death rate",
}
df.rename(columns=display_names, inplace=True)

In [21]:
# Export only the three columns the Datawrapper chart needs, using the same
# "later than 1969" filter as the full export.
datawrapper_columns = ["Year", "Death rate", "Birth rate"]
df.loc[df["Year"] > "1969", datawrapper_columns].to_csv(
    "data/processed/china_births_deaths_rates_datawrapper.csv", index=False
)