# Chinese exports by year, country and commodity

### Import Python tools and Jupyter configuration

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import geopandas as gpd
import altair as alt
from datetime import timedelta
import numpy as np
import glob

In [3]:
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000
pd.options.display.max_colwidth = None

---

### Get commodity codes lookup

In [4]:
code_src = pd.read_csv(
    "https://raw.githubusercontent.com/datasets/harmonized-system/master/data/harmonized-system.csv",
    dtype={"level": str},
)

In [5]:
code_df = code_src[(code_src["parent"] == "TOTAL") | (code_src["level"] == "4")][
    ["hscode", "description", "level"]
]

In [6]:
code_df.rename(columns={"hscode": "code"}, inplace=True)

In [7]:
code_df

Unnamed: 0,code,description,level
0,1,Animals; live,2
1,101,"Horses, asses, mules and hinnies; live",4
6,102,Bovine animals; live,4
12,103,Swine; live,4
16,104,Sheep and goats; live,4


---

### Get country codes

In [8]:
countries = pd.read_csv("../data/raw/TradingPartner.csv", encoding="GBK")

In [9]:
countries.rename(
    columns={
        "CODES": "trade_partner_code",
        "DESCRIPTION": "trade_partner_name",
    },
    inplace=True,
)

### Get exporting provinces

In [10]:
places = pd.read_csv("../data/raw/TradeCoPort.csv", encoding="GBK")

In [11]:
places.rename(
    columns={
        "CODE": "registration_place_code",
        "DESCRIPTION": "registration_place_name",
    },
    inplace=True,
)

---

### Read all the export CSVs

In [12]:
path = "../data/raw/exports/all-countries/"
all_files = glob.glob(path + "*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(
        filename,
        encoding="GBK",
        dtype={"数据年月\t": str, "商品编码\t": str},
    ).assign(year=filename)
    li.append(df)

### And concatenate them into one frame

In [13]:
raw = pd.concat(
    li,
    axis=0,
    ignore_index=True,
)

### Clean up column headers

In [14]:
raw.columns = (
    raw.columns.str.replace(r"\t", "", regex=True)
    .str.replace(" ", "_", regex=False)
    .str.lower()
)

### Remove all the tabs in the rows/columns

In [15]:
raw = raw.replace(r"\t\t", "", regex=True)

### Rename colums 

In [16]:
raw.rename(
    columns={
        "商品编码": "commodity_code",
        "商品名称": "commodity_name",
        "贸易伙伴编码": "trade_partner_code",
        "贸易伙伴名称": "trade_partner_name",
        "注册地编码": "registration_place_code",
        "注册地名称": "registration_place_name",
        "第一数量": "first_quantity",
        "第一计量单位": "first_unit_of_measure",
        "第二数量": "second_quantity",
        "第二计量单位": "second_unit_of_measure",
        "美元": "us_dollar",
    },
    inplace=True,
)

In [17]:
raw["year"] = (
    raw["year"]
    .str.replace(
        "../data/raw/exports/all-countries/XJ exports to all countries", "", regex=False
    )
    .str.replace("Chinese raw .csv", "", regex=False)
    .str.replace(
        "../data/raw/exports/all-countries/XJ exports to all countries", "", regex=False
    )
    .str.replace("Chinese raw.csv", "", regex=False)
)

### Create commodity code columns based on slices of others

In [18]:
raw["commodity_chapter"] = raw["commodity_code"].str[:2]
raw["commodity_chapter_four"] = raw["commodity_code"].str[:4]

In [19]:
raw["us_dollar"] = raw["us_dollar"].str.replace(",", "", regex=False).astype(int)

In [20]:
raw.head()

Unnamed: 0,registration_place_code,registration_place_name,commodity_code,commodity_name,trade_partner_code,trade_partner_name,first_quantity,first_unit_of_measure,second_quantity,second_unit_of_measure,us_dollar,unnamed:_11,year,commodity_chapter,commodity_chapter_four
0,65,新疆维吾尔自治区,2071419,其他冻鸡块,146,吉尔吉斯斯坦,144000,千克,0,?,259200,,2019,2,207
1,65,新疆维吾尔自治区,2074500,冻的鸭块及杂碎,146,吉尔吉斯斯坦,1615000,千克,0,?,3584460,,2019,2,207
2,65,新疆维吾尔自治区,2074500,冻的鸭块及杂碎,147,塔吉克斯坦,2686000,千克,0,?,3601100,,2019,2,207
3,65,新疆维吾尔自治区,2074500,冻的鸭块及杂碎,149,乌兹别克斯坦,44000,千克,0,?,82280,,2019,2,207
4,65,新疆维吾尔自治区,3011100,淡水观赏鱼,145,哈萨克斯坦,113,千克,0,?,1628,,2019,3,301


### Merge the commodity codes with our export data and remove columns we don't need

In [21]:
merge = raw.merge(code_df, left_on="commodity_chapter", right_on="code")

In [22]:
merge.head(1)

Unnamed: 0,registration_place_code,registration_place_name,commodity_code,commodity_name,trade_partner_code,trade_partner_name,first_quantity,first_unit_of_measure,second_quantity,second_unit_of_measure,us_dollar,unnamed:_11,year,commodity_chapter,commodity_chapter_four,code,description,level
0,65,新疆维吾尔自治区,2071419,其他冻鸡块,146,吉尔吉斯斯坦,144000,千克,0,?,259200,,2019,2,207,2,Meat and edible meat offal,2


In [23]:
merge.drop(
    [
        "commodity_name",
        "unnamed:_11",
        "code",
    ],
    axis=1,
    inplace=True,
)

In [24]:
merge.rename(columns={"description": "commodity_chapter_desc"}, inplace=True)

In [25]:
merge_df = pd.merge(
    merge,
    code_df[["code", "description"]],
    left_on="commodity_chapter_four",
    right_on="code",
)

In [26]:
merge_df.drop(["code", "level"], axis=1, inplace=True)

In [27]:
merge_df.rename(
    columns={
        "description": "commodity_chapter_four_desc",
    },
    inplace=True,
)

### Behold, a dataframe

In [28]:
merged_df = merge_df[
    [
        "year",
        "registration_place_code",
        "registration_place_name",
        "trade_partner_code",
        "trade_partner_name",
        "commodity_code",
        "commodity_chapter",
        "commodity_chapter_desc",
        "commodity_chapter_four",
        "commodity_chapter_four_desc",
        "first_unit_of_measure",
        "second_quantity",
        "second_unit_of_measure",
        "us_dollar",
    ]
].copy()

In [29]:
df = pd.merge(merged_df, countries, on=["trade_partner_code"])

In [30]:
df = pd.merge(df, places, on=["registration_place_code"])

In [31]:
df.drop(
    [
        "registration_place_name_x",
        "trade_partner_code",
        "registration_place_code",
        "trade_partner_name_x",
        "first_unit_of_measure",
        "second_quantity",
        "second_unit_of_measure",
    ],
    axis=1,
    inplace=True,
)
df.rename(
    columns={
        "trade_partner_name_y": "exported_to",
        "registration_place_name_y": "exported_from",
    },
    inplace=True,
)

In [32]:
df["exported_to"] = df["exported_to"].str.replace(
    "Russian Federation", "Russia", regex=False
)
df["exported_to"] = df["exported_to"].str.replace(
    "Korea Rep.", "South Korea", regex=False
)

df["exported_to"] = df["exported_to"].str.replace(
    "United States of America", "United States", regex=False
)
df["exported_to"] = df["exported_to"].str.replace(
    "Syria Arab Republic", "Syria", regex=False
)

---

### Export

In [33]:
df.to_csv("../data/processed/exports_xingjiang_all_countries.csv", index=False)