# Chinese exports by year, country and commodity

### Import Python tools and Jupyter configuration

In [2]:
%load_ext lab_black

In [3]:
import pandas as pd
import geopandas as gpd
import altair as alt
from datetime import timedelta
import numpy as np
import glob

In [4]:
# Raise pandas' display limits: show up to 1000 columns and 1000 rows, and do
# not truncate the text inside a column (max_colwidth=None).
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)

---

### Get commodity codes lookup

In [5]:
# Download the Harmonized System (HS) commodity code table.
# Every code-bearing column is read as text explicitly. HS codes can carry
# leading zeros (e.g. "01"); if pandas inferred a numeric dtype those zeros
# would be lost and the later string-keyed merge against the export data
# would no longer line up.
code_src = pd.read_csv(
    "https://raw.githubusercontent.com/datasets/harmonized-system/master/data/harmonized-system.csv",
    dtype={"hscode": str, "parent": str, "level": str},
)

In [6]:
# Keep the rows whose parent is "TOTAL" together with the rows at level "4",
# and only the three columns used by the lookup.
is_child_of_total = code_src["parent"] == "TOTAL"
is_level_four = code_src["level"] == "4"
code_df = code_src.loc[
    is_child_of_total | is_level_four, ["hscode", "description", "level"]
]

In [7]:
# Refer to the HS code column simply as "code" from here on.
code_df = code_df.rename(columns={"hscode": "code"})

---

### Get country codes

In [35]:
# Load the trade-partner lookup table, decoding the file as GBK.
countries = pd.read_csv(
    filepath_or_buffer="../data/raw/TradingPartner.csv",
    encoding="GBK",
)

In [36]:
# Give the lookup's generic headers the trade-partner names used by the
# export data.
trade_partner_headers = {
    "CODES": "trade_partner_code",
    "DESCRIPTION": "trade_partner_name",
}
countries = countries.rename(columns=trade_partner_headers)

### Get exporting provinces

In [37]:
# Load the registration-place lookup table, decoding the file as GBK.
places = pd.read_csv(
    filepath_or_buffer="../data/raw/TradeCoPort.csv",
    encoding="GBK",
)

In [38]:
# Give the lookup's generic headers the registration-place names used by the
# export data.
registration_place_headers = {
    "CODE": "registration_place_code",
    "DESCRIPTION": "registration_place_name",
}
places = places.rename(columns=registration_place_headers)

---

### Read all the export CSVs

In [11]:
# Read every export CSV in the folder. Each frame is tagged with the path of
# the file it came from in a "year" column (the year is parsed out of that
# path in a later cell).
path = "../data/raw/exports/all-countries/"

# glob returns matches in arbitrary, OS-dependent order; sort them so the row
# order of the combined data is identical on every run and machine.
all_files = sorted(glob.glob(path + "*.csv"))

li = [
    pd.read_csv(
        export_file,
        encoding="GBK",
        # The raw headers end with a tab character. Read the date and the
        # commodity code as text instead of letting pandas infer numbers.
        dtype={"数据年月\t": str, "商品编码\t": str},
    ).assign(year=export_file)
    for export_file in all_files
]

### And concatenate them into one frame

In [12]:
# Stack the per-file frames on top of each other (row-wise is the default)
# and renumber the rows from zero.
raw = pd.concat(objs=li, ignore_index=True)

### Clean up column headers

In [13]:
# Normalise the headers in three passes: remove tab characters, turn spaces
# into underscores, then lower-case everything.
headers_without_tabs = raw.columns.str.replace(r"\t", "", regex=True)
headers_underscored = headers_without_tabs.str.replace(" ", "_", regex=False)
raw.columns = headers_underscored.str.lower()

### Remove the doubled tab characters from the cell values

In [14]:
# Delete every pair of consecutive tab characters found inside string cells;
# the pattern is applied as a regular expression across the whole frame.
raw = raw.replace(to_replace=r"\t\t", value="", regex=True)

### Rename columns

In [15]:
# Map the Chinese source headers to the English names used in the rest of
# the notebook.
chinese_to_english_headers = {
    "数据年月": "date_of_data",
    "商品编码": "commodity_code",
    "商品名称": "commodity_name",
    "贸易伙伴编码": "trade_partner_code",
    "贸易伙伴名称": "trade_partner_name",
    "注册地编码": "registration_place_code",
    "注册地名称": "registration_place_name",
    "第一数量": "first_quantity",
    "第一计量单位": "first_unit_of_measure",
    "第二数量": "second_quantity",
    "第二计量单位": "second_unit_of_measure",
    "美元": "us_dollar",
}
raw = raw.rename(columns=chinese_to_english_headers)

In [16]:
# "year" currently holds the path of the file each row was read from.
# Remove the fixed folder + file-name prefix, then the trailing
# "Chinese raw .csv" / "Chinese raw.csv" suffix (either spelling, with or
# without the space), so that only the remainder of the file name is left.
# The prefix is stripped once: after the first pass no occurrence remains, so
# a repeated replacement would be a no-op.
export_file_prefix = "../data/raw/exports/all-countries/XJ exports to all countries"
raw["year"] = (
    raw["year"]
    .str.replace(export_file_prefix, "", regex=False)
    .str.replace("Chinese raw .csv", "", regex=False)
    .str.replace("Chinese raw.csv", "", regex=False)
)

### Create commodity code columns based on slices of others

In [17]:
# Derive two prefix columns from the commodity code: its first two characters
# and its first four characters.
commodity_codes = raw["commodity_code"]
raw["commodity_chapter"] = commodity_codes.str.slice(stop=2)
raw["commodity_chapter_four"] = commodity_codes.str.slice(stop=4)

In [18]:
# The dollar column is text that may contain comma separators. Remove the
# commas and convert to an explicit 64-bit integer: plain `int` resolves to a
# 32-bit type on some platforms (e.g. Windows with NumPy < 2), which cannot
# hold values above roughly 2.1 billion.
raw["us_dollar"] = raw["us_dollar"].str.replace(",", "", regex=False).astype("int64")

### Merge the commodity codes with our export data and remove columns we don't need

In [19]:
# Attach the HS lookup to each export row by matching the row's two-character
# chapter against the lookup's code. This is pandas' default inner join, so
# only rows with a match on both sides are kept.
merge = pd.merge(
    left=raw,
    right=code_df,
    left_on="commodity_chapter",
    right_on="code",
)

In [20]:
# Show the first joined row as a quick sanity check.
merge.head(n=1)

Unnamed: 0,registration_place_code,registration_place_name,trade_partner_code,trade_partner_name,commodity_code,commodity_name,first_quantity,first_unit_of_measure,second_quantity,second_unit_of_measure,us_dollar,unnamed:_11,year,commodity_chapter,commodity_chapter_four,code,description,level
0,65,新疆维吾尔自治区,146,吉尔吉斯斯坦,2074500,冻的鸭块及杂碎,864000,千克,0,?,1113600,,2020,2,207,2,Meat and edible meat offal,2


In [21]:
# Discard the columns that are not carried forward after the join.
columns_to_discard = ["commodity_name", "unnamed:_11", "code"]
merge = merge.drop(columns=columns_to_discard)

In [22]:
# Make it explicit that the description belongs to the commodity chapter.
merge = merge.rename(columns={"description": "commodity_chapter_desc"})

In [23]:
# Preview the first five rows after the drop and rename.
merge.head(n=5)

Unnamed: 0,registration_place_code,registration_place_name,trade_partner_code,trade_partner_name,commodity_code,first_quantity,first_unit_of_measure,second_quantity,second_unit_of_measure,us_dollar,year,commodity_chapter,commodity_chapter_four,commodity_chapter_desc,level
0,65,新疆维吾尔自治区,146,吉尔吉斯斯坦,2074500,864000,千克,0,?,1113600,2020,2,207,Meat and edible meat offal,2
1,65,新疆维吾尔自治区,146,吉尔吉斯斯坦,2074500,372000,千克,0,?,717456,2021,2,207,Meat and edible meat offal,2
2,65,新疆维吾尔自治区,116,日本,3038990,163180,千克,0,?,453681,2020,3,303,"Fish and crustaceans, molluscs and other aquatic invertebrates",2
3,65,新疆维吾尔自治区,116,日本,3038990,5000,千克,0,?,15718,2021,3,303,"Fish and crustaceans, molluscs and other aquatic invertebrates",2
4,65,新疆维吾尔自治区,330,瑞典,3039100,6000,千克,0,?,150242,2021,3,303,"Fish and crustaceans, molluscs and other aquatic invertebrates",2


### Behold, a dataframe

In [24]:
# Choose the columns to carry forward, in the order they should appear, and
# take an independent copy of that selection.
kept_columns = [
    "year",
    "registration_place_code",
    "registration_place_name",
    "trade_partner_code",
    "trade_partner_name",
    "commodity_code",
    "commodity_chapter",
    "commodity_chapter_four",
    "commodity_chapter_desc",
    "first_unit_of_measure",
    "second_quantity",
    "second_unit_of_measure",
    "us_dollar",
]
merged_df = merge.loc[:, kept_columns].copy()

In [25]:
# Join the trade-partner lookup on its code (default inner join). Both frames
# have a "trade_partner_name" column, so pandas suffixes them with _x (export
# data) and _y (lookup).
df = merged_df.merge(countries, on=["trade_partner_code"])

In [26]:
# Join the registration-place lookup on its code (default inner join). The
# shared "registration_place_name" column is suffixed _x / _y by pandas.
df = df.merge(places, on=["registration_place_code"])

In [27]:
# Preview the first five rows after both lookup joins.
df.head(n=5)

Unnamed: 0,year,registration_place_code,registration_place_name_x,trade_partner_code,trade_partner_name_x,commodity_code,commodity_chapter,commodity_chapter_four,commodity_chapter_desc,first_unit_of_measure,second_quantity,second_unit_of_measure,us_dollar,trade_partner_name_y,registration_place_name_y
0,2020,65,新疆维吾尔自治区,146,吉尔吉斯斯坦,2074500,2,207,Meat and edible meat offal,千克,0,?,1113600,Kyrgyzstan,Xingjiang Uygur
1,2021,65,新疆维吾尔自治区,146,吉尔吉斯斯坦,2074500,2,207,Meat and edible meat offal,千克,0,?,717456,Kyrgyzstan,Xingjiang Uygur
2,2020,65,新疆维吾尔自治区,146,吉尔吉斯斯坦,7019000,7,701,Vegetables and certain roots and tubers; edible,千克,0,?,8760,Kyrgyzstan,Xingjiang Uygur
3,2020,65,新疆维吾尔自治区,146,吉尔吉斯斯坦,7020000,7,702,Vegetables and certain roots and tubers; edible,千克,0,?,73121,Kyrgyzstan,Xingjiang Uygur
4,2020,65,新疆维吾尔自治区,146,吉尔吉斯斯坦,7032010,7,703,Vegetables and certain roots and tubers; edible,千克,0,?,3206533,Kyrgyzstan,Xingjiang Uygur


In [28]:
# Remove the Chinese-language name columns (_x), the join keys, and the
# unit/quantity columns; none of them are part of the exported table.
columns_to_discard = [
    "registration_place_name_x",
    "trade_partner_name_x",
    "trade_partner_code",
    "registration_place_code",
    "first_unit_of_measure",
    "second_quantity",
    "second_unit_of_measure",
]
df = df.drop(columns=columns_to_discard)

In [29]:
# Give the lookup-provided name columns their final, reader-friendly names.
final_headers = {
    "trade_partner_name_y": "exported_to",
    "registration_place_name_y": "exported_from",
}
df = df.rename(columns=final_headers)

In [30]:
# Preview the first five rows of the final table.
df.head(n=5)

Unnamed: 0,year,commodity_code,commodity_chapter,commodity_chapter_four,commodity_chapter_desc,us_dollar,exported_to,exported_from
0,2020,2074500,2,207,Meat and edible meat offal,1113600,Kyrgyzstan,Xingjiang Uygur
1,2021,2074500,2,207,Meat and edible meat offal,717456,Kyrgyzstan,Xingjiang Uygur
2,2020,7019000,7,701,Vegetables and certain roots and tubers; edible,8760,Kyrgyzstan,Xingjiang Uygur
3,2020,7020000,7,702,Vegetables and certain roots and tubers; edible,73121,Kyrgyzstan,Xingjiang Uygur
4,2020,7032010,7,703,Vegetables and certain roots and tubers; edible,3206533,Kyrgyzstan,Xingjiang Uygur


---

### Export

In [31]:
# Write the final table to the processed-data folder, without the row index.
df.to_csv(
    path_or_buf="../data/processed/exports_xingjiang_all_countries.csv",
    index=False,
)