# Read and process Xinjiang exports

### Import Python tools and Jupyter configuration

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import geopandas as gpd
import altair as alt
import numpy as np
import glob

In [3]:
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000
pd.options.display.max_colwidth = None

---

### Get commodity codes lookup

In [4]:
# china_codes = pd.read_csv(
#     "../data/raw/Commodity.csv",
#     encoding="GBK",
# )

In [5]:
code_src = pd.read_csv(
    "https://raw.githubusercontent.com/datasets/harmonized-system/master/data/harmonized-system.csv",
    dtype={"level": str},
)

In [6]:
code_src.head(10)

Unnamed: 0,section,hscode,description,parent,level
0,I,1,Animals; live,TOTAL,2
1,I,101,"Horses, asses, mules and hinnies; live",01,4
2,I,10121,"Horses; live, pure-bred breeding animals",0101,6
3,I,10129,"Horses; live, other than pure-bred breeding animals",0101,6
4,I,10130,Asses; live,0101,6
5,I,10190,Mules and hinnies; live,0101,6
6,I,102,Bovine animals; live,01,4
7,I,10221,"Cattle; live, pure-bred breeding animals",0102,6
8,I,10229,"Cattle; live, other than pure-bred breeding animals",0102,6
9,I,10231,"Buffalo; live, pure-bred breeding animals",0102,6


In [7]:
code_df = code_src[(code_src["parent"] == "TOTAL") | (code_src["level"] == "4")][
    ["hscode", "description", "level"]
]

In [8]:
code_df.rename(columns={"hscode": "code"}, inplace=True)

In [9]:
code_df

Unnamed: 0,code,description,level
0,01,Animals; live,2
1,0101,"Horses, asses, mules and hinnies; live",4
6,0102,Bovine animals; live,4
12,0103,Swine; live,4
16,0104,Sheep and goats; live,4
...,...,...,...
6701,9705,"Collections and collectors' pieces; of zoological, botanical, mineralogical, anatomical, historical, archaeological, palaeontological, ethnographic or numismatic interest",4
6703,9706,Antiques; of an age exceeding one hundred years,4
6705,99,Commodities not specified according to kind,2
6706,9999,Commodities not specified according to kind,4


---

### Read all out export CSVs

In [10]:
path = "../data/raw/exports/"
all_files = glob.glob(path + "*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(
        filename,
        encoding="GBK",
        dtype={"数据年月\t": str, "商品编码\t": str},
    )
    li.append(df)

### And concatenate them into one frame

In [11]:
raw = pd.concat(
    li,
    axis=0,
    ignore_index=True,
)

### Clean up column headers

In [12]:
raw.columns = (
    raw.columns.str.replace(r"\t", "", regex=True)
    .str.replace(" ", "_", regex=False)
    .str.lower()
)

### Remove all the tabs in the rows/columns

In [13]:
raw = raw.replace(r"\t\t", "", regex=True)

### Rename colums 

In [14]:
raw.rename(
    columns={
        "数据年月": "date_of_data",
        "商品编码": "commodity_code",
        "商品名称": "commodity_name",
        "贸易伙伴编码": "trade_partner_code",
        "贸易伙伴名称": "trade_partner_name",
        "注册地编码": "registration_place_code",
        "注册地名称": "registration_place_name",
        "第一数量": "first_quantity",
        "第一计量单位": "first_unit_of_measure",
        "第二数量": "second_quantity",
        "第二计量单位": "second_unit_of_measure",
        "美元": "us_dollar",
    },
    inplace=True,
)

### Create columns based on slices of others

In [15]:
raw["date_of_data"].head()

0    202008
1    202009
2    202010
3    202011
4    202012
Name: date_of_data, dtype: object

In [16]:
raw["year"] = raw["date_of_data"].str[:4]
raw["month"] = raw["date_of_data"].str[-2:]

In [17]:
raw["commodity_chapter"] = raw["commodity_code"].str[:2]
raw["commodity_chapter_four"] = raw["commodity_code"].str[:4]

In [18]:
raw["us_dollar"] = raw["us_dollar"].str.replace(",", "", regex=False).astype(int)

### Merge the commodity codes with our export data and remove columns we don't need

In [19]:
merge = raw.merge(code_df, left_on="commodity_chapter", right_on="code")

In [20]:
merge.drop(
    [
        "date_of_data",
        "commodity_name",
        "unnamed:_12",
        "trade_partner_code",
        "trade_partner_name",
        "registration_place_code",
        "registration_place_name",
        "code",
    ],
    axis=1,
    inplace=True,
)

In [21]:
merge.rename(columns={"description": "commodity_chapter_desc"}, inplace=True)

In [22]:
merge_df = pd.merge(
    merge,
    code_df[["code", "description"]],
    left_on="commodity_chapter_four",
    right_on="code",
)

In [23]:
merge_df.head()

Unnamed: 0,commodity_code,first_quantity,first_unit_of_measure,second_quantity,second_unit_of_measure,us_dollar,year,month,commodity_chapter,commodity_chapter_four,commodity_chapter_desc,level,code,description
0,8083020,19264,千克,0,?,30272,2020,8,8,808,"Fruit and nuts, edible; peel of citrus fruit or melons",2,808,"Apples, pears and quinces; fresh"
1,8083020,157437,千克,0,?,292621,2020,9,8,808,"Fruit and nuts, edible; peel of citrus fruit or melons",2,808,"Apples, pears and quinces; fresh"
2,8083020,59269,千克,0,?,114305,2020,10,8,808,"Fruit and nuts, edible; peel of citrus fruit or melons",2,808,"Apples, pears and quinces; fresh"
3,8083020,98042,千克,0,?,205830,2020,11,8,808,"Fruit and nuts, edible; peel of citrus fruit or melons",2,808,"Apples, pears and quinces; fresh"
4,8083020,148456,千克,0,?,318120,2020,12,8,808,"Fruit and nuts, edible; peel of citrus fruit or melons",2,808,"Apples, pears and quinces; fresh"


In [24]:
merge_df.drop(["code", "level"], axis=1, inplace=True)

In [25]:
merge_df.rename(
    columns={
        "description": "commodity_chapter_four_desc",
    },
    inplace=True,
)

### Behold, a dataframe

In [26]:
df = (
    merge_df[
        [
            "year",
            "month",
            "commodity_code",
            "commodity_chapter",
            "commodity_chapter_desc",
            "commodity_chapter_four",
            "commodity_chapter_four_desc",
            "first_unit_of_measure",
            "second_quantity",
            "second_unit_of_measure",
            "us_dollar",
        ]
    ]
    .sort_values(["month", "year"], ascending=False)
    .copy()
)

In [27]:
df.head()

Unnamed: 0,year,month,commodity_code,commodity_chapter,commodity_chapter_desc,commodity_chapter_four,commodity_chapter_four_desc,first_unit_of_measure,second_quantity,second_unit_of_measure,us_dollar
64,2021,12,9042200,9,"Coffee, tea, mate and spices",904,Pepper of the genus piper; dried or crushed or ground fruits of the genus capsicum or of the genus pimenta,千克,0,?,231210
273,2021,12,20079910,20,"Preparations of vegetables, fruit, nuts or other parts of plants",2007,"Jams, fruit jellies, marmalades, fruit or nut puree and fruit or nut pastes, being cooked preparations; whether or not containing added sugar or other sweetening matter",千克,0,?,374682
356,2021,12,21069090,21,Miscellaneous edible preparations,2106,Food preparations not elsewhere specified or included,千克,0,?,576188
400,2021,12,25301020,25,"Salt; sulphur; earths, stone; plastering materials, lime and cement",2530,Mineral substances not elsewhere specified or included,千克,0,?,638100
459,2021,12,28271090,28,"Inorganic chemicals; organic and inorganic compounds of precious metals; of rare earth metals, of radio-active elements and of isotopes",2827,Chlorides; chloride oxides and chloride hydroxides; bromides and bromide oxides; iodides and iodide oxides,千克,0,?,11600


### Export for analysis

In [28]:
df.to_csv("../data/processed/xinjiang_exports_usa_monthly.csv", index=False)