# Clean data

This notebook cleans the raw trade data downloaded from [ISED Canada](https://ised-isde.canada.ca/site/trade-data-online/en) and builds a lookup table of the [HS2](https://en.wikipedia.org/wiki/Harmonized_System) product categorization codes.

In [6]:
from lxml import html
import os
import pandas as pd

In [7]:
# Project folders one level above the working directory.
# NOTE: abspath resolves against the current working directory, so these
# depend on where the notebook is launched from.
data_dir, misc_dir = (
    os.path.abspath(os.path.join(os.pardir, folder))
    for folder in ('data', 'misc')
)

In [8]:
# Parse the saved <option> list into {section title: [code labels]}.
src = os.path.join(misc_dir, 'hs.txt')

# explicit encoding so the read does not depend on the platform default
with open(src, 'r', encoding='utf-8') as f:
    html_text = f.read()

doc = html.fromstring(html_text)
texts = doc.xpath('option/text()')

hs = {}
top = None  # most recent section title; reset so a rerun never reuses a stale one
for text in texts:

    # slicing (not indexing) so an empty string cannot raise IndexError
    if text[:1].isnumeric():
        # entries starting with a digit are codes of the current section
        if top is None:
            raise ValueError(
                f'code entry {text!r} appears before any section title in {src}'
            )
        hs[top].append(text)
    else:
        # anything else opens a new section
        top = text
        hs[text] = []

# clean out empties
hs = {k: v for k, v in hs.items() if v}

In [9]:
# Flatten the {section: [codes]} mapping into one record per (section, code).
rows = [
    {'sec': section, 'code': label}
    for section, labels in hs.items()
    for label in labels
]
codes = pd.DataFrame(rows)

# write the lookup table next to the cleaned data
dst = os.path.join(data_dir, 'HS2_codes.csv')
codes.to_csv(dst, index=False)
print('Saved:', dst)

Saved: /home/qcx201/Projects/CAtrade/data/HS2_codes.csv


In [None]:
# Load every raw report (one file per mode/year/HS2 code) and combine them
# into one wide table per mode: rows = (place, account), columns = (year, code).
raw_dir = os.path.abspath('../raw')
# listing the folder up front fails immediately if raw_dir does not exist
files = sorted(os.listdir(raw_dir))

# breakdown by province vs trade partner
# (the values mirror the naArea / countryList query parameters of the source
# report URLs; only the keys are used below, as file-name prefixes)
modes = {
    'part' : {'naArea': '9999', 'countryList': 'DET'},
    'prov' : {'naArea': '9998', 'countryList': 'ALL'},
}

# years
years = [str(x) for x in range(2000, 2025)]

# product codes (2-digit zero-padded)
codes = [str(x).zfill(2) for x in range(1, 100)]
codes = [x for x in codes if x not in ('77', '98', '99')] # remove non-valid codes

dfs = {}

for mode in modes:
    for year in years:
        for code in codes:
            
            file = f'{mode}_{year}_{code}.csv'

            src = os.path.join(raw_dir, file)
            print(src)
            
            # the column header sits on row index 6 of each report
            tmp = pd.read_csv(src, header=6)
            
            # fill missing countries
            tmp.iloc[:, 0:2] = tmp.iloc[:, 0:2].ffill()
            
            # set and rename index
            tmp = tmp.set_index(['Unnamed: 0', 'Unnamed: 1'])
            tmp.index.names = ['place', 'account']
            
            # rename columns (expects exactly one value column per file)
            tmp.columns = pd.MultiIndex.from_tuples([(year, code)])
            
            # remove rows with empty data
            tmp = tmp.dropna()
            
            # Some reports repeat a (place, account) row. An outer merge on a
            # non-unique index produces the Cartesian product of the matching
            # rows, so repeats would multiply with every further file that
            # contains them. Keep only the first occurrence per file.
            tmp = tmp[~tmp.index.duplicated(keep='first')]
            
            if mode not in dfs:
                dfs[mode] = tmp.copy()
                print('> added')
            else:
                dfs[mode] = pd.merge(dfs[mode], tmp, how='outer', left_index=True, right_index=True)
                print('> merged')

FileNotFoundError: [Errno 2] No such file or directory: '/home/qcx201/Projects/CAtrade/src/raw'

Some strange duplications occur in the source data, e.g. for Albania, year 2022, code 84. [[source data link]](https://ised-isde.canada.ca/app/ixb/tdo/runRpt.html?cssIncludes=%2Fcss%2Fcommon.css&cssIncludes=%2Fcss%2Fadd_WET_4-0_Canada_Apps.css&jsIncludes=js%2Futils.js&jsIncludes=js%2FcodeValidation.js&jsIncludes=js%2FdropdownFiltering.js&jsIncludes=js%2FcriteriaFormCodeSearch.js&jsIncludes=js%2FcriteriaFormCountryRegionState.js&jsIncludes=js%2FchangeCriteria.js&jsIncludes=js%2FselectedCodeDescriptions.js&jsIncludes=js%2Fsiteimprove.js&jsIncludes=js%2FgoogleTagManager.js&jsIncludes=js%2FsurveyPopup.js&grouped=INDIVIDUAL&searchType=BL&areaCodes=&naArea=9999&countryList=DET&toFromCountry=CDN&reportType=TB&customYears=2022&periodString=&timePeriod=%7CCustom+Years&currency=CDN&lang=&productType=HS6&hSelectedCodes=%7C84)

See entries in file: /home/qcx201/Projects/CAtrade/raw/part_2022_84.csv

```csv
"Albania","Total Exports","2077773"
"","Total Imports","957064"
"","Trade Balance","1120709"
"","Total Exports","1137478"
"","Total Imports","22765"
"","Trade Balance","1114713"
```

If you search specifically for Albania code 84, you get the first set of entries. [[source data link]](https://ised-isde.canada.ca/app/ixb/tdo/runRpt.html?cssIncludes=%2Fcss%2Fcommon.css&cssIncludes=%2Fcss%2Fadd_WET_4-0_Canada_Apps.css&jsIncludes=js%2Futils.js&jsIncludes=js%2FcodeValidation.js&jsIncludes=js%2FdropdownFiltering.js&jsIncludes=js%2FcriteriaFormCodeSearch.js&jsIncludes=js%2FcriteriaFormCountryRegionState.js&jsIncludes=js%2FchangeCriteria.js&jsIncludes=js%2FselectedCodeDescriptions.js&jsIncludes=js%2Fsiteimprove.js&jsIncludes=js%2FgoogleTagManager.js&jsIncludes=js%2FsurveyPopup.js&grouped=INDIVIDUAL&searchType=BL&areaCodes=242&naArea=9999&countryList=specific&toFromCountry=CDN&reportType=TB&customYears=2022&periodString=&timePeriod=%7CCustom+Years&currency=CDN&lang=&productType=HS6&hSelectedCodes=%7C84)

So, for now, I keep the first set of entries and drop the later duplicates.

In [None]:
for key, wide in dfs.items():

    # A (place, account) pair can appear more than once in the source
    # reports; only its first occurrence is retained.
    repeated = wide.index.duplicated(keep='first')
    by_period = wide[~repeated].T

    # Columns are now (place, account) pairs: flip them to (account, place),
    # cutting the account label at its first '.', then sort the columns.
    flipped = [(account.split('.')[0], place) for place, account in by_period.columns]
    by_period.columns = pd.MultiIndex.from_tuples(flipped)
    by_period = by_period[sorted(by_period.columns)]

    # wide -> long: move the place level into the row index and discard
    # rows that hold no values at all
    long = by_period.stack(level=1, future_stack=True).dropna(how='all', axis=0)

    # tidy labels: lower-case, no spaces, no 'total' prefix
    long.index.names = ('year', 'hs2', 'cou')
    long.columns = (
        long.columns
        .str.lower()
        .str.replace(' ', '')
        .str.replace('total', '')
    )

    dst = os.path.join(data_dir, f'CA-{key}.csv')
    long.to_csv(dst)
    print('Saved:', dst)
