In [1]:
# import sys
# !{sys.executable} -m pip install --upgrade pip
# !{sys.executable} -m pip install --upgrade watermark
# !{sys.executable} -m pip install --upgrade requests

## Constants
### Reference: [EmojiDB](https://emojidb.org/) and [Harmonized System (HS) Products](https://oec.world/en/product-landing/hs#5)

In [2]:
# One row per HS section: (first chapter, section id, section name, emoji).
# Rows must stay in ascending chapter order: a section covers every chapter from
# its own first chapter up to (but not including) the next row's first chapter,
# e.g. chapters 1 to 5 belong to section 'I'.
_HS_SECTIONS = [
    (1, 'I', 'Animal Products', '🐄'),
    (6, 'II', 'Vegetable Products', '🍅'),
    (15, 'III', 'Animal and Vegetable Bi-Products', '🧃'),
    (16, 'IV', 'Foodstuffs', '🌽'),
    (25, 'V', 'Mineral Products', '⛏️'),
    (28, 'VI', 'Chemical Products', '🧪'),
    (39, 'VII', 'Plastics and Rubbers', '♻️'),
    (41, 'VIII', 'Animal Hides', '👜'),
    (44, 'IX', 'Wood Products', '🪵'),
    (47, 'X', 'Paper Goods', '📖'),
    (50, 'XI', 'Textiles', '👕'),
    (64, 'XII', 'Footwear and Headwear', '👟'),
    (68, 'XIII', 'Stone And Glass', '🪨'),
    (71, 'XIV', 'Precious Metals', '💎'),
    (72, 'XV', 'Metals', '🔩'),
    (84, 'XVI', 'Machines', '⚙️'),
    (86, 'XVII', 'Transportation', '🚌'),
    (90, 'XVIII', 'Instruments', '🎻'),
    (93, 'XIX', 'Weapons', '💥'),
    (94, 'XX', 'Miscellaneous', '🎒'),
    (97, 'XXI', 'Arts and Antiques', '🎭'),
    (99, 'XXII', 'Unspecified', '🤷'),
]

# Parallel lists derived from the table above (same positions in every list).
# CHAPTER_START gets a trailing dummy value of 100 so that the end chapter of the
# last section can be computed the same way as for every other section.
CHAPTER_START = [first_chapter for first_chapter, _, _, _ in _HS_SECTIONS] + [100]
SECTION_ID = [section_id for _, section_id, _, _ in _HS_SECTIONS]
SECTION_NAME = [section_name for _, _, section_name, _ in _HS_SECTIONS]
SECTION_EMOJI = [section_emoji for _, _, _, section_emoji in _HS_SECTIONS]

# HS category code - refers to the category in the reference data, H0 to H6 and HS
HS_CATEGORY = 'HS'
# Local cache file for the downloaded reference data
CACHE_FILE = f'cach/{HS_CATEGORY}.json'
# Comtrade HS category URL
COMTRADE_HS_URL = f'https://comtradeapi.un.org/files/v1/app/reference/{HS_CATEGORY}.json'

## Load data from cache or download

In [3]:
import json
import requests
from pathlib import Path
from typing import Dict, Any

def load_and_cache_json(file_name:str, url:str) -> Dict[str, Any] | None:
    """
    Checks for a local JSON file. If found, loads it.
    If not found, downloads the data from the URL, saves it locally, and then returns the data.

    Args:
        file_name: The name of the local file to check/save.
        url: The URL to download the JSON data from.

    Returns:
        The loaded JSON data as a dictionary, or None if an error occurred.
    """
    local_path = Path(file_name)
    data = None

    # 1. Check for local file
    if local_path.is_file():
        print(f"✅ Found local file: '{file_name}'. Loading from cache...")
        try:
            with open(local_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            print("✨ Successfully loaded data from cache.")
            return data

        except (json.JSONDecodeError, IOError) as e:
            # If the file is corrupted, data remains None, and we proceed to download
            print(f"❌ Error reading/decoding local file: {e}. Attempting download...")

    # Download from URL (if local file was not found, or if corrupted/expired)
    if data is None:
        print(f"🌐 Downloading data from URL: {url}...")
        try:
            response = requests.get(url, timeout=10)
            # Raise an exception for bad status codes (4xx or 5xx)
            response.raise_for_status() 

            downloaded_data = response.json()
            # Cache the downloaded data locally for future use
            with open(local_path, 'w', encoding='utf-8') as f:
                json.dump(downloaded_data, f, indent=4)
            
            print(f"💾 Download successful and saved to '{file_name}'.")
            return downloaded_data

        except requests.exceptions.RequestException as e:
            print(f"❌ Failed to download data from URL: {e}")
        except json.JSONDecodeError as e:
            print(f"❌ Failed to decode JSON from response: {e}")
        except IOError as e:
            print(f"❌ Failed to save file locally: {e}")

    return None

In [4]:
import pandas as pd

# Load the HS reference data from the cache if present, otherwise download it
data = load_and_cache_json(file_name=CACHE_FILE, url=COMTRADE_HS_URL)
# Every later cell needs this data, so stop here if it could not be loaded
assert data is not None, '❌ data must be present'

# Create a DF from the list stored under the 'results' key of the JSON data
hs_df = pd.DataFrame(data['results'])
hs_df.head(10)

🌐 Downloading data from URL: https://comtradeapi.un.org/files/v1/app/reference/HS.json...
❌ Failed to save file locally: [Errno 2] No such file or directory: 'cach/HS.json'


AssertionError: ❌ data must be present

## Clean up the DF

In [None]:
import re

# Pattern to match one or more non-alpha characters at the start
pattern = r'^[^a-zA-Z]+'

# Clean-up in one chained expression, without in-place mutation, so the cell can
# be re-run safely:
#   1. drop the row(s) with id TOTAL
#   2. drop the standardUnitAbbr column, which is not needed
#      (errors='ignore' keeps a second run from failing once it is gone)
#   3. strip the leading non-alpha characters from text
hs_df = (
    hs_df
    .loc[hs_df['id'] != 'TOTAL']
    .drop(columns='standardUnitAbbr', errors='ignore')
    .assign(text=lambda frame: frame['text'].str.replace(pattern, '', regex=True))
)
hs_df.head()

## Method to get the Chapter range for a Section

In [None]:
def get_chapter_range_for_section(section:str) -> tuple[int, int]:
    """
    Look up the first and last chapter numbers of a section.

    The section at position `pos` in SECTION_ID starts at CHAPTER_START[pos]
    and ends one chapter before CHAPTER_START[pos + 1]. This relies on
    CHAPTER_START holding one more entry than SECTION_ID (the trailing dummy
    value appended to it).

    Args:
        section: The section id, e.g. 'I', 'II'.

    Returns:
        (start, end) chapter numbers, both inclusive, or (-1, -1) when the
        section id is not in SECTION_ID (the lookup error is printed).
    """
    try:
        pos = SECTION_ID.index(section)
    except ValueError as e:
        # Unknown section id: report it and return the invalid-range marker
        print(e)
        return (-1, -1)
    return (CHAPTER_START[pos], CHAPTER_START[pos+1] - 1)

## Method to get Chapter details

In [None]:
def get_chapters_for_section(section:str, df:pd.DataFrame) -> Dict[str, str] | None:
    """
    Returns chapters for a Section if it is found

    Args:
        section (str): the section name, e.g. I, II etc.
        df (DataFrame): the DF containing HS data

    Returns:
        chapters as a dictionary with id as the key and description as value,
        or None if an error occurred.
    """    
    # Get start and end chapters
    start, end = get_chapter_range_for_section(section=section)
    # check for -1 for invalid chapters
    assert start > 0, '❌ invalid chapter'

    # List of chapters as HS codes
    chapter_hs_codes = []
    
    # Add 1 to end to include the end chapter
    for i in range(start, end+1):
        chapter_hs_codes.append(f'{i:02d}')

    if len(chapter_hs_codes) > 0:
        # Only include the chapters within start and end ranges
        result_df = df[df['id'].isin(chapter_hs_codes)][['id', 'text']]
        return dict(zip(result_df['id'], result_df['text']))
    return None

## Lambda functions to format HS codes

In [None]:
# Heading code: first two characters, a dot, then the rest -> 12.34
fmt_headings = lambda s: '.'.join((s[:2], s[2:]))
# Sub heading code: characters 1-2, 3-4 and the rest, dot separated -> 12.34.56
fmt_sub_headings = lambda s: '.'.join((s[:2], s[2:4], s[4:]))

## Lambda functions to display colours
### Reference: [Print Colors in Python terminal](https://www.geeksforgeeks.org/python/print-colors-python-terminal/)

In [None]:
# Escape-sequence templates: switch the colour on, print the text after one
# space, then reset the terminal attributes
_PURPLE = "\033[95m {}\033[00m"
_CYAN = "\033[96m {}\033[00m"
_LIGHT_PURPLE = "\033[94m {}\033[00m"
_GREEN = "\033[92m {}\033[00m"

# Purple for Sections
pr_section = lambda s: print(_PURPLE.format(s))
# Cyan for Chapters
pr_chapter = lambda s: print(_CYAN.format(s))
# Light purple for Headings
pr_heading = lambda s: print(_LIGHT_PURPLE.format(s))
# Green for sub headings
pr_sub_heading = lambda s: print(_GREEN.format(s))

## Display Section and children details

In [None]:
def display_section_details(section:str, df:pd.DataFrame) -> None:
    """
    Print a Section's title line followed by one line per chapter.

    Args:
        section (str): the section id, e.g. I, II etc.
        df (DataFrame): the DF containing HS data

    Raises:
        ValueError: when section is not in SECTION_ID.
    """
    # list.index raises ValueError for an unknown section id
    idx = SECTION_ID.index(section)
    # The name and emoji sit at the same position as the section id
    title = f'Section {section} - {SECTION_NAME[idx]} {SECTION_EMOJI[idx]}'
    pr_section(title)

    # One coloured line per chapter: "<id>: <description>"
    chapter_map = get_chapters_for_section(section=section, df=df)
    for code, description in chapter_map.items():
        pr_chapter(f"{code}: {description}")

In [None]:
# Print the title line of Section II and the chapters found for it in hs_df
display_section_details(section='II', df=hs_df)

## Display Chapter and children details

In [None]:
def display_chapter_details(chapter:str, df:pd.DataFrame) -> None:
    """
    Print a chapter line followed by one line per heading under that chapter.

    Args:
        chapter (str): the chapter id to look up in the 'id' column, e.g. '01'
        df (DataFrame): the DF containing HS data; the 'id', 'text', 'parent'
            and 'aggrLevel' columns are read

    Raises:
        IndexError: when no row of df has the given chapter id.
    """
    # Chapter description: text of the first row whose id equals the chapter
    chapter_text = df.loc[df['id'] == chapter, 'text'].iloc[0]
    pr_chapter(f'{chapter} - {chapter_text}')

    # Headings: rows whose parent is this chapter at aggregation level 4
    is_child_heading = (df['parent'] == chapter) & (df['aggrLevel'] == 4)
    headings = df.loc[is_child_heading, ['id', 'text']]
    for code, description in dict(zip(headings['id'], headings['text'])).items():
        pr_heading(f'{fmt_headings(code)}: {description}')

In [None]:
# Show chapter '01' and the headings (aggrLevel 4 rows) listed under it in hs_df
display_chapter_details(chapter='01', df=hs_df)

In [None]:
# Same view for chapter '16': the chapter line followed by its headings
display_chapter_details(chapter='16', df=hs_df)

## Display Heading and children details

In [None]:
def display_heading_details(heading:str, df:pd.DataFrame) -> None:
    """
    Print a heading line followed by one line per sub heading under it.

    Args:
        heading (str): the heading id to look up in the 'id' column, e.g. '0101'
        df (DataFrame): the DF containing HS data; the 'id', 'text', 'parent'
            and 'aggrLevel' columns are read

    Raises:
        IndexError: when no row of df has the given heading id.
    """
    # Heading HS code (formatted as 12.34) + description of the first matching row
    heading_text = df.loc[df['id'] == heading, 'text'].iloc[0]
    pr_heading(f'{fmt_headings(heading)} - {heading_text}')

    # Sub headings: rows whose parent is this heading at aggregation level 6
    is_child_sub_heading = (df['parent'] == heading) & (df['aggrLevel'] == 6)
    children = df.loc[is_child_sub_heading, ['id', 'text']]
    for code, description in dict(zip(children['id'], children['text'])).items():
        pr_sub_heading(f'{fmt_sub_headings(code)} - {description}')

In [None]:
# Show heading '0101' and the sub headings (aggrLevel 6 rows) listed under it in hs_df
display_heading_details(heading='0101', df=hs_df)

# A utility method to display the Section hierarchy

In [None]:
def display_section_hierarchy(section:str, df: pd.DataFrame) -> None:
    """
    Print a Section with its chapters, headings and sub headings as an indented tree.

    Chapters are indented by 2 spaces, headings by 4 and sub headings by 6.

    Args:
        section (str): the section id, e.g. I, II etc.
        df (DataFrame): the DF containing HS data; the 'id', 'text', 'parent'
            and 'aggrLevel' columns are read

    Raises:
        ValueError: when section is not in SECTION_ID.
    """
    def description_of(code):
        # Text of the first row whose id equals the given code
        return df.loc[df['id'] == code, 'text'].iloc[0]

    def children_of(parent_code, level):
        # id -> text for the rows under parent_code at the given aggregation level
        subset = df.loc[(df['parent'] == parent_code) & (df['aggrLevel'] == level), ['id', 'text']]
        return dict(zip(subset['id'], subset['text']))

    indent_chapter, indent_heading, indent_sub_heading = (width * ' ' for width in (2, 4, 6))

    # Section title: the name and emoji sit at the same position as the section id
    idx = SECTION_ID.index(section)
    pr_section(f'Section {section} - {SECTION_NAME[idx]} {SECTION_EMOJI[idx]}')

    for chapter_code, _ in get_chapters_for_section(section=section, df=df).items():
        # Chapter HS code + description
        pr_chapter(f'{indent_chapter}{chapter_code} - {description_of(chapter_code)}')

        for heading_code, _ in children_of(chapter_code, 4).items():
            # Heading HS code + description
            pr_heading(f'{indent_heading}{fmt_headings(heading_code)} - {description_of(heading_code)}')

            for sub_code, sub_text in children_of(heading_code, 6).items():
                # Sub heading HS code + description
                pr_sub_heading(f'{indent_sub_heading}{fmt_sub_headings(sub_code)} - {sub_text}')

In [None]:
# Print the full tree for Section V: chapters, their headings and sub headings
display_section_hierarchy(section='V', df=hs_df)

## Display Modules

In [None]:
from watermark import watermark

# Load the watermark IPython extension so the %watermark magic is available,
# then run it with --iversions (presumably lists the versions of the imported
# modules - confirm against the watermark documentation)
%load_ext watermark
%watermark --iversions