In [None]:
#| default_exp 05_map-amazon-meta-from-dump

In [None]:
#| hide
from nbdev.showdoc import *
import nbdev; nbdev.nbdev_export()

In [None]:
#| export
import json, gzip, argparse, os, pickle, numpy as np, uuid, ssl, scipy.sparse as sp, requests, mimetypes
from timeit import default_timer as timer
from fastdownload import download_url
from multiprocessing import Pool
from functools import partial
from tqdm.auto import tqdm
from pathlib import Path
from PIL import Image

from sugar.core import *

# NOTE(review): this swaps the stdlib's default HTTPS context factory for the *unverified* one,
# so certificate verification is skipped for stdlib HTTPS clients in this process. Because the cell
# is exported, the override also takes effect for anything importing the generated module — confirm
# this is intended rather than a local workaround.
ssl._create_default_https_context = ssl._create_unverified_context

## Load data

In [None]:
#| export
def default_condition(o):
    """Accept every record: always returns True regardless of `o`."""
    return True

def condition_a23(o, ids, key):
    """Return whether the value stored under `key` in record `o` is contained in `ids`."""
    value = o[key]
    return value in ids

def get_condition(data_dir, dtype, key):
    """Build the record filter selected by `dtype`.

    Args:
        data_dir: dataset directory; `{data_dir}/raw_data` is read only for `dtype == 'a23'`.
        dtype: `None` for no filtering, or `'a23'` to keep records whose `key` is a known id.
        key: record field compared against the loaded ids.

    Returns:
        A callable taking one record and returning a truthy value when it should be kept.

    Raises:
        ValueError: if `dtype` is neither `None` nor `'a23'`.
    """
    # Validate first so that an unfiltered or invalid request never pays for (or fails on)
    # loading the id list.
    if dtype is None:
        return default_condition
    if dtype != 'a23':
        raise ValueError(f'Invalid condition: {dtype}')

    all_ids = get_all_ids(f'{data_dir}/raw_data', encoding='latin-1')
    # NOTE(review): membership is tested once per record; if `get_all_ids` returns a list,
    # converting it to a set here would speed this up a lot — confirm its return type.
    return partial(condition_a23, ids=all_ids, key=key)
        

In [None]:
#| export
def read_jsongz(fname, condition=default_condition):
    """Read a gzip-compressed JSON-lines file and return the records accepted by `condition`.

    Args:
        fname: path to a `.json.gz` file with one JSON document per line.
        condition: callable applied to each parsed record; falsy results drop the record.

    Returns:
        List of parsed records, in file order.
    """
    records = []
    with gzip.open(fname, 'rt', encoding='utf-8') as file:
        for line in file:
            # Tolerate blank lines (e.g. a trailing newline) instead of failing in json.loads.
            if not line.strip(): continue
            # Parse each line once and reuse the result for both the check and the output.
            record = json.loads(line)
            if condition(record): records.append(record)
    return records

def read_jsonl(fname, condition=default_condition):
    """Read a JSON-lines file and return the records accepted by `condition`.

    Args:
        fname: path to a `.jsonl` file with one JSON document per line.
        condition: callable applied to each parsed record; falsy results drop the record.

    Returns:
        List of parsed records, in file order.
    """
    records = []
    # Explicit UTF-8 keeps decoding independent of the platform locale and matches the
    # gzip reader in this notebook.
    with open(fname, 'r', encoding='utf-8') as file:
        for line in file:
            # Tolerate blank lines (e.g. a trailing newline) instead of failing in json.loads.
            if not line.strip(): continue
            # Parse each line once and reuse the result for both the check and the output.
            record = json.loads(line)
            if condition(record): records.append(record)
    return records
        

In [None]:
#| export
def read_file(fname, condition=default_condition):
    """Dispatch to the reader matching the extension of `fname`.

    `.jsonl` files go to `read_jsonl`, `.json.gz` files to `read_jsongz`;
    any other name raises `ValueError`.
    """
    if fname.endswith('.json.gz'):
        return read_jsongz(fname, condition)
    if fname.endswith('.jsonl'):
        return read_jsonl(fname, condition)
    raise ValueError(f'Invalid file: {fname}')
        

In [None]:
fname = '/home/scai/phd/aiz218323/scratch/datasets/amazon/dumps/raw/meta_categories/meta_All_Beauty.jsonl'
items = read_file(fname)

In [None]:
#| export
def get_items(cache_dir, condition=default_condition):
    """Read every `.jsonl` / `.json.gz` file directly inside `cache_dir` and pool the kept records.

    Files are visited in `os.listdir` order; records from each file are appended to one list.
    """
    dump_exts = ('.jsonl', '.json.gz')
    paths = [f'{cache_dir}/{entry}' for entry in os.listdir(cache_dir) if entry.endswith(dump_exts)]

    collected = []
    for path in tqdm(paths):
        collected += read_file(path, condition)
    return collected
    

In [None]:
cache_dir = '/home/scai/phd/aiz218323/scratch/datasets/amazon/dumps/raw/meta_categories/'
items = get_items(cache_dir)

  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
#| export
def load_items(cache_dir:str, data_dir:str, key:str, condition_type=None, suffix=''):
    """Load filtered records from a pickle cache, building the cache on a miss.

    The cache file lives in `cache_dir` and is named `products[_<condition_type>][_<suffix>].pkl`.

    Args:
        cache_dir: directory holding the raw dump files and the pickle cache.
        data_dir: dataset directory handed to `get_condition`.
        key: record field used by the filter condition.
        condition_type: filter name understood by `get_condition` (`None` means no filter).
        suffix: optional tag appended to the cache file name.

    Returns:
        The list of records, either unpickled from the cache or freshly read and filtered.
    """
    if len(suffix): suffix = f'_{suffix}'
    prefix = 'products' if condition_type is None else f'products_{condition_type}'
    cache_file = f'{cache_dir}/{prefix}{suffix}.pkl'

    try:
        # The pickle is this pipeline's own cache; do not point `cache_dir` at untrusted files.
        with open(cache_file, 'rb') as file:
            return pickle.load(file)
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        # Only a missing or truncated/corrupt cache triggers a rebuild. A bare `except` would also
        # swallow KeyboardInterrupt and then silently overwrite the cache.
        pass

    is_valid = get_condition(data_dir, condition_type, key)
    items = get_items(cache_dir, condition=is_valid)

    with open(cache_file, 'wb') as file:
        pickle.dump(items, file)

    return items
    

In [None]:
cache_dir = '/home/scai/phd/aiz218323/scratch/datasets/amazon/dumps/raw/meta_categories/'
data_dir = '/home/scai/phd/aiz218323/scratch/datasets/benchmarks/LF-Amazon-131K/'
items = load_items(cache_dir, data_dir, key='parent_asin', condition_type='a23')

In [None]:
items[3]['images']

[{'thumb': 'https://m.media-amazon.com/images/I/516glRKO6oL._SX38_SY50_CR,0,0,38,50_.jpg',
  'large': 'https://m.media-amazon.com/images/I/516glRKO6oL.jpg',
  'variant': 'MAIN',
  'hi_res': 'https://m.media-amazon.com/images/I/71mjUMNYXrL._SL1500_.jpg'},
 {'thumb': 'https://m.media-amazon.com/images/I/41HdbnZfhsL._SX38_SY50_CR,0,0,38,50_.jpg',
  'large': 'https://m.media-amazon.com/images/I/41HdbnZfhsL.jpg',
  'variant': 'PT01',
  'hi_res': 'https://m.media-amazon.com/images/I/61Ikt4caK7L._SL1500_.jpg'},
 {'thumb': 'https://m.media-amazon.com/images/I/41KJfqyZpDL._SX38_SY50_CR,0,0,38,50_.jpg',
  'large': 'https://m.media-amazon.com/images/I/41KJfqyZpDL.jpg',
  'variant': 'DTLS',
  'hi_res': 'https://m.media-amazon.com/images/I/81i4uKq5hvL._SL1500_.jpg'}]

In [None]:
items[3]['videos']

[{'title': 'Tazo Tea Passion, The Only Tea I Have Ever Enjoyed',
  'url': 'https://www.amazon.com/vdp/0e4f45b1b94c471588e0822f5931d28b?ref=dp_vse_rvc_0',
  'user_id': '/shop/sethgaffar'},
 {'title': 'Tazo Refreshers Herbal Tea & Juice Mix Watermelon Cucumber',
  'url': 'https://www.amazon.com/vdp/0228aed725eb4936a9d3e88ebf136ba5?ref=dp_vse_rvc_1',
  'user_id': ''},
 {'title': 'Tazo Unsweetened Iced Herbal Tea',
  'url': 'https://www.amazon.com/vdp/0bd433867eed4e6b8f75e159c2bd6e35?ref=dp_vse_rvc_2',
  'user_id': ''}]

In [None]:
cache_dir = '/home/scai/phd/aiz218323/scratch/datasets/amazon/dumps/raw/meta_categories/'
data_dir = '/home/scai/phd/aiz218323/scratch/datasets/benchmarks/(mapped)LF-AmazonTitles-1.3M/'

In [None]:
items = load_items(cache_dir, data_dir, key='parent_asin', condition_type='a23')

In [None]:
len([o['title'] for o in items if 'videos' in o and len(o['videos'])])/len(items)

0.10261405006310782

## Extract metadata

In [None]:
#| export
def title_proc(o):
    """Return the record's `title` wrapped in a one-element list."""
    title = o['title']
    return [title]
    
def description_proc(o):
    """Return the record's `description` field unchanged."""
    description = o['description']
    return description
    
def details_proc(o):
    """Return the string form of the record's `details` field as a one-element list."""
    details_text = str(o['details'])
    return [details_text]

def image_proc(o):
    """Pick one URL per entry of `o['images']`, preferring `large`, then `hi_res`, then `thumb`.

    Raises `AssertionError` when any image entry has none of those keys set to a non-None value.
    """
    preference = ('large', 'hi_res', 'thumb')
    urls = []
    for entry in o['images']:
        chosen = next((entry[k] for k in preference if k in entry and entry[k] is not None), None)
        if chosen is not None:
            urls.append(chosen)
    assert len(urls) == len(o['images']), f"Image not found: {o['title']}"
    return urls

def video_proc(o):
    """Collect the `url` of every entry in `o['videos']`, skipping entries whose url is None."""
    urls = []
    for video in o['videos']:
        url = video['url']
        if url is not None:
            urls.append(url)
    return urls
    

In [None]:
video_proc(items[0])

['https://www.amazon.com/vdp/033b0c4dbea64c59b8fbb93c2d94e504?ref=dp_vse_rvc_0',
 'https://www.amazon.com/vdp/8ef0ed79ace64a7b8422f3ae60f76727?ref=dp_vse_rvc_1',
 'https://www.amazon.com/vdp/0bcbc25473ea445c992efe44599fb1a7?ref=dp_vse_rvc_2',
 'https://www.amazon.com/vdp/0a9cdcc7a6d84c72aa58f7f2227becac?ref=dp_vse_rvc_3',
 'https://www.amazon.com/vdp/0acb13edc3734880900532e6765006e1?ref=dp_vse_rvc_4',
 'https://www.amazon.com/vdp/5d27e943d7784dec963890e276d66f38?ref=dp_vse_rvc_5']

In [None]:
#| export
# Registry: metadata type name -> function extracting that metadata from one record.
METADATA_PROCS = dict(
    title=title_proc,
    details=details_proc,
    images=image_proc,
    videos=video_proc,
    description=description_proc,
)

In [None]:
#| export
def get_meta_proc(dtype):
    """Look up the extraction function registered for `dtype` in `METADATA_PROCS`.

    Fails an assertion when `dtype` is not a registered metadata type.
    """
    is_registered = dtype in METADATA_PROCS
    assert is_registered, f'Invalid metadata processing function: {dtype}.'
    proc = METADATA_PROCS[dtype]
    return proc

def extract_meta_info(items, dtype, key):
    """Map each record's `key` value to the metadata extracted by the `dtype` processor.

    When several records share the same `key` value, the last one wins.
    """
    proc = get_meta_proc(dtype)
    mapping = {}
    for record in items:
        item_id = record[key]
        mapping[item_id] = proc(record)
    return mapping
    

In [None]:
meta_mapping = extract_meta_info(items, 'videos', 'parent_asin')

In [None]:
#| export
def get_vocabulary(mapping):
    """Build the metadata vocabulary for `mapping` via `create_vocab_and_item2idx`.

    Returns `(vocab_ids, vocab_txt, mapping_item2idx)`: `vocab_txt` holds the vocabulary entries
    ordered by the value `vocab` associates with them, and `vocab_ids` is `0..len(vocab_txt)-1`.
    """
    vocab, mapping_item2idx = create_vocab_and_item2idx(mapping)

    # Order entries by the value the vocabulary assigns to each of them.
    vocab_txt = sorted(vocab, key=vocab.__getitem__)
    vocab_ids = [*range(len(vocab_txt))]

    return vocab_ids, vocab_txt, mapping_item2idx
    

In [None]:
#| export
def get_ids(data_dir):
    """Load the ids from the train, test and label raw-text files under `{data_dir}/raw_data`.

    Returns `(trn_ids, tst_ids, lbl_ids)`; the second value returned by `load_raw_txt` is discarded.
    """
    ids_per_split = []
    for split in ('train', 'test', 'label'):
        ids, _ = load_raw_txt(f'{data_dir}/raw_data/{split}.raw.txt', encoding='latin-1')
        ids_per_split.append(ids)

    trn_ids, tst_ids, lbl_ids = ids_per_split
    return trn_ids, tst_ids, lbl_ids
    

In [None]:
#| export
def get_matrix(mapping_item2idx, vocab_size, trn_ids, tst_ids, lbl_ids):
    """Build one matrix per id list (train, test, label) with `get_matrix_from_item2idx`.

    Only the matrices are returned; the ids handed back by the helper are discarded.
    """
    matrices = []
    for split_ids in (trn_ids, tst_ids, lbl_ids):
        mat, _ = get_matrix_from_item2idx(mapping_item2idx, vocab_size, split_ids)
        matrices.append(mat)

    trn_mat, tst_mat, lbl_mat = matrices
    return trn_mat, tst_mat, lbl_mat
    

In [None]:
#| export
def filter_vocab(vocab_ids, vocab_txt, trn_mat, tst_mat, lbl_mat=None):
    """Drop vocabulary columns that have no non-zero entry in the train (or label) matrix.

    A column is kept when `trn_mat` has at least one stored entry in it, or, if `lbl_mat` is given,
    when `lbl_mat` does. `tst_mat` is only sliced; it never decides which columns survive.

    Returns `(vocab_ids, vocab_txt, trn_mat, tst_mat, lbl_mat)` restricted to the kept columns
    (`lbl_mat` stays `None` when it was not provided).
    """
    has_labels = lbl_mat is not None

    keep = np.nonzero(trn_mat.getnnz(axis=0) > 0)[0]
    if has_labels:
        used_by_labels = np.nonzero(lbl_mat.getnnz(axis=0) > 0)[0]
        keep = np.union1d(keep, used_by_labels)

    trn_mat, tst_mat = trn_mat[:, keep], tst_mat[:, keep]
    if has_labels:
        lbl_mat = lbl_mat[:, keep]

    kept_ids = [vocab_ids[i] for i in keep]
    kept_txt = [vocab_txt[i] for i in keep]

    return kept_ids, kept_txt, trn_mat, tst_mat, lbl_mat
    

In [None]:
metadata_ids, metadata_txt, mapping_item2idx = get_vocabulary(meta_mapping)

  0%|          | 0/2056005 [00:00<?, ?it/s]

## Download image

In [None]:
#| export
def get_downloaded_image_filename(dest, name, suffix):
    """Return a file stem based on `name` that does not yet exist in `dest` with `suffix`.

    Tries `name`, then `name1`, `name2`, ... until `dest/<stem><suffix>` is not an existing file.

    Args:
        dest: directory (a `Path`) in which the file will be created.
        name: preferred file stem.
        suffix: file extension including the leading dot.

    Returns:
        The first free stem (without the suffix).
    """
    candidate_name = name
    index = 1

    while (dest/f"{candidate_name}{suffix}").is_file():
        # Number the *base* name; appending to the previous candidate produced
        # name1, name12, name123, ...
        candidate_name = f"{name}{index}"
        index += 1

    return candidate_name
    

In [None]:
#| export
def download_image(dest, inp, timeout=4, preserve_filename=False):
    """Download one image into `dest` and return the path it was written to.

    Args:
        dest: destination directory (a `Path`).
        inp: `(index, url)` pair; only the url is used.
        timeout: passed through to `download_url`.
        preserve_filename: reuse the url's file stem (made unique in `dest`) instead of a random UUID.

    Returns:
        The `Path` of the downloaded file, or `None` when the download raised an exception.
    """
    _, url = inp
    url = url.split("?")[0]  # drop the query string before deriving the file name
    url_path = Path(url)
    suffix = url_path.suffix if url_path.suffix else '.jpg'
    name = get_downloaded_image_filename(dest, url_path.stem, suffix) if preserve_filename else str(uuid.uuid4())
    target = dest/f"{name}{suffix}"
    try:
        download_url(url, target, show_progress=False, timeout=timeout)
        return target
    except Exception:
        # Best-effort: report the failure (this message was previously built but never printed)
        # and let the caller skip this url.
        print(f"Couldn't download {url}.")
        return None
        

In [None]:
#| export
def download_images(dest, urls=None, n_workers=8, timeout=4, preserve_filename=False):
    """Download `urls` into `dest` in parallel and return one result per url, in input order.

    Args:
        dest: destination directory; created (including parents) when missing.
        urls: iterable of image urls; `None` or empty yields an empty result.
        n_workers: number of worker processes.
        timeout: per-download timeout forwarded to `download_image`.
        preserve_filename: forwarded to `download_image`.

    Returns:
        List with the downloaded `Path` for each url, or `None` where the download failed.
    """
    dest = Path(dest)
    dest.mkdir(parents=True, exist_ok=True)

    # The signature defaults `urls` to None; treat that like "nothing to download"
    # instead of failing inside `enumerate`.
    urls = [] if urls is None else list(urls)
    if not urls: return []

    worker = partial(download_image, dest, timeout=timeout, preserve_filename=preserve_filename)
    with Pool(processes=n_workers) as pool:
        fnames = list(tqdm(pool.imap(worker, list(enumerate(urls))), total=len(urls)))
    return fnames
    

In [None]:
#| export
def verify_image(fn):
    """Return True when `fn` can be opened and decoded as an image, False otherwise.

    Args:
        fn: path of the file to check; `None` (a failed download) is reported as invalid.
    """
    if fn is None: return False
    try:
        # The context manager closes the underlying file even when decoding fails.
        with Image.open(fn) as im:
            im.draft(im.mode, (32,32))
            im.load()
        return True
    except Exception:
        # Any decoding/IO problem marks the file invalid; KeyboardInterrupt still propagates.
        return False
        

In [None]:
#| export
def verify_images(fns, n_workers=8):
    """Run `verify_image` over `fns` in a process pool and return the results in input order."""
    pool = Pool(processes=n_workers)
    with pool:
        results = pool.imap(verify_image, fns)
        progress = tqdm(results, total=len(fns))
        return list(progress)
        

In [None]:
#| export
def remove_images(fnames, is_valid):
    """Delete every file in `fnames` whose matching `is_valid` flag is falsy.

    `None` entries in `fnames` are skipped.
    """
    for path, ok in zip(fnames, is_valid):
        if ok or path is None:
            continue
        path.unlink()
    

In [None]:
#| export
def filter_images(image_ids, image_txt, trn_mat, tst_mat, lbl_mat, valid_idx):
    """Keep only the columns / vocabulary entries listed in `valid_idx`.

    Returns `(image_ids, image_txt, trn_mat, tst_mat, lbl_mat)` restricted to `valid_idx`.
    """
    trn_mat, tst_mat, lbl_mat = (mat[:, valid_idx] for mat in (trn_mat, tst_mat, lbl_mat))

    kept_ids = [image_ids[i] for i in valid_idx]
    kept_txt = [image_txt[i] for i in valid_idx]

    return kept_ids, kept_txt, trn_mat, tst_mat, lbl_mat
    

In [None]:
#| export
def process_images(save_dir, metadata_ids, metadata_txt, trn_mat, tst_mat, lbl_mat):
    """Download the image urls in `metadata_txt`, drop the invalid ones and filter the matrices.

    Files failing `verify_images` are deleted from `save_dir`; the returned vocabulary text holds
    the file names of the surviving images instead of their urls.
    """
    downloaded = download_images(save_dir, urls=metadata_txt)

    validity = verify_images(downloaded)
    remove_images(downloaded, validity)

    file_names = [None if path is None else path.name for path in downloaded]
    keep_idx = np.where(validity)[0]
    return filter_images(metadata_ids, file_names, trn_mat, tst_mat, lbl_mat, keep_idx)


In [None]:
image_dir = '/home/scai/phd/aiz218323/scratch/datasets/benchmarks/LF-Amazon-131K/images'

In [None]:
fnames = download_images(image_dir, urls=metadata_txt)

  0%|          | 0/15722 [00:00<?, ?it/s]

In [None]:
is_valid = verify_images(fnames)
remove_images(fnames, is_valid)

  0%|          | 0/15722 [00:00<?, ?it/s]

In [None]:
metadata_txt = [o if o is None else o.name for o in fnames]

## Download video

In [None]:
#| export
def get_downloaded_video_filename(dest, name, suffix):
    """Return a file stem based on `name` that does not yet exist in `dest` with `suffix`.

    Tries `name`, then `name1`, `name2`, ... until `dest/<stem><suffix>` is not an existing file.

    Args:
        dest: directory (a `Path`) in which the file will be created.
        name: preferred file stem.
        suffix: file extension including the leading dot.

    Returns:
        The first free stem (without the suffix).
    """
    candidate_name = name
    index = 1

    while (dest/f"{candidate_name}{suffix}").is_file():
        # Number the *base* name; appending to the previous candidate produced
        # name1, name12, name123, ...
        candidate_name = f"{name}{index}"
        index += 1

    return candidate_name
    

In [None]:
#| export
def download_video(dest, url, preserve_filename=False, timeout=4):
    """Stream the content at `url` into a file inside `dest`.

    Args:
        dest: destination directory (a `Path`).
        url: address to fetch.
        preserve_filename: reuse the url's last path segment (made unique in `dest`) as the file
            stem instead of a random UUID.
        timeout: seconds passed to `requests.get`, so a stalled server cannot block a worker forever.

    Returns:
        The `Path` of the written file, or `None` when the response status is not 200 or an
        exception occurred (a message is printed in both cases).
    """
    try:
        # The context manager releases the streamed connection on every exit path.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"Failed to download {url}. Status code: {response.status_code}")
                return None

            url_path = Path(url.split('/')[-1].split('?')[0])
            # Start from the url's own extension; previously `suffix` stayed unbound whenever
            # the url had one, which made the download fail.
            suffix = url_path.suffix
            if not suffix:
                content_type = response.headers.get('Content-Type')
                if content_type:
                    # Strip parameters such as "; charset=UTF-8": guess_extension expects a bare type.
                    suffix = mimetypes.guess_extension(content_type.split(';')[0].strip())
                if not suffix: suffix = '.mp4'

            name = get_downloaded_video_filename(dest, url_path.stem, suffix) if preserve_filename else str(uuid.uuid4())

            target = dest/f"{name}{suffix}"
            with open(target, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk: f.write(chunk)
            return target
    except Exception as e:
        print(f"An error occurred while downloading {url}: {e}")
        return None


In [None]:
save_dir = f'{data_dir}/videos'

os.makedirs(save_dir, exist_ok=True)
download_video(Path(save_dir), metadata_txt[0], preserve_filename=False)

In [None]:
#| export
def download_videos(dest, urls=None, n_workers=8, timeout=4, preserve_filename=False):
    """Download `urls` into `dest` in parallel and return one result per url, in input order.

    Args:
        dest: destination directory; created (including parents) when missing.
        urls: iterable of video urls; `None` or empty yields an empty result.
        n_workers: number of worker processes.
        timeout: currently not forwarded to `download_video`; kept for signature parity
            with `download_images`.
        preserve_filename: forwarded to `download_video`.

    Returns:
        List with the value returned by `download_video` for each url.
    """
    dest = Path(dest)
    dest.mkdir(parents=True, exist_ok=True)

    # The signature defaults `urls` to None; treat that like "nothing to download"
    # instead of failing on `len(None)`.
    urls = [] if urls is None else list(urls)
    if not urls: return []

    worker = partial(download_video, dest, preserve_filename=preserve_filename)
    with Pool(processes=n_workers) as pool:
        fnames = list(tqdm(pool.imap(worker, urls), total=len(urls)))
    return fnames
    

In [None]:
fnames = download_videos(f'{data_dir}/videos', urls=metadata_txt)

  0%|          | 0/663584 [00:00<?, ?it/s]

Failed to download https://www.amazon.com/vdp/0bf33426744b4e32ac1c6ae62076ae2c?ref=dp_vse_rvc_6. Status code: 404
Failed to download https://www.amazon.com/vdp/0974d43c5a6e483584a8a4901daf3d7b?ref=dp_vse_rvc_1. Status code: 404
Failed to download https://www.amazon.com/vdp/05917ed7a0b74609883034c058f76d08?ref=dp_vse_rvc_0. Status code: 404
Failed to download https://www.amazon.com/vdp/02c528ea20f645248d3e59a139991f90?ref=dp_vse_rvc_0. Status code: 404
Failed to download https://www.amazon.com/vdp/0dfe16a62414499683f3fd6d2cc73a80?ref=dp_vse_rvc_3. Status code: 404
Failed to download https://www.amazon.com/vdp/058b0e0d1ced41819809c0ec4ff4d2bd?ref=dp_vse_rvc_0. Status code: 404
Failed to download https://www.amazon.com/vdp/021d877e7fac48e7b403caa713d9553c?ref=dp_vse_rvc_1. Status code: 404
Failed to download https://www.amazon.com/vdp/6e4bba792d5849f183f96e806d457d57?ref=dp_vse_rvc_2. Status code: 404
Failed to download https://www.amazon.com/vdp/05b60ffc03c84d5ea0d023d8f7bced12?ref=dp_vs

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Failed to download https://www.amazon.com/vdp/036e9a1a79b1425ca34d5db00a3421d0?ref=dp_vse_rvc_0. Status code: 404
Failed to download https://www.amazon.com/vdp/00c4fbf983994f929a30b9b4bd61e066?ref=dp_vse_rvc_3. Status code: 404
Failed to download https://www.amazon.com/vdp/07ada92e039e49e68ee9cb16c3b2f02b?ref=dp_vse_rvc_0. Status code: 404
Failed to download https://www.amazon.com/vdp/0269bd3948714d488f01a2e545241fb6?ref=dp_vse_rvc_3. Status code: 404
Failed to download https://www.amazon.com/vdp/08baba5d7e8a497095f05d8a246ed751?ref=dp_vse_rvc_0. Status code: 404
Failed to download https://www.amazon.com/vdp/0d5cbd265aa647e0a2e628df638b0907?ref=dp_vse_rvc_1. Status code: 404
Failed to download https://www.amazon.com/vdp/0b64436a380d4796a6fa80766cc957ad?ref=dp_vse_rvc_0. Status code: 404
Failed to download https://www.amazon.com/vdp/013b3dfcac9d483797b9df7616832694?ref=dp_vse_rvc_4. Status code: 503
Failed to download https://www.amazon.com/vdp/0af27ebe16fa4fb1af22949470b4e0cb?ref=dp_vs

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Failed to download https://www.amazon.com/vdp/0b4bcad99f7047398e76baac1640534d?ref=dp_vse_rvc_4. Status code: 404
Failed to download https://www.amazon.com/vdp/071bd55e5b504c3aafae8cef15800eff?ref=dp_vse_rvc_4. Status code: 404
Failed to download https://www.amazon.com/vdp/01d8e588fdf64d3e9a53b12c38d5ad44?ref=dp_vse_rvc_1. Status code: 404
Failed to download https://www.amazon.com/vdp/0c0527d5cee14f33814a759916d33f9e?ref=dp_vse_rvc_3. Status code: 404
Failed to download https://www.amazon.com/vdp/c7aee2d750584750b60d31497e0de375?ref=dp_vse_rvc_0. Status code: 404
Failed to download https://www.amazon.com/vdp/06065d80f2d54a86a747f304f3dbd9e1?ref=dp_vse_rvc_0. Status code: 404
Failed to download https://www.amazon.com/vdp/0ba09b44e2af4c2f84378d1a4b34f2d9?ref=dp_vse_rvc_3. Status code: 404
Failed to download https://www.amazon.com/vdp/025fc7d0a50c42c4bcb151f4424a4771?ref=dp_vse_rvc_3. Status code: 404
Failed to download https://www.amazon.com/vdp/5f70079e139f49b288c44c4cd593cb6f?ref=dp_vs

Process ForkPoolWorker-48:
Process ForkPoolWorker-42:
Process ForkPoolWorker-46:
Process ForkPoolWorker-44:
Process ForkPoolWorker-43:
Process ForkPoolWorker-45:
Process ForkPoolWorker-47:
Process ForkPoolWorker-41:

KeyboardInterrupt



In [None]:
#| export
def process_videos(metadata_ids, metadata_txt, trn_mat, tst_mat, lbl_mat):
    """Unfinished video counterpart of `process_images` (see TODO below).

    NOTE(review): `fnames` is neither a parameter nor assigned in this body, so it is resolved from
    the enclosing (notebook/module) scope; in the exported module this presumably raises NameError.
    NOTE(review): validation and cleanup go through `verify_images` / `remove_images`, which check
    files with PIL's `Image.open`; video files would presumably be reported invalid and deleted —
    confirm before relying on this function.
    """
    # TODO: download the videos here (no download step exists yet).
    
    is_valid = verify_images(fnames)
    remove_images(fnames, is_valid)

    # Replace the vocabulary text with the file names of the entries in `fnames` (None stays None).
    metadata_txt = [o if o is None else o.name for o in fnames]
    return filter_images(metadata_ids, metadata_txt, trn_mat, tst_mat, lbl_mat, np.where(is_valid)[0])


In [None]:
video_dir = '/home/scai/phd/aiz218323/scratch/datasets/benchmarks/LF-Amazon-131K/videos'

In [None]:
metadata_txt[:5]

['https://www.amazon.com/vdp/0354b17a71a44097930f28168b8aee8f?ref=dp_vse_rvc_0',
 'https://www.amazon.com/vdp/0e4f45b1b94c471588e0822f5931d28b?ref=dp_vse_rvc_0',
 'https://www.amazon.com/vdp/0228aed725eb4936a9d3e88ebf136ba5?ref=dp_vse_rvc_1',
 'https://www.amazon.com/vdp/0bd433867eed4e6b8f75e159c2bd6e35?ref=dp_vse_rvc_2',
 'https://www.amazon.com/vdp/7615c3bde1d54f1abaeeff2ff13be5d8?ref=dp_vse_rvc_0']

In [None]:
fnames = download_videos(video_dir, urls=metadata_txt)

  0%|          | 0/15722 [00:00<?, ?it/s]

In [None]:
# NOTE(review): `verify_videos` and `remove_videos` are not defined in any cell visible in this
# notebook; unless `from sugar.core import *` provides them, this cell fails with NameError on a
# fresh kernel (Restart & Run All) — confirm where they are meant to come from.
is_valid = verify_videos(fnames)
remove_videos(fnames, is_valid)

  0%|          | 0/15722 [00:00<?, ?it/s]

In [None]:
metadata_txt = [o if o is None else o.name for o in fnames]

## Construct matrix

In [None]:
#| export
def get_metadata(cache_dir, data_dir, meta_type, key, condition_type, do_filter=True):
    """Build the train/test/label metadata matrices and vocabulary for one metadata type.

    Args:
        cache_dir: directory with the raw dump files (and the pickle cache).
        data_dir: dataset directory containing `raw_data/`; images are saved to `{data_dir}/images`.
        meta_type: metadata type registered in `METADATA_PROCS` (e.g. 'title', 'images').
        key: record field holding the item id (e.g. 'parent_asin').
        condition_type: record filter name forwarded to `load_items`.
        do_filter: when True, drop vocabulary entries unused by the train and label matrices.

    Returns:
        `(trn_mat, tst_mat, lbl_mat, metadata_ids, metadata_txt)`.

    Raises:
        NotImplementedError: for `meta_type == 'videos'`, whose processing is not implemented.
    """
    # Fail fast, before the expensive loading. The media branches must test `meta_type`:
    # `key` is the id field name and never equals 'images' or 'videos'.
    if meta_type == 'videos':
        raise NotImplementedError('TODO')

    items = load_items(cache_dir, data_dir, key, condition_type)

    meta_mapping = extract_meta_info(items, meta_type, key)

    metadata_ids, metadata_txt, mapping_item2idx = get_vocabulary(meta_mapping)
    trn_ids, tst_ids, lbl_ids = get_ids(data_dir)
    trn_mat, tst_mat, lbl_mat = get_matrix(mapping_item2idx, len(metadata_ids), trn_ids, tst_ids, lbl_ids)

    if meta_type == 'images':
        # Keep the filtered outputs; the result of `process_images` used to be discarded.
        metadata_ids, metadata_txt, trn_mat, tst_mat, lbl_mat = process_images(
            f'{data_dir}/images', metadata_ids, metadata_txt, trn_mat, tst_mat, lbl_mat)

    if do_filter:
        metadata_ids, metadata_txt, trn_mat, tst_mat, lbl_mat = filter_vocab(metadata_ids, metadata_txt, trn_mat, tst_mat, lbl_mat)

    return trn_mat, tst_mat, lbl_mat, metadata_ids, metadata_txt
    

In [None]:
#| export
def save_metadata(save_dir, trn_mat, tst_mat, lbl_mat, metadata_ids, metadata_txt, metadata_type):
    """Write the three metadata matrices and the vocabulary text under `save_dir`.

    Matrices go to `{save_dir}/{metadata_type}_{trn,tst,lbl}_X_Y.npz`; the vocabulary goes to
    `{save_dir}/raw_data/{metadata_type}.raw.txt` (the `raw_data` directory is created if needed).
    """
    for split, mat in (('trn', trn_mat), ('tst', tst_mat), ('lbl', lbl_mat)):
        sp.save_npz(f'{save_dir}/{metadata_type}_{split}_X_Y.npz', mat)

    raw_dir = f'{save_dir}/raw_data'
    os.makedirs(raw_dir, exist_ok=True)
    save_raw_txt(f'{raw_dir}/{metadata_type}.raw.txt', metadata_ids, metadata_txt)
    

In [None]:
cache_dir = '/home/scai/phd/aiz218323/scratch/datasets/amazon/dumps/raw/meta_categories/'
data_dir = '/home/scai/phd/aiz218323/scratch/datasets/benchmarks/LF-Amazon-131K/'

In [None]:
# `get_metadata` names its flag `do_filter`; passing `do_filtering` raises TypeError.
trn_mat, tst_mat, lbl_mat, metadata_ids, metadata_txt = get_metadata(cache_dir, data_dir, meta_type='images', key='parent_asin', 
                                                                     condition_type='a23', do_filter=True)

  0%|          | 0/5622 [00:00<?, ?it/s]

  0%|          | 0/294805 [00:00<?, ?it/s]

  0%|          | 0/134835 [00:00<?, ?it/s]

  0%|          | 0/131073 [00:00<?, ?it/s]

In [None]:
# `metadata_type` is not defined at notebook scope; use the type requested from `get_metadata` above.
save_metadata(data_dir, trn_mat, tst_mat, lbl_mat, metadata_ids, metadata_txt, 'images')

## `__main__`

In [None]:
#| export
def parse_args():
    """Parse the command-line options of the metadata extraction script.

    `--no_filter` is a store_false flag: `args.no_filter` is True by default and becomes False
    when the flag is given.
    """
    option_specs = [
        ('--cache_dir', dict(type=str, required=True)),
        ('--data_dir', dict(type=str, required=True)),
        ('--key', dict(type=str, default='parent_asin')),
        ('--condition_type', dict(type=str, default=None)),
        ('--metadata_type', dict(type=str, required=True)),
        ('--no_filter', dict(action='store_false')),
    ]

    parser = argparse.ArgumentParser()
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    return parser.parse_args()


In [None]:
#| export

if __name__ == '__main__':
    # Script entry point: build the metadata matrices, save them, and report the wall-clock time.
    start_time = timer()
    args = parse_args()

    trn_mat, tst_mat, lbl_mat, metadata_ids, metadata_txt = get_metadata(
        args.cache_dir,
        args.data_dir,
        meta_type=args.metadata_type,
        key=args.key,
        condition_type=args.condition_type,
        do_filter=args.no_filter,  # store_false flag: True unless --no_filter was passed
    )
    save_metadata(args.data_dir, trn_mat, tst_mat, lbl_mat, metadata_ids, metadata_txt, args.metadata_type)

    elapsed = timer() - start_time
    print(f'Time elapsed: {elapsed:.2f} seconds.')
    