In [None]:
#| default_exp 17_map-amazon-reviews-from-dump

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#| export
import scipy.sparse as sp, argparse, numpy as np

from tqdm.auto import tqdm
from pathlib import Path
from timeit import default_timer as timer

from sugar.map_amazon_dump import *

## Load data

In [None]:
# NOTE(review): both paths are absolute and specific to one user's machine; anyone
# else running this cell has to point them at their own dump cache and dataset copy.
cache_dir = '/home/scai/phd/aiz218323/scratch/datasets/amazon/dumps/raw/review_categories/'
data_dir = '/home/scai/phd/aiz218323/scratch/datasets/benchmarks/LF-Amazon-131K/'
# Load the records used by the exploration cells below, keyed on `parent_asin`.
# NOTE(review): the meaning of condition_type='a23' is defined by `load_items`, which
# lives in `sugar.map_amazon_dump` — confirm there.
items = load_items(cache_dir, data_dir, key='parent_asin', condition_type='a23')

  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Peek at the first loaded record to see which fields are available.
items[0]

{'rating': 3.0,
 'title': 'Delicious but too expensive',
 'text': 'It’s candy, I eat it for a treat',
 'images': [],
 'asin': 'B000IXWDFO',
 'parent_asin': 'B000IXWDFO',
 'user_id': 'AFNT6ZJCYQN3WDIKUSWHJDXNND2Q',
 'timestamp': 1676512805768,
 'helpful_vote': 0,
 'verified_purchase': True}

## Extract reviews

In [None]:
#| export
def title_proc(o):
    """Return `(title, rating)` for one review record.

    `o` must support `o['title']` and `o['rating']`.
    """
    title, rating = o['title'], o['rating']
    return title, rating
    
def text_proc(o):
    """Return `(text, rating)` for one review record.

    `o` must support `o['text']` and `o['rating']`.
    """
    body, rating = o['text'], o['rating']
    return body, rating

def title_text_proc(o):
    """Return `(title + ' ' + text, rating)` for one review record.

    The title and the body are concatenated with a single space between them.
    """
    combined = o['title'] + ' ' + o['text']
    return combined, o['rating']
    

In [None]:
#| export
# Registry from a review-type name to the function that turns one review record
# into a `(text, rating)` pair. `get_review_proc` looks names up in this table.
REVIEW_PROCS = {
    'title': title_proc, 
    'text': text_proc, 
    'title_text': title_text_proc,
}

In [None]:
#| export
def get_review_proc(dtype):
    """Look up the review-processing function registered under `dtype`.

    Raises `AssertionError` when `dtype` is not a key of `REVIEW_PROCS`.
    Note that the check is an `assert`, so it is skipped when Python runs
    with assertions disabled.
    """
    is_registered = dtype in REVIEW_PROCS
    assert is_registered, f'Invalid review processing function: {dtype}.'
    return REVIEW_PROCS[dtype]

def extract_review_info(items, dtype, key):
    """Group processed reviews by the value each record holds under `key`.

    Every record in `items` is passed through the processing function selected
    by `dtype` (see `get_review_proc`), and the result is appended to the list
    stored under `record[key]`. Returns the resulting dict; keys appear in the
    order they are first encountered.
    """
    proc = get_review_proc(dtype)
    grouped = {}
    for record in tqdm(items, total=len(items)):
        group_key = record[key]
        if group_key not in grouped:
            grouped[group_key] = []
        grouped[group_key].append(proc(record))
    return grouped
    

In [None]:
# Group the review bodies (each paired with its rating) by `parent_asin`.
review_mapping = extract_review_info(items, 'text', 'parent_asin')

  0%|          | 0/221229 [00:00<?, ?it/s]

In [None]:
# Show every (text, rating) pair collected for one example item.
review_mapping['B000IXWDFO']

[('It’s candy, I eat it for a treat', 3.0),
 ('Love Hersheys milk chocolate', 5.0),
 ('Bought a so called Giant Hershey bar a week ago, the first giant I bought in awhile.<br />  I still can\'t believe how much they shrunk their bars down "again".<br /> Its as thin as a reg Hershey bar now and it will soon be as thin as todays Halloween mini\'s that also was several times thicker in size.<br />  Hershey bars today are nowhere near the size they once were. Their size is nothing but joke for what they cost us now. I told my wife to never buy any Hersheys again!  Other brands offer a much better deal, taste just as good or better & Im happy to see stores offering so many more brands today.<br /> These simply cost too much being such small portions today & now they discovered metal inside them?  You must be chitting me, is this to add more weight so we get even less chocolate? Hey it is sold by weight?..lol<br />  Oh well\' most our family was done with the Hershey brand when most their ca

In [None]:
#| export
def create_vocab_and_item2idx(mapping):
    """Index the distinct review texts and re-express each item in terms of them.

    `mapping` maps an item key to a list of entries whose element 0 is the text
    and element 1 is its value. Returns `(vocab, mapping_item2idx)`:

    * `vocab` maps each distinct text to an integer assigned in order of first
      appearance (0, 1, 2, ...);
    * `mapping_item2idx` maps each item key to its list of `(vocab index, value)`
      pairs. A key whose entry list is empty does not appear in it.
    """
    vocab, item2idx = {}, {}
    for item_key, entries in tqdm(mapping.items()):
        for entry in entries:
            text = entry[0]
            if text not in vocab:
                # The next free index is simply the current vocabulary size.
                vocab[text] = len(vocab)
            item2idx.setdefault(item_key, []).append((vocab[text], entry[1]))
    return vocab, item2idx
    

In [None]:
#| export
def get_vocabulary(mapping):
    """Return the vocabulary as parallel id/text lists plus the per-item index pairs.

    Delegates to `create_vocab_and_item2idx` and returns
    `(vocab_ids, vocab_txt, mapping_item2idx)`, where `vocab_txt` lists the texts
    ordered by their vocabulary index and `vocab_ids` is `[0, 1, ..., len - 1]`.
    """
    vocab, mapping_item2idx = create_vocab_and_item2idx(mapping)

    # Order the texts by the index they were assigned.
    ordered_pairs = sorted(vocab.items(), key=lambda pair: pair[1])
    vocab_txt = [text for text, _ in ordered_pairs]
    vocab_ids = [*range(len(vocab_txt))]

    return vocab_ids, vocab_txt, mapping_item2idx
    

In [None]:
# Build the vocabulary of distinct review texts and the per-item (vocab index, rating) pairs.
vocab_ids, vocab_txt, mapping_item2idx = get_vocabulary(review_mapping)

  0%|          | 0/4418 [00:00<?, ?it/s]

In [None]:
# Unzip the item's (vocab index, rating) pairs into one tuple of indices and one tuple of ratings.
list(zip(*mapping_item2idx['B000IXWDFO']))

[(0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66),
 (3.0,
  5.0,
  1.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  3.0,
  3.0,
  1.0,
  5.0,
  5.0,
  2.0,
  3.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  1.0,
  5.0,
  4.0,
  5.0,
  5.0,
  4.0,
  2.0,
  5.0,
  1.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  4.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  4.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  4.0,
  5.0,
  5.0,
  1.0,
  5.0,
  5.0,
  5.0)]

## Construct matrix

In [None]:
#| export
def get_matrix_from_item2idx(mapping, vocab_size, ids=None):
    """Build a CSR matrix with one row per id and one column per vocabulary entry.

    Args:
        mapping: dict-like from an item id to a list of `(column_index, value)` pairs.
        vocab_size: number of columns of the resulting matrix.
        ids: row order. Defaults to the keys of `mapping`. An id that is absent
            from `mapping`, or mapped to an empty list, yields an all-zero row.

    Returns:
        `(mat, ids)` where `mat` is a float32 `scipy.sparse.csr_matrix` of shape
        `(len(ids), vocab_size)` and `ids` is the row order that was used.
        If the same column index occurs more than once for an id, the values
        are summed.
    """
    data, indices, indptr = [], [], [0]
    ids = list(mapping) if ids is None else ids
    for i in tqdm(ids):
        entries = mapping[i] if i in mapping else None
        # Unpacking `zip(*[])` into two names raises ValueError, so an empty
        # entry list is treated exactly like a missing id: an empty row.
        if entries:
            item_idx, item_val = zip(*entries)
            data.extend(item_val)
            indices.extend(item_idx)
        # Row boundary: everything appended so far belongs to rows <= this one.
        indptr.append(len(data))
    mat = sp.csr_matrix((data, indices, indptr), shape=(len(ids), vocab_size), dtype=np.float32)
    # Bring the matrix into canonical form: sorted column indices, duplicates merged.
    mat.sort_indices()
    mat.sum_duplicates()
    return mat, ids
    

In [None]:
#| export
def get_matrix(mapping_item2idx, vocab_size, trn_ids, tst_ids, lbl_ids):
    """Build the train, test and label matrices from the same item-to-index mapping.

    Calls `get_matrix_from_item2idx` once per id list (train, test, label, in
    that order) and returns the three matrices as a tuple; the id lists those
    calls return are discarded.
    """
    matrices = []
    for split_ids in (trn_ids, tst_ids, lbl_ids):
        split_mat, _ = get_matrix_from_item2idx(mapping_item2idx, vocab_size, split_ids)
        matrices.append(split_mat)
    return tuple(matrices)
    

In [None]:
#| export
def get_metadata(cache_dir, data_dir, meta_type, key, condition_type, do_filter=True):
    """Run the full pipeline: load records, group reviews, index them, build matrices.

    Steps, in order: `load_items` -> `extract_review_info` -> `get_vocabulary`
    -> `get_ids` -> `get_matrix`, followed by `filter_vocab` when `do_filter`
    is true.

    Returns `(trn_mat, tst_mat, lbl_mat, metadata_ids, metadata_txt)`.
    """
    # NOTE(review): `.stem` drops everything after the last dot of the final path
    # component, so a directory name containing a dot would be truncated here.
    # Confirm against `load_items` whether `.name` is what is actually wanted.
    dataset_tag = Path(data_dir).stem
    raw_items = load_items(cache_dir, data_dir, key, condition_type, dataset_tag)

    per_item_reviews = extract_review_info(raw_items, meta_type, key)

    metadata_ids, metadata_txt, item2idx = get_vocabulary(per_item_reviews)
    trn_ids, tst_ids, lbl_ids = get_ids(data_dir)
    trn_mat, tst_mat, lbl_mat = get_matrix(item2idx, len(metadata_ids), trn_ids, tst_ids, lbl_ids)

    if not do_filter:
        return trn_mat, tst_mat, lbl_mat, metadata_ids, metadata_txt

    filtered = filter_vocab(metadata_ids, metadata_txt, trn_mat, tst_mat, lbl_mat)
    metadata_ids, metadata_txt, trn_mat, tst_mat, lbl_mat = filtered
    return trn_mat, tst_mat, lbl_mat, metadata_ids, metadata_txt
    

## `__main__`

In [None]:
#| export
def parse_args():
    """Parse the command-line options of the review-mapping script.

    Returns an `argparse.Namespace` with the attributes `cache_dir`, `data_dir`,
    `key`, `condition_type`, `review_type` and `no_filter`.

    `--review_type` is restricted to the names registered in `REVIEW_PROCS`, so
    an unknown value is rejected immediately (argparse exits with status 2)
    instead of failing later inside the pipeline.
    """
    parser = argparse.ArgumentParser(description='Map reviews from a dump onto a dataset and save them as metadata.')
    parser.add_argument('--cache_dir', type=str, required=True,
                        help='cache directory passed to load_items')
    parser.add_argument('--data_dir', type=str, required=True,
                        help='dataset directory; also where the metadata is saved')
    parser.add_argument('--key', type=str, default='parent_asin',
                        help='record field used to group reviews (default: %(default)s)')
    parser.add_argument('--condition_type', type=str, default=None,
                        help='condition type passed to load_items (default: %(default)s)')
    parser.add_argument('--review_type', type=str, required=True, choices=sorted(REVIEW_PROCS),
                        help='which part of each review to extract')
    # `store_false`: the attribute `no_filter` is True by default and becomes
    # False only when the flag is given, i.e. it holds "do filter", not "no filter".
    parser.add_argument('--no_filter', action='store_false',
                        help='skip the vocabulary filtering step')
    return parser.parse_args()
    

In [None]:
#| export
if __name__ == '__main__':
    # Time the whole run, from argument parsing to saving.
    t0 = timer()

    cli = parse_args()

    # `--no_filter` is declared with `store_false`, so `cli.no_filter` is True
    # unless the flag was given; that is why it is passed directly as `do_filter`.
    outputs = get_metadata(
        cli.cache_dir,
        cli.data_dir,
        meta_type=cli.review_type,
        key=cli.key,
        condition_type=cli.condition_type,
        do_filter=cli.no_filter,
    )
    trn_mat, tst_mat, lbl_mat, metadata_ids, metadata_txt = outputs
    save_metadata(cli.data_dir, trn_mat, tst_mat, lbl_mat, metadata_ids, metadata_txt, f'review_{cli.review_type}')

    print(f'Time elapsed: {timer()-t0:.2f} seconds.')
    