In [None]:
#| default_exp 17_map-amazon-reviews-from-dump

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#| export
import scipy.sparse as sp, argparse, numpy as np

from tqdm.auto import tqdm
from pathlib import Path
from timeit import default_timer as timer

from sugar.map_amazon_dump import *

## Load data

In [None]:
# Machine-specific absolute paths; point these at your own copies before running elsewhere.
cache_dir = '/home/scai/phd/aiz218323/scratch/datasets/amazon/dumps/raw/review_categories/'
data_dir = '/home/scai/phd/aiz218323/scratch/datasets/benchmarks/LF-Amazon-131K/'
# `load_items` comes from `sugar.map_amazon_dump`; the result is indexed like a sequence of
# review records in the cells below.
items = load_items(cache_dir, data_dir, key='parent_asin', condition_type='a23')

  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Inspect one loaded review record to see which fields are available.
items[0]

{'rating': 3.0,
 'title': 'Delicious but too expensive',
 'text': 'It’s candy, I eat it for a treat',
 'images': [],
 'asin': 'B000IXWDFO',
 'parent_asin': 'B000IXWDFO',
 'user_id': 'AFNT6ZJCYQN3WDIKUSWHJDXNND2Q',
 'timestamp': 1676512805768,
 'helpful_vote': 0,
 'verified_purchase': True}

## Extract reviews

In [None]:
#| export
def title_proc(o):
    """Return `(title, rating)` for the review record `o`.

    `o` must support lookup by the 'title' and 'rating' keys.
    """
    title, rating = o['title'], o['rating']
    return title, rating
    
def text_proc(o):
    """Return `(text, rating)` for the review record `o`.

    `o` must support lookup by the 'text' and 'rating' keys.
    """
    body, rating = o['text'], o['rating']
    return body, rating

def title_text_proc(o):
    """Return `(title + ' ' + text, rating)` for the review record `o`.

    The title and the text are concatenated with a single space between them, so both must
    support `+` with a `str`.
    """
    title, body = o['title'], o['text']
    combined = title + ' ' + body
    return combined, o['rating']
    

In [None]:
#| export
# Registry of review processors, keyed by the name accepted by `get_review_proc`.
# Every processor takes one review record and returns a `(content, rating)` tuple.
REVIEW_PROCS = dict(
    title=title_proc,
    text=text_proc,
    title_text=title_text_proc,
)

In [None]:
#| export
def get_review_proc(dtype):
    """Look up the review processor registered under `dtype` in `REVIEW_PROCS`.

    Raises:
        AssertionError: if `dtype` is not a key of `REVIEW_PROCS`.
    """
    is_registered = dtype in REVIEW_PROCS
    assert is_registered, f'Invalid review processing function: {dtype}.'
    proc = REVIEW_PROCS[dtype]
    return proc

def extract_review_info(items, dtype, key):
    """Group processed reviews by the value each record holds under `key`.

    Args:
        items: sized iterable of review records (indexable by `key`).
        dtype: name of the processor to apply, resolved through `get_review_proc`.
        key: record field used as the grouping key.

    Returns:
        dict mapping each distinct `record[key]` to the list of processor outputs for the
        records sharing it, in iteration order.
    """
    processor = get_review_proc(dtype)
    grouped = {}
    for record in tqdm(items, total=len(items)):
        group_key = record[key]
        if group_key not in grouped:
            grouped[group_key] = []
        grouped[group_key].append(processor(record))
    return grouped
    

In [None]:
# Group the `(text, rating)` pairs of every review under its 'parent_asin'.
review_mapping = extract_review_info(items, 'text', 'parent_asin')

  0%|          | 0/221229 [00:00<?, ?it/s]

In [None]:
# Spot-check the reviews collected for a single item.
review_mapping['B000IXWDFO']

[('It’s candy, I eat it for a treat', 3.0),
 ('Love Hersheys milk chocolate', 5.0),
 ('Bought a so called Giant Hershey bar a week ago, the first giant I bought in awhile.<br />  I still can\'t believe how much they shrunk their bars down "again".<br /> Its as thin as a reg Hershey bar now and it will soon be as thin as todays Halloween mini\'s that also was several times thicker in size.<br />  Hershey bars today are nowhere near the size they once were. Their size is nothing but joke for what they cost us now. I told my wife to never buy any Hersheys again!  Other brands offer a much better deal, taste just as good or better & Im happy to see stores offering so many more brands today.<br /> These simply cost too much being such small portions today & now they discovered metal inside them?  You must be chitting me, is this to add more weight so we get even less chocolate? Hey it is sold by weight?..lol<br />  Oh well\' most our family was done with the Hershey brand when most their ca

In [None]:
#| export
def create_vocab_and_item2idx(mapping):
    """Assign an integer index to every distinct text and re-express `mapping` with it.

    Args:
        mapping: dict of item key -> iterable of entries, where `entry[0]` is the text and
            `entry[1]` the value attached to it.

    Returns:
        `(vocab, mapping_item2idx)`: `vocab` maps each distinct text to its index (assigned
        in order of first appearance, starting at 0); `mapping_item2idx` maps each item key
        that has at least one entry to its list of `(text_index, value)` pairs.
    """
    text2idx, item2pairs = {}, {}
    for item_key, entries in tqdm(mapping.items()):
        for entry in entries:
            text = entry[0]
            if text not in text2idx:
                # The next free index is the current vocabulary size.
                text2idx[text] = len(text2idx)
            # Created lazily, so items without entries never appear in the output.
            if item_key not in item2pairs:
                item2pairs[item_key] = []
            item2pairs[item_key].append((text2idx[text], entry[1]))
    return text2idx, item2pairs
    

In [None]:
#| export
def get_vocabulary(mapping):
    """Build the text vocabulary for `mapping` and index the items against it.

    Returns:
        `(vocab_ids, vocab_txt, mapping_item2idx)`: `vocab_txt` lists the distinct texts
        ordered by their vocabulary index, `vocab_ids` is `[0, 1, ..., len(vocab_txt) - 1]`,
        and `mapping_item2idx` is the item -> `(text_index, value)` mapping produced by
        `create_vocab_and_item2idx`.
    """
    vocab, mapping_item2idx = create_vocab_and_item2idx(mapping)

    by_index = sorted(vocab.items(), key=lambda pair: pair[1])
    vocab_txt = [text for text, _ in by_index]
    vocab_ids = [*range(len(vocab_txt))]

    return vocab_ids, vocab_txt, mapping_item2idx
    

In [None]:
# Build the review-text vocabulary and the per-item `(text_index, rating)` lists.
vocab_ids, vocab_txt, mapping_item2idx = get_vocabulary(review_mapping)

  0%|          | 0/4418 [00:00<?, ?it/s]

In [None]:
# Transpose the `(text_index, rating)` pairs of one item into an index tuple and a rating tuple.
list(zip(*mapping_item2idx['B000IXWDFO']))

[(0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66),
 (3.0,
  5.0,
  1.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  3.0,
  3.0,
  1.0,
  5.0,
  5.0,
  2.0,
  3.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  1.0,
  5.0,
  4.0,
  5.0,
  5.0,
  4.0,
  2.0,
  5.0,
  1.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  4.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  4.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  4.0,
  5.0,
  5.0,
  1.0,
  5.0,
  5.0,
  5.0)]

## Construct matrix

In [None]:
#| export
def get_matrix_from_item2idx(mapping, vocab_size, ids=None):
    """Build a CSR matrix with one row per id and one column per vocabulary entry.

    Args:
        mapping: dict of id -> list of `(column_index, value)` pairs.
        vocab_size: number of columns of the resulting matrix.
        ids: sized iterable fixing the row order; defaults to the keys of `mapping`.
            Ids absent from `mapping`, or mapped to an empty list, produce an empty row.

    Returns:
        `(mat, ids)`: `mat` is a float32 `scipy.sparse.csr_matrix` of shape
        `(len(ids), vocab_size)` with sorted indices, in which repeated
        `(row, column)` pairs have been summed; `ids` is the row order that was used.
    """
    data, indices, indptr = [], [], [0]
    ids = list(mapping) if ids is None else ids
    for i in tqdm(ids):
        # Truthiness skips unknown ids *and* ids mapped to an empty list; unpacking
        # `zip(*[])` into two names would otherwise raise ValueError.
        pairs = mapping.get(i, ())
        if pairs:
            item_idx, item_val = zip(*pairs)
            data.extend(item_val)
            indices.extend(item_idx)
        # Always close the row, so rows stay aligned with `ids`.
        indptr.append(len(data))
    mat = sp.csr_matrix((data, indices, indptr), shape=(len(ids), vocab_size), dtype=np.float32)
    mat.sort_indices()
    mat.sum_duplicates()
    return mat, ids
    

In [None]:
#| export
def get_matrix(mapping_item2idx, vocab_size, trn_ids, tst_ids, lbl_ids):
    """Build the train, test and label matrices from one shared item -> index mapping.

    Each id list is turned into a matrix by `get_matrix_from_item2idx`, in the order
    train, test, label; the ids that helper returns are discarded.

    Returns:
        The tuple `(trn_mat, tst_mat, lbl_mat)`.
    """
    matrices = []
    for split_ids in (trn_ids, tst_ids, lbl_ids):
        split_mat, _ = get_matrix_from_item2idx(mapping_item2idx, vocab_size, split_ids)
        matrices.append(split_mat)
    return tuple(matrices)
    

In [None]:
#| export
def get_metadata(cache_dir, data_dir, meta_type, key, condition_type, do_filter=True):
    """Run the full pipeline: load items, extract reviews, build vocabulary and matrices.

    Args:
        cache_dir, data_dir, key, condition_type: forwarded to `load_items`.
        meta_type: review processor name (a key of `REVIEW_PROCS`).
        key: also the record field used to group reviews.
        do_filter: when true, the results are passed through `filter_vocab` before returning.

    Returns:
        `(trn_mat, tst_mat, lbl_mat, metadata_ids, metadata_txt)`.
    """
    # NOTE(review): `Path.stem` drops everything after the last '.', so a directory such as
    # 'LF-AmazonTitles-1.3M' yields 'LF-AmazonTitles-1' — confirm `load_items` expects that.
    dataset_stem = Path(data_dir).stem
    items = load_items(cache_dir, data_dir, key, condition_type, dataset_stem)

    review_mapping = extract_review_info(items, meta_type, key)
    metadata_ids, metadata_txt, mapping_item2idx = get_vocabulary(review_mapping)

    trn_ids, tst_ids, lbl_ids = get_ids(data_dir)
    matrices = get_matrix(mapping_item2idx, len(metadata_ids), trn_ids, tst_ids, lbl_ids)
    trn_mat, tst_mat, lbl_mat = matrices

    if not do_filter:
        return trn_mat, tst_mat, lbl_mat, metadata_ids, metadata_txt

    metadata_ids, metadata_txt, trn_mat, tst_mat, lbl_mat = filter_vocab(
        metadata_ids, metadata_txt, trn_mat, tst_mat, lbl_mat
    )
    return trn_mat, tst_mat, lbl_mat, metadata_ids, metadata_txt
    

## `__main__`

In [None]:
#| export
def parse_args():
    """Parse the command-line options of the export script from `sys.argv`.

    `--cache_dir`, `--data_dir` and `--review_type` are required. `--key` defaults to
    'parent_asin' and `--condition_type` to None. `--no_filter` is a `store_false` flag:
    `args.no_filter` is True unless the flag is given.
    """
    option_specs = (
        ('--cache_dir', dict(type=str, required=True)),
        ('--data_dir', dict(type=str, required=True)),
        ('--key', dict(type=str, default='parent_asin')),
        ('--condition_type', dict(type=str, default=None)),
        ('--review_type', dict(type=str, required=True)),
        ('--no_filter', dict(action='store_false')),
    )
    parser = argparse.ArgumentParser()
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    return parser.parse_args()
    

In [None]:
#| export
if __name__ == '__main__':
    # Script entry point: build the review metadata matrices, save them, report wall time.
    start_time = timer()

    args = parse_args()

    # `args.no_filter` is True by default (store_false flag), so filtering is on unless
    # `--no_filter` is passed.
    trn_mat, tst_mat, lbl_mat, metadata_ids, metadata_txt = get_metadata(
        args.cache_dir,
        args.data_dir,
        meta_type=args.review_type,
        key=args.key,
        condition_type=args.condition_type,
        do_filter=args.no_filter,
    )
    save_metadata(
        args.data_dir, trn_mat, tst_mat, lbl_mat, metadata_ids, metadata_txt,
        f'review_{args.review_type}',
    )

    end_time = timer()
    print(f'Time elapsed: {end_time-start_time:.2f} seconds.')
    

## `LF-AmazonTitles-1.3M`

In [None]:
import pandas as pd

from tqdm.auto import tqdm
from sugar.core import *
from xclib.utils.sparse import retain_topk

In [None]:
# Machine-specific absolute path; rebinds `data_dir` (set earlier in the notebook) to the
# dataset used by all cells below. It ends with '/', so the f-string paths built from it
# contain a double slash.
data_dir = "/home/scai/phd/aiz218323/scratch/datasets/benchmarks/(mapped)LF-AmazonTitles-1.3M/"

In [None]:
# Raw review texts; `get_text_with_top_review` reads this notebook-level name and indexes
# its 'text' column by position.
reviews = load_raw_file(f'{data_dir}/raw_data/review_title_text.raw.csv')

In [None]:
# Raw description texts; `get_text_with_top_review` reads this notebook-level name and
# indexes its 'text' column by position.
descriptions = load_raw_file(f'{data_dir}/raw_data/description.raw.csv')

In [None]:
# Replace missing values with '' in place so the texts can be joined as strings later.
# NOTE(review): presumably `reviews` is a pandas DataFrame — confirm against `load_raw_file`.
reviews.fillna('', inplace=True)

In [None]:
# Replace missing values with '' in place so the texts can be joined as strings later.
# NOTE(review): presumably `descriptions` is a pandas DataFrame — confirm against `load_raw_file`.
descriptions.fillna('', inplace=True)

In [None]:
def get_text_with_top_review(review_mat_file, description_mat_file, title_file,
                             review_texts=None, description_texts=None):
    """Assemble one row per item with its title, its descriptions and its retained reviews.

    Args:
        review_mat_file: path of an `.npz` sparse matrix (rows: items, columns: reviews);
            it is passed through `retain_topk(..., k=3)` before use.
        description_mat_file: path of an `.npz` sparse matrix (rows: items, columns:
            descriptions); used as loaded.
        title_file: raw file read with `load_raw_file` and unpacked as `(ids, title)`.
        review_texts: table whose 'text' column is indexed by position with the column
            indices of the review matrix. Defaults to the notebook-level `reviews`.
        description_texts: same for the description matrix. Defaults to the notebook-level
            `descriptions`.

    Returns:
        DataFrame with the columns 'identifier', 'title', 'description' and 'reviews'.
        The texts of a row are joined with ' || ', ordered by decreasing matrix value;
        a row without entries yields ''.
    """
    # Fall back to the notebook globals so existing three-argument calls keep working, while
    # letting callers pass the tables explicitly instead of relying on hidden kernel state.
    review_texts = reviews if review_texts is None else review_texts
    description_texts = descriptions if description_texts is None else description_texts

    review_mat = retain_topk(sp.load_npz(review_mat_file), k=3)
    description_mat = sp.load_npz(description_mat_file)

    ids, title = load_raw_file(title_file)

    def collate_metadata(mat, texts):
        # Look the column up once instead of once per matrix entry.
        text_column = texts['text']
        collated = []
        for row in tqdm(mat, total=mat.shape[0]):
            # Entries of the row ordered from the largest stored value to the smallest.
            order = np.argsort(row.data)[::-1]
            collated.append(' || '.join(text_column.iloc[row.indices[order]]))
        return collated

    data_reviews = collate_metadata(review_mat, review_texts)
    data_descriptions = collate_metadata(description_mat, description_texts)

    df = pd.DataFrame({'identifier':ids, 'title':title, 'description':data_descriptions, 'reviews':data_reviews})
    return df
    

### Train

In [None]:
# Train split: review matrix, description matrix and raw titles, all located under `data_dir`.
review_mat_file = f'{data_dir}/review_title_text_trn_X_Y.npz'
description_mat_file = f'{data_dir}/description_trn_X_Y.npz'
title_file = f'{data_dir}/raw_data/train.raw.txt'

# Relies on the notebook-level `reviews` and `descriptions` loaded above.
trn_df = get_text_with_top_review(review_mat_file, description_mat_file, title_file)

  0%|          | 0/2248619 [00:00<?, ?it/s]

  0%|          | 0/2248619 [00:00<?, ?it/s]

In [None]:
# Preview the collated train table.
trn_df

Unnamed: 0,identifier,title,description,reviews
0,0000031909,Girls Ballet Tutu Neon Pink,,Grandchildren love these! I purchased these fo...
1,0000032034,Adult Ballet Tutu Yellow,,
2,0000913154,The Way Things Work: An Illustrated Encycloped...,,Five Stars Book is awsome || Great if you want...
3,0001360000,Mog's Kittens,Judith Kerr's bestselling adventures of that ...,Pretty kittens! In a tearful heartwarming stor...
4,0001381245,Misty of Chincoteague,,Gift Birthday gift for a friend. She loved it ...
...,...,...,...,...
2248614,B00LV5XHIK,Enhance your gaming experience with these simp...,The Ortz analog thumb grip stick covers not on...,I threw these on all 4 of my controllers and t...
2248615,B00LV8PDF2,Charge both of your PS4 controllers simulatani...,,Five Stars excellent || Five Stars Excellent |...
2248616,B00LVHLRZ8,Apache Paracord Type III 7 Strand 550 Paracord...,,
2248617,BT008G9O8G,Cont Removable Paper Label,,Very NIce Transaction Even though I recieved a...


In [None]:
# Persist the train table without the row index; it is read back in the `review-description` section.
trn_df.to_csv(f'{data_dir}/raw_data/train_review-description.csv', index=False)

### Test

In [None]:
# Test split: review matrix, description matrix and raw titles, all located under `data_dir`.
review_mat_file = f'{data_dir}/review_title_text_tst_X_Y.npz'
description_mat_file = f'{data_dir}/description_tst_X_Y.npz'
title_file = f'{data_dir}/raw_data/test.raw.txt'

# Relies on the notebook-level `reviews` and `descriptions` loaded above.
tst_df = get_text_with_top_review(review_mat_file, description_mat_file, title_file)

  0%|          | 0/970237 [00:00<?, ?it/s]

  0%|          | 0/970237 [00:00<?, ?it/s]

In [None]:
# Preview the collated test table.
tst_df

Unnamed: 0,identifier,title,description,reviews
0,0000032069,Adult Ballet Tutu Cheetah Pink,3 layers of super-soft polyester tulle can be ...,Awesome Need a costume? Always wanted to wear ...
1,0000589012,Why Don't They Just Quit? DVD Roundtable Discu...,,
2,0000031852,Girls Ballet Tutu Zebra Hot Pink,This tutu is great for dress up play for your ...,This is one of the cutest tu tu ever! I also p...
3,0000032050,Adult Ballet Tutu Purple,,
4,0001203088,Hilda Boswell's Omnibus - A Treasury of Favorites,,"Classic, heirloom book I grew up with this boo..."
...,...,...,...,...
970232,B00LOCL99Y,,,Five Stars Excellent || Five Stars Thanks || F...
970233,B00LNSQIC2,BeWild Brand® - LeBron James Forgiven Welcome ...,,"Awesome shirt Awesome shirt, great price, I ju..."
970234,B00LOLBBQQ,Kanex KTU10 Thunderbolt to eSATA Plus USB 3.0 ...,Easily connect your Thunderbolt-equipped Mac o...,Work as stated... Good speed... || Does what i...
970235,B00LOWJ6JY,,,"Five Stars Honestly, wh-wh-what could you say ..."


In [None]:
# Persist the test table without the row index; it is read back in the `review-description` section.
tst_df.to_csv(f'{data_dir}/raw_data/test_review-description.csv', index=False)

### Label

In [None]:
# Label split: review matrix, description matrix and raw titles, all located under `data_dir`.
review_mat_file = f'{data_dir}/review_title_text_lbl_X_Y.npz'
description_mat_file = f'{data_dir}/description_lbl_X_Y.npz'
title_file = f'{data_dir}/raw_data/label.raw.txt'

# Relies on the notebook-level `reviews` and `descriptions` loaded above.
lbl_df = get_text_with_top_review(review_mat_file, description_mat_file, title_file)

  0%|          | 0/1305265 [00:00<?, ?it/s]

  0%|          | 0/1305265 [00:00<?, ?it/s]

In [None]:
# Preview the collated label table.
lbl_df

Unnamed: 0,identifier,title,description,reviews
0,0000032050,Adult Ballet Tutu Purple,,
1,B00D0DJAEG,Adult Ballet Tutu Pastel Rainbow,3 layers of super-soft polyester tulle can be ...,Good fit Bought this for my mom for Halloween....
2,B00D0F450I,"Adult Ballet Tutu Black, one size fit most",Dance tutu for teenagers / adults. Perfect for...,Five Stars Great product. Loved it || Five Sta...
3,B00D2JTMS2,Adult Tutu Assorted Colors (Hot Pink),3 layers of super-soft polyester tulle can be ...,Five Stars Great color || good product. Fast s...
4,B00D0FDUAY,Adult Ballet Tutu Red,,
...,...,...,...,...
1305260,B00KIY40CW,1pc Large L Grey Replacement Band With Clasp f...,,
1305261,B00I3XWRT8,EOZY Dazzing Clear Crystal CZ White Love Heart...,,EOZY 32*20mm Dazzing Clear Crystal CZ White Lo...
1305262,B00KRO1OZ4,The Grandfather | Funny Father's Day Grandpa G...,,Grandpa Loves It. My Dad proudly wears this sh...
1305263,B00L89DVV2,TriFly® Kids 3-in-1 TriFly Toddler Scooter Wit...,,


In [None]:
# Persist the label table without the row index; it is read back in the `review-description` section.
lbl_df.to_csv(f'{data_dir}/raw_data/label_review-description.csv', index=False)

### `review-description`

In [None]:
from sugar.core import *

In [None]:
# Read every column as text and disable NaN inference: default type inference can turn
# all-digit identifiers (e.g. '0000031909') into integers, dropping their leading zeros,
# and would turn literal texts such as 'NA' or 'null' into missing values. Empty fields
# are read back as ''.
trn_df = pd.read_csv(f'{data_dir}/raw_data/train_review-description.csv', dtype=str, keep_default_na=False)

In [None]:
# Read every column as text and disable NaN inference: default type inference can turn
# all-digit identifiers (e.g. '0000032069') into integers, dropping their leading zeros,
# and would turn literal texts such as 'NA' or 'null' into missing values. Empty fields
# are read back as ''.
tst_df = pd.read_csv(f'{data_dir}/raw_data/test_review-description.csv', dtype=str, keep_default_na=False)

In [None]:
# Read every column as text and disable NaN inference: default type inference can turn
# all-digit identifiers (e.g. '0000032050') into integers, dropping their leading zeros,
# and would turn literal texts such as 'NA' or 'null' into missing values. Empty fields
# are read back as ''.
lbl_df = pd.read_csv(f'{data_dir}/raw_data/label_review-description.csv', dtype=str, keep_default_na=False)

In [None]:
# Replace missing values with '' so the columns can be concatenated as plain text below.
trn_df = trn_df.fillna('')
tst_df = tst_df.fillna('')
lbl_df = lbl_df.fillna('')

In [None]:
# One text per train item: title, description and reviews separated by single spaces.
ids = trn_df['identifier']
raw = [
    f'{title} {description} {review}'
    for title, description, review in zip(trn_df['title'], trn_df['description'], trn_df['reviews'])
]

In [None]:
# Write the train identifiers and their combined texts (`ids` and `raw` from the previous cell).
save_raw_file(f'{data_dir}/raw_data/train.review_description.csv', ids, raw)

In [None]:
# One text per test item: title, description and reviews separated by single spaces.
ids = tst_df['identifier']
raw = [
    f'{title} {description} {review}'
    for title, description, review in zip(tst_df['title'], tst_df['description'], tst_df['reviews'])
]
save_raw_file(f'{data_dir}/raw_data/test.review_description.csv', ids, raw)

In [None]:
# One text per label item: title, description and reviews separated by single spaces.
ids = lbl_df['identifier']
raw = [
    f'{title} {description} {review}'
    for title, description, review in zip(lbl_df['title'], lbl_df['description'], lbl_df['reviews'])
]
save_raw_file(f'{data_dir}/raw_data/label.review_description.csv', ids, raw)