# Amazon Review 2023

> https://amazon-reviews-2023.github.io/

In [1]:
#| default_exp 02_amazon-review-2023

In [39]:
#| hide
from nbdev.showdoc import *
import nbdev; nbdev.nbdev_export()

In [93]:
from datasets import load_dataset
from transformers import AutoTokenizer

In [43]:
#| export
import requests, os, gzip, json, scipy.sparse as sp, numpy as np, argparse, pandas as pd, multiprocessing as mp
from tqdm.auto import tqdm
from huggingface_hub import hf_hub_download

In [5]:
#| export
def get_categories(data_dir):
    """Read the dataset category names stored under `data_dir`.

    The names are read from `<data_dir>/all_categories.txt`, one name per line.

    Args:
        data_dir: Directory containing `all_categories.txt`.

    Returns:
        list[str]: Category names in file order, with surrounding whitespace
        stripped and blank lines dropped.

    Raises:
        OSError: If `all_categories.txt` cannot be opened.
    """
    fname = os.path.join(data_dir, 'all_categories.txt')
    with open(fname, 'r', encoding='utf-8') as f:
        # Do not assume a trailing newline: slicing off the last element of a
        # '\n' split loses the final category when the file lacks one.
        return [line.strip() for line in f if line.strip()]
    

In [82]:
# Directory handed to get_categories, which reads all_categories.txt from it.
# NOTE(review): machine-specific absolute path — adjust for your environment.
cache_dir = '/scratch/scai/phd/aiz218323/Projects/sugar/data/amazon_review_2023/cache'

In [14]:
# Load the category names listed in <cache_dir>/all_categories.txt.
categories = get_categories(cache_dir)

## EDA

In [75]:
# Pick a category by position from the `categories` list loaded by
# get_categories; no cell defines a name `all_categories`.
remote_name = categories[1]

In [77]:
# Fix the category explicitly for the exploration cells that follow.
remote_name = 'Electronics'

In [78]:
# Load the "full" split of the raw item-metadata config for the selected category.
# NOTE(review): trust_remote_code=True presumably allows the dataset repository's
# loading script to execute locally — confirm this is intended.
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", f"raw_meta_{remote_name}", split="full", trust_remote_code=True)

Downloading data:   0%|          | 0.00/5.25G [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

In [79]:
# Scan for the first record whose 'bought_together' field is populated.
# After the loop `data` is that record, or the last record if none matched.
for data in tqdm(dataset):
    if data['bought_together'] is not None: break

  0%|          | 0/1610012 [00:00<?, ?it/s]

In [80]:
# Display one record to inspect the available metadata fields.
dataset[10]

{'main_category': 'Computers',
 'title': 'Network Magic 4.0',
 'average_rating': 3.3,
 'rating_number': 14,
 'features': ['Connect, Manage, and Secure Your Network',
  'Share printers and files between all of your computers',
  'Connect and repair your network and internet connection',
  'View a live map of network; protect wireless network from intruders',
  'Print from any computer; monitor Internet use and Web sites visited'],
 'description': ['Product description',
  'You try to connect your computers and devices together to share a printer, files and maintain an internet connection, but find yourself wasting hours sorting through confusing dialog boxes. It is time to stop with the hassles and let Network Magic do the work for you! With Network Magic you can easily 1) connect, manage, and secure your network; 2) share printers and files between all of your computers; 3) monitor Internet use and web sites visited; 4) connect and repair your network and internet connection; and 5) Pr

## Construct dataset

In [51]:
#| export
def construct_dataset(files):
    """Build item metadata and a sparse "bought together" matrix from JSONL files.

    Every non-blank line of every file is parsed as one JSON item record.
    Records with a non-empty `parent_asin` and `title` are stored in
    `item_info`; those that also carry a non-null `bought_together` list
    contribute one row to the matrix (only the first time an identifier is seen).

    Args:
        files: Iterable of paths to UTF-8 encoded JSON-lines metadata files.

    Returns:
        tuple: `(item_info, item2row, item2col, matrix)` where
            - `item_info` maps `parent_asin` to a dict with the keys
              `short_text`, `full_text`, `category`, `store` and `details`;
            - `item2row` maps `parent_asin` to its row index in `matrix`;
            - `item2col` maps each `bought_together` entry to its column index;
            - `matrix` is a float32 `scipy.sparse.csr_matrix` of shape
              `(len(item2row), len(item2col))` holding one stored entry of
              value 1 per listed (item, bought-together entry) pair.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    item_info, item2row, item2col = {}, {}, {}
    data, indices, indptr = [], [], [0]

    for file_idx, fname in enumerate(files):
        with open(fname, 'r', encoding='utf-8') as f:
            # Stream the file line by line instead of parsing every record into
            # a list first; tqdm closes the bar once the iterator is exhausted.
            for line in tqdm(f, unit='items', desc=f'File {file_idx+1}'):
                if not line.strip():
                    continue  # tolerate blank lines
                item = json.loads(line)

                identifier = item.get('parent_asin')
                short_text = item.get('title')
                if not identifier or not short_text:
                    continue  # records without an id or a title are skipped

                # Prefer the description; fall back to the feature list when the
                # description is missing, None or empty.
                text_parts = item.get('description') or item.get('features') or []
                full_text = ''.join(text_parts)

                item_info[identifier] = {'short_text': short_text, 'full_text': full_text,
                                         'category': item.get('categories'),
                                         'store': item.get('store'), 'details': item.get('details')}

                bought_together = item.get('bought_together')
                if bought_together is not None and identifier not in item2row:
                    item2row[identifier] = len(item2row)
                    data.extend([1] * len(bought_together))
                    # setdefault assigns the next free column index to unseen entries.
                    indices.extend(item2col.setdefault(o, len(item2col)) for o in bought_together)
                    indptr.append(len(indices))

    shape = (len(item2row), len(item2col))
    # float16 is not among the dtypes scipy.sparse supports; use float32 instead.
    matrix = sp.csr_matrix((data, indices, indptr), shape=shape, dtype=np.float32)
    return item_info, item2row, item2col, matrix
    

In [33]:
#|export 
def download_amazon_dataset(data_dir):
    """Download the per-category raw metadata files of Amazon Reviews 2023.

    For every category returned by `get_categories(data_dir)`, the file
    `raw/meta_categories/meta_<category>.jsonl` is fetched from the
    `McAuley-Lab/Amazon-Reviews-2023` dataset repository.

    Args:
        data_dir: Directory that contains `all_categories.txt` and that is
            passed to `hf_hub_download` as `local_dir`.
    """
    # Read the categories from `data_dir` rather than from a global `args`,
    # which only exists when the module is executed as a script.
    categories = get_categories(data_dir)
    for category in tqdm(categories):
        hf_hub_download(repo_id='McAuley-Lab/Amazon-Reviews-2023', filename=f'raw/meta_categories/meta_{category}.jsonl',
                        repo_type="dataset", local_dir=data_dir)
        

In [34]:
#| export
def parse_args():
    """Parse the command-line options of the dataset construction script.

    Both `--cache_dir` and `--output_dir` are mandatory string options.

    Returns:
        argparse.Namespace: Namespace with the attributes `cache_dir` and
        `output_dir`.
    """
    cli = argparse.ArgumentParser()
    for flag in ('--cache_dir', '--output_dir'):
        cli.add_argument(flag, type=str, required=True)
    return cli.parse_args()
    

In [35]:
# Notebook-side paths. NOTE(review): these presumably mirror the --cache_dir and
# --output_dir script arguments; they are machine-specific absolute paths.
cache_dir = '/scratch/scai/phd/aiz218323/Projects/sugar/data/amazon_review_2023/cache'
output_dir = '/scratch/scai/phd/aiz218323/Projects/sugar/data/amazon_review_2023'

In [50]:
#| export
from sugar.amazon_review_2018 import clean_dataset, save_dataset

In [113]:
#| export
if __name__ == '__main__':
    # Script entry point: build the dataset from the cached JSONL metadata files,
    # pass it through clean_dataset and save_dataset, and report the elapsed time.
    from timeit import default_timer as timer
    start_time = timer()

    args = parse_args()

    meta_dir = os.path.join(args.cache_dir, 'raw', 'meta_categories')
    # os.listdir returns entries in arbitrary order; sort them so the row/column
    # indices assigned by construct_dataset are reproducible across runs.
    files = [os.path.join(meta_dir, file) for file in sorted(os.listdir(meta_dir)) if file.endswith('.jsonl')]
    item_info, item2row, item2col, matrix = construct_dataset(files)

    valid_matrix, valid_item2row, valid_item2col = clean_dataset(matrix, item2row, item2col, item_info)

    save_dataset(args.output_dir, valid_matrix, valid_item2row, valid_item2col, item_info)

    end_time = timer()
    print(f'Time elapsed: {end_time-start_time:.2f} seconds.')
        

'/scratch/scai/phd/aiz218323/Projects/sugar/data/amazon_review_2023/cache/raw/meta_categories/meta_All_Beauty.jsonl'

In [26]:
# Parse the first metadata file into a list of records for inspection.
# The encoding is given explicitly to match how construct_dataset reads these files.
with open(files[0], 'r', encoding='utf-8') as f:
    items = [json.loads(l) for l in f]

In [27]:
# Display the first parsed record to inspect its fields.
items[0]

{'main_category': 'Grocery',
 'title': 'Dark Roast Pure Coffee',
 'average_rating': 4.7,
 'rating_number': 9,
 'features': [],
 'description': [],
 'price': None,
 'images': [{'thumb': 'https://m.media-amazon.com/images/I/51LY15EJoFL._SX38_SY50_CR,0,0,38,50_.jpg',
   'large': 'https://m.media-amazon.com/images/I/51LY15EJoFL.jpg',
   'variant': 'MAIN',
   'hi_res': 'https://m.media-amazon.com/images/I/81ucKSAeinL._SL1500_.jpg'}],
 'videos': [{'title': 'YouCut_20220Cafe Du Monde Coffee Chicory, 15 Ounce Ground coffee',
   'url': 'https://www.amazon.com/vdp/08a1120516ab4b2184f486f940c0de45?ref=dp_vse_rvc_0',
   'user_id': '/shop/influencer-d62c132e'},
  {'title': 'CDM New Orleans',
   'url': 'https://www.amazon.com/vdp/0ab93bf7ed5646f68d1fa8f00cd30160?ref=dp_vse_rvc_1',
   'user_id': ''}],
 'store': 'Luzianne',
 'categories': ['Grocery & Gourmet Food', 'Beverages', 'Coffee'],
 'details': {'Brand': 'Luzianne',
  'Item Form': 'Ground',
  'Caffeine Content': 'caffeinated',
  'Roast Level': '