# Amazon Review 2018

> https://cseweb.ucsd.edu/~jmcauley/datasets/amazon_v2/

- run `sugar/process_amazon2018_reviews.py` to create the reviews dataset.

In [5]:
#| default_exp amazon_helper

In [4]:
#| hide
from nbdev.showdoc import *
import nbdev; nbdev.nbdev_export()

In [3]:
#| export
import requests, os, gzip, json, scipy.sparse as sp, numpy as np, argparse, pandas as pd, multiprocessing as mp
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from tqdm.auto import tqdm

## Download

In [13]:
def get_urls(url):
    response = requests.get(url)
    assert response.status_code == 200, f'Invalid url: {url}'
    soup = BeautifulSoup(response.text, 'html.parser')
    return [link.get('href') for link in soup.find_all('a') if link.get('href').endswith('.gz')]

def download(url, fname):
    file_response = requests.get(url, stream=True)
    with open(fname, 'wb') as f:
        for chunk in file_response.iter_content(chunk_size=1024):
            f.write(chunk)

def download_amazon_dataset(url, data_dir):
    os.makedirs(data_dir, exist_ok=True)
    file_links = get_urls(url)
    for link in tqdm(file_links):
        furl, fname = urljoin(url, link), os.path.join(data_dir, link)
        download(furl, fname)


In [21]:
url = 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/metaFiles2/'
cache_dir = '/scratch/scai/phd/aiz218323/Projects/sugar/data/amazon_review_2018/cache/products'

In [11]:
url = 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFiles/'
cache_dir = '/scratch/scai/phd/aiz218323/Projects/sugar/data/amazon_review_2018/cache/reviews'

In [12]:
output_dir = '/scratch/scai/phd/aiz218323/Projects/sugar/data/amazon_review_2018'

In [8]:
download_amazon_dataset(url, cache_dir)

## Delete invalid rows and columns

In [45]:
#| export
def remove_invalid_lbl(item2col, item_info):
    invalid_lbl, valid_item2col = [], {}
    for o in item2col:
        if o in item_info: 
            valid_item2col[o] = len(valid_item2col)
        else: 
            invalid_lbl.append(item2col[o])
    return valid_item2col, invalid_lbl

def remove_invalid_data(item2row, matrix):
    invalid_data, valid_item2row = [], {}
    valid_data_mask = matrix.getnnz(axis=1) > 0
    
    for o,m in zip(item2row, valid_data_mask):
        if m: valid_item2row[o] = len(valid_item2row)
        else: invalid_data.append(item2row[o])
            
    return valid_item2row, invalid_data
    

In [46]:
#| export
def delete_rows(matrix, invalid_idx):
    num_lbl = matrix.shape[1]
    
    prev_idx, num_deleted_items = 0, 0
    data, indices, indptr = [], [], []
    m_data, m_indices, m_indptr = matrix.data, matrix.indices, matrix.indptr
    
    for idx in invalid_idx:
        start, end = m_indptr[prev_idx], m_indptr[idx]

        if end > start:
            data.append(m_data[start:end])
            indices.append(m_indices[start:end])
        indptr.append(m_indptr[prev_idx:idx] - num_deleted_items)
        
        prev_idx = idx+1
        num_deleted_items += m_indptr[idx+1] - m_indptr[idx]

    start = m_indptr[prev_idx]
    data.append(m_data[start:])
    indices.append(m_indices[start:])
    indptr.append(m_indptr[prev_idx:] - num_deleted_items)

    data, indices, indptr = np.hstack(data), np.hstack(indices), np.hstack(indptr)
    num_data = len(indptr)-1
    
    return sp.csr_matrix((data, indices, indptr), shape=(num_data, num_lbl), dtype=matrix.dtype)
    

In [47]:
#| export
def clean_dataset(matrix, item2row, item2col, item_info):
    valid_item2col, invalid_lbl = remove_invalid_lbl(item2col, item_info)
    valid_matrix_t = delete_rows(matrix.transpose().tocsr(), invalid_lbl)
    valid_matrix = valid_matrix_t.transpose().tocsr()

    valid_item2row, invalid_data = remove_invalid_data(item2row, valid_matrix)
    valid_matrix = delete_rows(valid_matrix, invalid_data)
    
    return valid_matrix, valid_item2row, valid_item2col
    

## Sequentially constructing `data_lbl`

> Item inserted if it has `identifier` and `title`.

In [14]:
#| export
def construct_dataset(files):
    item_info, item2row, item2col = {}, {}, {}
    data, indices, indptr = [], [], [0]

    for file_idx,fname in enumerate(files):
        with gzip.open(fname, 'rt', encoding='utf-8') as f:
            items = [json.loads(d) for d in f]

        progress_bar = None
        for item in items:
            if progress_bar is None:
                progress_bar = tqdm(total=len(items), unit='items', desc=f'File {file_idx+1}')
            progress_bar.update(1)
            
            identifier = item['asin'] 
            short_text = item['title'] if 'title' in item else None
            full_text = ''
            if 'description' in item:
                full_text += ''.join(item['description'])
            elif 'feature' in item:
                full_text += ''.join(item['feature'])

            category = item['category'] if 'category' in item else None
            also_view = item['also_view'] if 'also_view' in item else None
            brand = item['brand'] if 'brand' in item else None
            similar_item = item['similar_item'] if 'similar_item' in item else None
            
            if identifier and len(identifier) > 0 and short_text and len(short_text) > 0:
                if identifier not in item_info:
                    item_info[identifier] = {'short_text': short_text, 'full_text': full_text, 'category': category, 
                                             'also_view': also_view, 'brand': brand, 'similar_item': similar_item}
                    
                    if ('also_buy' in item) and (identifier not in item2row):
                        item2row.setdefault(identifier, len(item2row))
                        data.extend([1] * len(item['also_buy']))
                        indices.extend([item2col.setdefault(o, len(item2col)) for o in item['also_buy']])
                        indptr.append(len(indices))
                
    matrix = sp.csr_matrix((data, indices, indptr), dtype=np.float16)
    return item_info, item2row, item2col, matrix
        

In [48]:
#| export
def save_dataset(output_dir, matrix, item2row, item2col, item_info):
    data_lbl = f'{output_dir}/data_lbl.npz'
    sp.save_npz(data_lbl, matrix)

    data_info = pd.DataFrame({'identifier':list(item2row), 'text':[item_info[o]['short_text'] for o in item2row]})
    data_info.to_csv(f'{output_dir}/data_info.csv', index=False)

    lbl_info = pd.DataFrame({'identifier':list(item2col), 'text':[item_info[o]['short_text'] for o in item2col]})
    lbl_info.to_csv(f'{output_dir}/lbl_info.csv', index=False)

    with open(f'{output_dir}/item_info.json', 'w') as f:
        json.dump(item_info, f)
    

In [49]:
#| export
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_url', type=str, required=True)
    parser.add_argument('--cache_dir', type=str, required=True)
    parser.add_argument('--output_dir', type=str, required=True)
    return parser.parse_args()
    

### Run

In [50]:
files = [f'{cache_dir}/{file}' for file in os.listdir(cache_dir) if file.endswith('.gz')][3:5]
item_info_1, item2row_1, item2col_1, matrix_1 = construct_dataset(files)

File 1:   0%|          | 0/933 [00:00<?, ?items/s]

File 2:   0%|          | 0/120310 [00:00<?, ?items/s]

In [803]:
matrix_1.astype(np.float32).todense()

matrix([[1., 1., 0., 0., 0.],
        [0., 0., 1., 1., 1.],
        [0., 0., 1., 0., 0.],
        [1., 1., 1., 1., 0.],
        [1., 0., 0., 0., 1.]], dtype=float32)

In [804]:
item_info_1, item2row_1, item2col_1

({'a': {'short_text': 'a', 'full_text': ''},
  'b': {'short_text': 'b', 'full_text': ''},
  'c': {'short_text': 'c', 'full_text': ''},
  'd': {'short_text': 'd', 'full_text': ''},
  'e': {'short_text': 'e', 'full_text': ''}},
 {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4},
 {'a': 0, 'd': 1, 'b': 2, 'c': 3, 'f': 4})

In [805]:
np.array(output)[:, [0, 3, 1, 2]]

array([[1, 1, 0, 0],
       [0, 0, 1, 1],
       [0, 0, 1, 0],
       [1, 1, 1, 1]])

In [755]:
item_info_1, item2row_1, item2col_1, matrix_1 = construct_dataset(files)

File 1:   0%|          | 0/808 [00:00<?, ?items/s]

File 2:   0%|          | 0/839 [00:00<?, ?items/s]

File 3:   0%|          | 0/994 [00:00<?, ?items/s]

In [21]:
valid_matrix, valid_item2row, valid_item2col = clean_dataset(matrix_1, item2row_1, item2col_1, item_info_1)

In [807]:
valid_item2row, valid_item2col

({'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4}, {'a': 0, 'd': 1, 'b': 2, 'c': 3})

In [808]:
valid_matrix.todense()

matrix([[1., 1., 0., 0.],
        [0., 0., 1., 1.],
        [0., 0., 1., 0.],
        [1., 1., 1., 1.],
        [1., 0., 0., 0.]], dtype=float32)

In [35]:
save_dataset(output_dir, valid_matrix, valid_item2row, valid_item2col, item_info_1)

In [292]:
valid_item2col, invalid_lbl = remove_invalid_lbl(item2col, item_info)

valid_matrix_t = delete_rows(matrix.transpose().tocsr(), invalid_lbl)
valid_matrix = valid_matrix_t.transpose().tocsr()

In [293]:
valid_item2row, invalid_data = remove_invalid_data(item2row, valid_matrix)

valid_matrix = delete_rows(valid_matrix, invalid_data)

In [296]:
data_lbl = f'{output_dir}/data_lbl.npz'
sp.save_npz(data_lbl, valid_matrix)

In [297]:
data_info = pd.DataFrame({'identifier':list(valid_item2row), 'text':[item_info[o]['short_text'] for o in valid_item2row]})
data_info.to_csv(f'{output_dir}/data_info.csv', index=False)

In [298]:
lbl_info = pd.DataFrame({'identifier':list(valid_item2col), 'text':[item_info[o]['short_text'] for o in valid_item2col]})
lbl_info.to_csv(f'{output_dir}/lbl_info.csv', index=False)

## Multi-processing 1

In [18]:
def construct_chunk_matrix(items):
    item_info, item2row, item2col = {}, {}, {}
    data, indices, indptr = [], [], [0]
    
    for item in items:
        identifier = item['asin'] 
        short_text = item['title'] if 'title' in item else None
        full_text = ''
        if 'description' in item:
            full_text += ''.join(item['description'])
        elif 'feature' in item:
            full_text += ''.join(item['feature'])
        
        if identifier and len(identifier) > 0 and short_text and len(short_text) > 0:
            item_info[identifier] = {'short_text': short_text, 'full_text': full_text}
            
            if ('also_buy' in item) and (identifier not in item2row):
                item2row.setdefault(identifier, len(item2row))
                data.extend([1] * len(item['also_buy']))
                indices.extend([item2col.setdefault(o, len(item2col)) for o in item['also_buy']])
                indptr.append(len(indices))
                
    matrix = sp.csr_matrix((data, indices, indptr), shape=(len(item2row), len(item2col)), dtype=np.float16)
    return item_info, item2row, item2col, matrix
    

In [19]:
def remove_invalid_rows(item2row, matrix, invalid_idx):
    valid_item2row = {}
    
    ctr = 0
    for o,i in item2row.items():
        if ctr < len(invalid_idx) and i == invalid_idx[ctr]:
            ctr += 1
        else:
            valid_item2row[o] = len(valid_item2row)

    valid_matrix = delete_rows(matrix, invalid_idx)
    return valid_item2row, valid_matrix
        

In [544]:
files = [f'{cache_dir}/{file}' for file in os.listdir(cache_dir) if file.endswith('.gz')]

In [613]:
files = files[1:2]

In [547]:
for file_idx,fname in enumerate(files):
    with gzip.open(fname, 'rt', encoding='utf-8') as f:
        items = [json.loads(d) for d in f]
    break
        

In [600]:
chunk_size = 10_000
chunks = [construct_chunk_matrix(items[i:i+chunk_size]) for i in range(0, len(items), chunk_size)]

In [569]:
item_info, item2row, item2col, matrix = chunks[0]

In [570]:
for chunk in chunks:
    info, row, col, mat = chunk
    
    offset = len(item2row)
    invalid_rows = [i for o,i in row.items() if o in item2row]
    valid_row, valid_mat = remove_invalid_rows(row, mat, invalid_rows)
    valid_row = {k:v+offset for k,v in valid_row.items()}

    offset = len(item2col)
    col_remap = {i:item2col[o] if o in item2col else i+offset for o,i in col.items()}
    valid_col = {o:item2col[o] if o in item2col else i+offset for o,i in col.items()}

    valid_indices = [col_remap[i] for i in valid_mat.indices]
    r,c = valid_mat.shape[0], max(v for v in col_remap.values())
    valid_matrix = sp.csr_matrix((valid_mat.data, valid_indices, valid_mat.indptr), shape=(r,c), dtype=valid_mat.dtype)

    matrix.resize(matrix.shape[0], valid_matrix.shape[1])

    item_info.update(info)
    item2row.update(valid_row)
    item2col.update(valid_col)
    matrix = sp.vstack([matrix, valid_matrix])
    

In [571]:
matrix

<32487x70383 sparse matrix of type '<class 'numpy.float16'>'
	with 102955 stored elements in Compressed Sparse Row format>

In [20]:
def construct_dataset(files, chunk_size=10_000):
    nprocs = max(1, os.cpu_count()//2)
    with mp.Pool(nprocs) as pool:
        
        item_info, item2row, item2col, matrix = None, None, None, None

        for file_idx,fname in enumerate(files):
            with gzip.open(fname, 'rt', encoding='utf-8') as f:
                items = [json.loads(d) for d in f]
                chunks = [items[i:i+chunk_size] for i in range(0, len(items), chunk_size)]

            progress_bar = tqdm(total=len(items), unit="items", desc=f"File {file_idx}")
            
            for chunk in pool.imap(construct_chunk_matrix, chunks, chunksize=2):
                progress_bar.update(chunk_size)
                
                if item_info is None:
                    item_info, item2row, item2col, matrix = chunk
                    continue
                    
                info, row, col, mat = chunk
                
                offset = len(item2row)
                invalid_rows = [i for o,i in row.items() if o in item2row]
                valid_row, valid_mat = remove_invalid_rows(row, mat, invalid_rows)
                valid_row = {k:v+offset for k,v in valid_row.items()}
            
                new_cols, offset = 0, len(item2col)
                col_remap, valid_col = {}, {}
                for o,i in col.items():
                    if o in item2col:
                        valid_col[o] = col_remap[i] = item2col[o]
                    else:
                        valid_col[o] = col_remap[i] = new_cols+offset
                        new_cols += 1
                    
                valid_indices = [col_remap[i] for i in valid_mat.indices]
                r,c = valid_mat.shape[0], offset+new_cols
                valid_matrix = sp.csr_matrix((valid_mat.data, valid_indices, valid_mat.indptr), shape=(r,c), dtype=valid_mat.dtype)
            
                matrix.resize(matrix.shape[0], valid_matrix.shape[1])
            
                item_info.update(info)
                item2row.update(valid_row)
                item2col.update(valid_col)
                matrix = sp.vstack([matrix, valid_matrix])
                
    return item_info, item2row, item2col, matrix
                

### Run

In [780]:
files = [f'{cache_dir}/{file}' for file in os.listdir(cache_dir) if file.endswith('.gz')]
item_info_2, item2row_2, item2col_2, matrix_2 = construct_dataset(files)

File 0:   0%|          | 0/933 [00:00<?, ?items/s]

File 1:   0%|          | 0/32892 [00:00<?, ?items/s]

File 2:   0%|          | 0/186637 [00:00<?, ?items/s]

In [737]:
item_info

{'a': {'short_text': 'a', 'full_text': ''},
 'b': {'short_text': 'b', 'full_text': ''},
 'c': {'short_text': 'c', 'full_text': ''},
 'd': {'short_text': 'd', 'full_text': ''}}

In [738]:
item2row

{'a': 0, 'b': 1, 'c': 2, 'd': 3}

In [739]:
matrix.astype(np.float32).todense()

matrix([[1., 1., 0., 0.],
        [0., 0., 1., 1.],
        [0., 0., 1., 0.],
        [1., 1., 1., 1.]], dtype=float32)

In [740]:
np.array(output)[:, [0, 3, 1, 2]]

array([[1, 1, 0, 0],
       [0, 0, 1, 1],
       [0, 0, 1, 0],
       [1, 1, 1, 1]])

In [761]:
item_info_2, item2row_2, item2col_2, matrix_2 = construct_dataset(files)

File 0:   0%|          | 0/808 [00:00<?, ?items/s]

File 1:   0%|          | 0/839 [00:00<?, ?items/s]

File 2:   0%|          | 0/994 [00:00<?, ?items/s]

In [781]:
np.all(matrix_1.data == matrix_2.data), np.all(matrix_1.indices == matrix_2.indices), np.all(matrix_1.indptr == matrix_2.indptr)

(True, True, True)

In [782]:
item_info_1 == item_info_2, item2row_1 == item2row_2, item2col_1 == item2col_2

(True, True, True)

In [744]:
def get_examples(max_items=1_000, max_indices=30, num_examples=3, seed=None):
    if seed is not None:
        np.random.seed(seed)
        
    examples = []
    for _ in range(num_examples):
        items = []
        for _ in range(np.random.randint(int(0.8 * max_items), max_items)):
            title = ''.join(map(chr, np.random.randint(97, 123, size=3)))
            also_buy = list(map(chr, np.random.randint(97, 123, size=np.random.randint(2, max_indices))))
            item = {'asin': title, 'title': title, 'also_buy': also_buy}
            items.append(item)
        examples.append(items)
    return examples
    

In [751]:
examples = get_examples(seed=100)

In [18]:
output = [[1, 0, 0, 1], [0, 1, 1, 0], [0, 1, 0, 0], [1, 1, 1, 1]]

In [19]:
examples = [[ {'asin': 'a', 'title': 'a', 'also_buy': ['a', 'd']}, {'asin': 'b', 'title': 'b', 'also_buy': ['b', 'c']}],
            [{'asin': 'c', 'title': 'c', 'also_buy': ['b']},{'asin': 'd', 'title': 'd', 'also_buy': ['a', 'b', 'c', 'd']}]]

In [None]:
examples = [[ {'asin': 'a', 'title': 'a', 'also_buy': ['a', 'd']}, {'asin': 'b', 'title': 'b', 'also_buy': ['b', 'c', 'f']}],
                [{'asin': 'c', 'title': 'c', 'also_buy': ['b']},{'asin': 'd', 'title': 'd', 'also_buy': ['a', 'b', 'c', 'd']}],
                [{'asin': 'e', 'title': 'e', 'also_buy': ['f']}]]

## Multi-processing 2

In [71]:
def construct_file_matrix(fname):
    with gzip.open(fname, 'rt', encoding='utf-8') as f:
        items = [json.loads(d) for d in f]
        
    item_info, item2row, item2col = {}, {}, {}
    data, indices, indptr = [], [], [0]
    
    for item in tqdm(items):
        identifier = item['asin'] 
        short_text = item['title'] if 'title' in item else None
        full_text = ''
        if 'description' in item:
            full_text += ''.join(item['description'])
        elif 'feature' in item:
            full_text += ''.join(item['feature'])

        category = item['category'] if 'category' in item else None
        also_view = item['also_view'] if 'also_view' in item else None
        brand = item['brand'] if 'brand' in item else None
        similar_item = item['similar_item'] if 'similar_item' in item else None
        
        if identifier and len(identifier) > 0 and short_text and len(short_text) > 0:
            item_info[identifier] = {'short_text': short_text, 'full_text': full_text, 'category': category, 
                                     'also_view': also_view, 'brand': brand, 'similar_item': similar_item}
            
            if ('also_buy' in item) and (identifier not in item2row):
                item2row.setdefault(identifier, len(item2row))
                data.extend([1] * len(item['also_buy']))
                indices.extend([item2col.setdefault(o, len(item2col)) for o in item['also_buy']])
                indptr.append(len(indices))
                
    matrix = sp.csr_matrix((data, indices, indptr), shape=(len(item2row), len(item2col)), dtype=np.float16)
    return item_info, item2row, item2col, matrix
    

In [72]:
def remove_invalid_rows(item2row, matrix, invalid_idx):
    valid_item2row = {}
    
    ctr = 0
    for o,i in item2row.items():
        if ctr < len(invalid_idx) and i == invalid_idx[ctr]:
            ctr += 1
        else:
            valid_item2row[o] = len(valid_item2row)

    valid_matrix = delete_rows(matrix, invalid_idx)
    return valid_item2row, valid_matrix
    

In [73]:
def construct_dataset(files):
    nprocs = max(1, os.cpu_count()//2)
    with mp.Pool(nprocs) as pool:
        
        item_info, item2row, item2col, matrix = None, None, None, None
        
        progress_bar = tqdm(total=len(files), unit="items")
        
        for chunk in pool.imap(construct_file_matrix, files, chunksize=1):
            
            if item_info is None:
                item_info, item2row, item2col, matrix = chunk
                progress_bar.update(1)
                continue
                
            info, row, col, mat = chunk
            
            offset = len(item2row)
            invalid_rows = [i for o,i in row.items() if o in item2row]
            valid_row, valid_mat = remove_invalid_rows(row, mat, invalid_rows)
            valid_row = {k:v+offset for k,v in valid_row.items()}
        
            new_cols, offset = 0, len(item2col)
            col_remap, valid_col = {}, {}
            for o,i in col.items():
                if o in item2col:
                    valid_col[o] = col_remap[i] = item2col[o]
                else:
                    valid_col[o] = col_remap[i] = new_cols+offset
                    new_cols += 1
                
            valid_indices = [col_remap[i] for i in valid_mat.indices]
            r,c = valid_mat.shape[0], offset+new_cols
            valid_matrix = sp.csr_matrix((valid_mat.data, valid_indices, valid_mat.indptr), shape=(r,c), dtype=valid_mat.dtype)
        
            matrix.resize(matrix.shape[0], valid_matrix.shape[1])
        
            item_info.update(info)
            item2row.update(valid_row)
            item2col.update(valid_col)
            matrix = sp.vstack([matrix, valid_matrix])
            
            progress_bar.update(1)
                
    return item_info, item2row, item2col, matrix
                

### Run

In [74]:
files = [f'{cache_dir}/{file}' for file in os.listdir(cache_dir) if file.endswith('.gz')][3:5]

In [75]:
item_info_3, item2row_3, item2col_3, matrix_3 = construct_dataset(files)

  0%|          | 0/2 [00:00<?, ?items/s]

In [76]:
np.all(matrix_1.data == matrix_3.data), np.all(matrix_1.indices == matrix_3.indices), np.all(matrix_1.indptr == matrix_3.indptr)

(True, True, True)

In [77]:
item_info_1 == item_info_3, item2row_1 == item2row_3, item2col_1 == item2col_3

(True, True, True)

## Metadata

In [890]:
#| export
def construct_meta_dataset(item_info, item2data, item2lbl, metatag):
    item2meta = {}

    def get_metadata(item2xy):
        data, indices, indptr = [], [], [0]
        progress_bar = tqdm(total=len(item2xy), unit='items')
        for identifier in item2xy:
            if metatag in item_info[identifier]:
                meta = item_info[identifier][metatag]
                if meta is not None:
                    data.extend([1] * len(meta))
                    indices.extend([item2meta.setdefault(o, len(item2meta)) for o in meta])
            indptr.append(len(indices))
            progress_bar.update(1)
        r,c = len(item2xy), len(item2meta)
        return sp.csr_matrix((data, indices, indptr), shape=(r,c), dtype=np.float16)
        
    data_meta, lbl_meta = get_metadata(item2data), get_metadata(item2lbl)
    data_meta.resize(len(item2data),len(item2meta))
    
    return item2meta, data_meta, lbl_meta
    

In [866]:
files = [f'{cache_dir}/{file}' for file in os.listdir(cache_dir) if file.endswith('.gz')]
item_info, item2row, item2col, matrix = construct_dataset(files)

File 1:   0%|          | 0/933 [00:00<?, ?items/s]

File 2:   0%|          | 0/32892 [00:00<?, ?items/s]

File 3:   0%|          | 0/186637 [00:00<?, ?items/s]

In [867]:
valid_matrix, valid_item2row, valid_item2col = clean_dataset(matrix, item2row, item2col, item_info)

In [844]:
with gzip.open(files[0], 'rt', encoding='utf-8') as f:
    items = [json.loads(d) for d in f]

In [845]:
items[0].keys()

dict_keys(['category', 'tech1', 'description', 'fit', 'title', 'also_buy', 'tech2', 'brand', 'feature', 'rank', 'also_view', 'details', 'main_cat', 'similar_item', 'date', 'price', 'asin', 'imageURL', 'imageURLHighRes'])

In [846]:
items[100]['category']

['Alexa Skills', 'Lifestyle', 'Self Improvement']

### `category`

In [887]:
item2category, data_category, lbl_category = construct_meta_dataset(item_info, valid_item2row, valid_item2col, 'category')

  0%|          | 0/2882 [00:00<?, ?items/s]

  0%|          | 0/2300 [00:00<?, ?items/s]

In [891]:
#| export
def save_category(output_dir, data_meta, lbl_meta, meta_info, meta_tag):
    sp.save_npz(f'{output_dir}/data_{meta_tag}.npz', data_meta)
    sp.save_npz(f'{output_dir}/lbl_{meta_tag}.npz', lbl_meta)

    meta_info = pd.DataFrame({'text':list(meta_info)})
    meta_info.to_csv(f'{output_dir}/{meta_tag}_info.csv', index=False)
    

### `also_view`

In [888]:
item2alsoview, data_alsoview, lbl_alsoview = construct_meta_dataset(item_info, valid_item2row, valid_item2col, 'also_view')

  0%|          | 0/2882 [00:00<?, ?items/s]

  0%|          | 0/2300 [00:00<?, ?items/s]

In [892]:
#| export
def save_meta_dataset(output_dir, data_meta, lbl_meta, meta_info, item_info, meta_tag):
    sp.save_npz(f'{output_dir}/data_{meta_tag}.npz', data_meta)
    sp.save_npz(f'{output_dir}/lbl_{meta_tag}.npz', lbl_meta)
    
    meta_info = pd.DataFrame({'identifier':list(meta_info), 'text':[item_info[o]['short_text'] for o in meta_info]})
    meta_info.to_csv(f'{output_dir}/{meta_tag}_info.csv', index=False)
    

### Main function

In [None]:
if __name__ == '__main__':
    from timeit import default_timer as timer
    start_time = timer()
    
    args = parse_args()

    data_info = pd.read_csv(f'{args.output_dir}/data_info.csv')
    item2row = {identifier:i for identifier,i in enumerate(data_info['identifier'])}

    lbl_info = pd.read_csv(f'{args.output_dir}/lbl_info.csv')
    item2col = {identifier:i for identifier,i in enumerate(lbl_info['identifier'])}

    with open(f'{args.output_dir}/item_info.json', 'r') as f:
        item_info = json.load(f)

    item2category, data_category, lbl_category = construct_meta_dataset(item_info, item2row, item2col, 'category')
    save_category(args.output_dir, data_category, lbl_category, item2category, 'category')

    end_time = timer()
    print(f'Time elapsed: {end_time-start_time:.2f} seconds.')
    

## Remapping

In [3]:
data_dir = '/home/scai/phd/aiz218323/Projects/XC/data/(mapped)LF-AmazonTitles-1.3M/raw_data/'

In [4]:
def read_raw_file(fname):
    raw_data = {}
    with open(fname, 'r', encoding='utf-8') as f:
        for line in f:
            identifier, short_text = line[:-1].split('->', maxsplit=1)
            raw_data[identifier] = short_text
    return raw_data
    

In [7]:
train_raw = read_raw_file(f'{data_dir}/train.raw.txt')
test_raw = read_raw_file(f'{data_dir}/test.raw.txt')
label_raw = read_raw_file(f'{data_dir}/label.raw.txt')

In [9]:
files = [f'{cache_dir}/{file}' for file in os.listdir(cache_dir) if file.endswith('.gz')]

In [11]:
with gzip.open(files[0], 'rt', encoding='utf-8') as f:
    items = [json.loads(d) for d in f]

## User reviews

In [19]:
data_dir = '/home/scai/phd/aiz218323/scratch/Projects/sugar/data/amazon_review_2018/cache/reviews'

In [38]:
filenames = [f'{data_dir}/{n}' for n in os.listdir(data_dir)]
fsize = [os.path.getsize(o) for o in filenames]

filenames = [filenames[i] for i in np.argsort(fsize)]

In [45]:
with gzip.open(filenames[5], 'rt', encoding='utf-8') as file:
    users = [json.loads(d) for d in file]

In [50]:
users[30]

{'overall': 1.0,
 'verified': False,
 'reviewTime': '01 27, 2016',
 'reviewerID': 'A3KSCKP9WYB99V',
 'asin': 'B01080IRRO',
 'style': {'Color:': ' Black'},
 'reviewerName': 'K. Doane',
 'reviewText': "Won't sense the finger gesture, and when it does its super sensitive and hard to control. Really bummed they made this look great it really doesn't work with as many systems as you would think.",
 'summary': 'Cannot get this to work',
 'unixReviewTime': 1453852800}

In [52]:
review_file = '/home/scai/phd/aiz218323/scratch/Projects/sugar/data/amazon_review_2018/reviews_5-core/reviews.json'

In [53]:
with open(review_file, 'r') as file:
    reviews = json.load(file)

In [55]:
product_dir = '/home/scai/phd/aiz218323/scratch/Projects/sugar/data/amazon_review_2018/products'
data_lbl = sp.load_npz(f'{product_dir}/data_lbl.npz')
data_info = pd.read_csv(f'{product_dir}/data_info.csv')
lbl_info = pd.read_csv(f'{product_dir}/lbl_info.csv')

In [61]:
user2items = {}
for item_id, users in tqdm(reviews.items(), total=len(reviews)):
    for user_id in users:
        items = user2items.setdefault(user_id, set())
        items.add(item_id)
        

  0%|          | 0/1403999 [00:00<?, ?it/s]

In [64]:
print('Number of users: ', len(user2items))
print('Number of labels: ', data_lbl.shape[1])

Number of users:  4259758
Number of labels:  2646426


In [106]:
data_dir = '/home/scai/phd/aiz218323/scratch/Projects/sugar/data/amazon_review_2018/reviews_5-core/'

In [None]:
__User items

In [107]:
# User items
idx = np.random.randint(len(user2items), size=1000)
user_ids = list(user2items.keys())
user_ids = [user_ids[i] for i in idx]

In [108]:
item2title = dict(list(zip(data_info['identifier'], data_info['text'])))
item2title.update(dict(list(zip(lbl_info['identifier'], lbl_info['text']))))

In [124]:
user2titles = {}
for user_id in user_ids:
    user2titles[user_id] = [item2title[o] for o in list(user2items[user_id])[:10]]

with open(f'{data_dir}/user2items_1000.json', 'w') as file:
    file.write(json.dumps(user2titles, sort_keys=True, indent=4, separators=(',', ': ')))

__Item reviews__

In [94]:
# Item reviews
idx = np.random.randint(len(reviews), size=1000)
item_ids = list(reviews.keys())
item_ids = [item_ids[i] for i in idx]

In [97]:
item2reviews = {}
for item_id in tqdm(item_ids):
    user2reviews = reviews[item_id]
    idx = np.random.randint(len(user2reviews), size=10)
    user_ids = list(user2reviews.keys())
    user_ids = [user_ids[i] for i in idx]
    item2reviews[item2title[item_id]] = {o:user2reviews[o] for o in user_ids}
    

  0%|          | 0/1000 [00:00<?, ?it/s]

In [122]:
import json
with open(f'{data_dir}/item2reviews_1000.json', 'w') as file:
    file.write(json.dumps(item2review, sort_keys=True, indent=4, separators=(',', ': ')))

In [104]:
print('Number of reviews per item: ', np.mean([len(v) for v in reviews.values()]))
print('Number of items per user: ', np.mean([len(v) for v in user2items.values()]))

Number of reviews per item:  40.72859026252868
Number of items per user:  13.423978545260082


In [105]:
data_dir

'/home/scai/phd/aiz218323/scratch/Projects/sugar/data/amazon_review_2018/cache/reviews_5-core/'

__(user, item) pairs__

In [131]:
pair_file = f'{data_dir}/user-item_pairs.txt'

In [155]:
user2items, item2col = {}, {}
with open(pair_file, 'r') as file:
    for o in file:
        user_id, item_id = o[:-1].split()
        items = user2items.setdefault(user_id, [])
        items.append(item2col.setdefault(item_id, len(item2col)))
        