### References - Recommender w/ NLP and Graphs

1. [Building a Strong Baseline Recommender in PyTorch, on a Laptop](https://eugeneyan.com/writing/recommender-systems-baseline-pytorch/)
2. [Beating the Baseline Recommender with Graph & NLP in Pytorch](https://eugeneyan.com/writing/recommender-systems-graph-and-nlp-pytorch/)

In [1]:
import argparse
import datetime

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import roc_auc_score
from torch import optim
from torch.utils.data import DataLoader

In [2]:
from parse_json import parse_json_to_df

In [3]:
shuffle = True
emb_dim = 128
epochs = 5
initial_lr = 0.01

In [4]:
!ls -lah data/

total 22383064
drwxr-xr-x   5 rsilvei  813930708   160B Jun 25 13:59 [1m[36m.[m[m
drwxr-xr-x  11 rsilvei  813930708   352B Jun 25 14:04 [1m[36m..[m[m
-rw-r--r--   1 rsilvei  813930708   8.8G Jun 10 19:01 reviews_Books_5.json
-rw-r--r--   1 rsilvei  813930708   1.4G Jun 10 18:37 reviews_Electronics_5.json
-rw-r--r--@  1 rsilvei  813930708   473M Jun 25 13:59 reviews_Electronics_5.json.gz


In [13]:
#electronics_path = "data/reviews_Electronics_5.json.gz"
#electronics_csv = "data/electronics.csv"
books_path = "data/reviews_Books_5.json.gz"
books_csv = "data/books.csv"

In [None]:
df = parse_json_to_df(books_path)

2021-06-25 14:25:21,431 - Rows processed: 10,000
2021-06-25 14:25:22,132 - Rows processed: 20,000
2021-06-25 14:25:23,031 - Rows processed: 30,000
2021-06-25 14:25:23,635 - Rows processed: 40,000
2021-06-25 14:25:24,428 - Rows processed: 50,000
2021-06-25 14:25:25,208 - Rows processed: 60,000
2021-06-25 14:25:26,375 - Rows processed: 70,000
2021-06-25 14:25:27,092 - Rows processed: 80,000
2021-06-25 14:25:28,437 - Rows processed: 90,000
2021-06-25 14:25:29,002 - Rows processed: 100,000
2021-06-25 14:25:29,541 - Rows processed: 110,000
2021-06-25 14:25:30,043 - Rows processed: 120,000
2021-06-25 14:25:30,669 - Rows processed: 130,000
2021-06-25 14:25:31,239 - Rows processed: 140,000
2021-06-25 14:25:31,718 - Rows processed: 150,000
2021-06-25 14:25:32,212 - Rows processed: 160,000
2021-06-25 14:25:32,746 - Rows processed: 170,000
2021-06-25 14:25:33,376 - Rows processed: 180,000
2021-06-25 14:25:33,884 - Rows processed: 190,000
2021-06-25 14:25:34,443 - Rows processed: 200,000
2021-06-2

2021-06-25 14:26:51,627 - Rows processed: 1,630,000
2021-06-25 14:26:52,097 - Rows processed: 1,640,000
2021-06-25 14:26:52,583 - Rows processed: 1,650,000
2021-06-25 14:26:53,049 - Rows processed: 1,660,000
2021-06-25 14:26:53,459 - Rows processed: 1,670,000
2021-06-25 14:26:53,885 - Rows processed: 1,680,000
2021-06-25 14:26:54,334 - Rows processed: 1,690,000
2021-06-25 14:26:54,750 - Rows processed: 1,700,000
2021-06-25 14:26:55,125 - Rows processed: 1,710,000
2021-06-25 14:26:55,563 - Rows processed: 1,720,000
2021-06-25 14:26:55,965 - Rows processed: 1,730,000
2021-06-25 14:26:56,557 - Rows processed: 1,740,000
2021-06-25 14:26:57,011 - Rows processed: 1,750,000
2021-06-25 14:26:57,466 - Rows processed: 1,760,000
2021-06-25 14:26:57,930 - Rows processed: 1,770,000
2021-06-25 14:26:58,407 - Rows processed: 1,780,000
2021-06-25 14:26:58,896 - Rows processed: 1,790,000
2021-06-25 14:26:59,307 - Rows processed: 1,800,000
2021-06-25 14:26:59,734 - Rows processed: 1,810,000
2021-06-25 1

2021-06-25 14:28:26,324 - Rows processed: 3,210,000
2021-06-25 14:28:26,895 - Rows processed: 3,220,000
2021-06-25 14:28:27,425 - Rows processed: 3,230,000
2021-06-25 14:28:28,057 - Rows processed: 3,240,000
2021-06-25 14:28:28,823 - Rows processed: 3,250,000
2021-06-25 14:28:29,355 - Rows processed: 3,260,000
2021-06-25 14:28:29,830 - Rows processed: 3,270,000
2021-06-25 14:28:30,371 - Rows processed: 3,280,000
2021-06-25 14:28:31,390 - Rows processed: 3,290,000
2021-06-25 14:28:32,138 - Rows processed: 3,300,000
2021-06-25 14:28:32,720 - Rows processed: 3,310,000
2021-06-25 14:28:33,479 - Rows processed: 3,320,000
2021-06-25 14:28:34,242 - Rows processed: 3,330,000
2021-06-25 14:28:35,225 - Rows processed: 3,340,000
2021-06-25 14:28:35,979 - Rows processed: 3,350,000
2021-06-25 14:28:36,629 - Rows processed: 3,360,000
2021-06-25 14:28:37,251 - Rows processed: 3,370,000
2021-06-25 14:28:37,770 - Rows processed: 3,380,000
2021-06-25 14:28:38,296 - Rows processed: 3,390,000
2021-06-25 1

2021-06-25 14:32:15,525 - Rows processed: 4,790,000
2021-06-25 14:32:16,013 - Rows processed: 4,800,000
2021-06-25 14:32:16,617 - Rows processed: 4,810,000
2021-06-25 14:32:17,176 - Rows processed: 4,820,000
2021-06-25 14:32:17,682 - Rows processed: 4,830,000
2021-06-25 14:32:18,189 - Rows processed: 4,840,000
2021-06-25 14:32:18,697 - Rows processed: 4,850,000
2021-06-25 14:32:19,196 - Rows processed: 4,860,000
2021-06-25 14:32:19,714 - Rows processed: 4,870,000
2021-06-25 14:32:20,229 - Rows processed: 4,880,000
2021-06-25 14:32:20,739 - Rows processed: 4,890,000
2021-06-25 14:32:21,405 - Rows processed: 4,900,000
2021-06-25 14:32:22,035 - Rows processed: 4,910,000
2021-06-25 14:32:22,594 - Rows processed: 4,920,000
2021-06-25 14:32:23,148 - Rows processed: 4,930,000
2021-06-25 14:32:23,637 - Rows processed: 4,940,000
2021-06-25 14:32:24,221 - Rows processed: 4,950,000
2021-06-25 14:32:24,753 - Rows processed: 4,960,000
2021-06-25 14:32:25,263 - Rows processed: 4,970,000
2021-06-25 1

In [7]:
df

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,ao94dhgc771sj,0528881469,amazdnu,,we got this gps for my husband who is an (otr)...,5.0,gotta have gps!,1370131200,"06 2, 2013"
1,amo214lnfcei4,0528881469,amazon customer,,"i'm a professional otr truck driver, and i bou...",1.0,very disappointed,1290643200,"11 25, 2010"
2,a3n7t0dy83y4ig,0528881469,c. a. freeman,,"well, what can i say. i've had this unit in m...",3.0,1st impression,1283990400,"09 9, 2010"
3,a1h8py3qhmqqa0,0528881469,"dave m. shaw ""mack dave""",,"not going to write a long review, even thought...",2.0,"great grafics, poor gps",1290556800,"11 24, 2010"
4,a24ev6rxelqz63,0528881469,wayne smith,,i've had mine for a year and here's what we go...,1.0,"major issues, only excuses for support",1317254400,"09 29, 2011"
...,...,...,...,...,...,...,...,...,...
1689183,a34bzm6s9l7qi4,b00lgq6hl8,"candy cane ""is it just me?""",,burned these in before listening to them for a...,5.0,boom -- pop -- pow. these deliver.,1405555200,"07 17, 2014"
1689184,a1g650tttheal5,b00lgq6hl8,"charles spanky ""zumina reviews""",,some people like dj style headphones or earbud...,5.0,"thin and light, without compromising on sound ...",1405382400,"07 15, 2014"
1689185,a25c2m3qf9g7oq,b00lgq6hl8,comdet,,i&#8217;m a big fan of the brainwavz s1 (actua...,5.0,same form factor and durability as the s1 with...,1405555200,"07 17, 2014"
1689186,a1e1levq9vqnk,b00lgq6hl8,j. chambers,,"i've used thebrainwavz s1 in ear headphones, a...",5.0,superb audio quality in a very comfortable set...,1405641600,"07 18, 2014"


In [9]:
df.to_csv(write_books, index=False)

In [11]:
!ls -lah data/

total 24809632
drwxr-xr-x   6 rsilvei  813930708   192B Jun 25 14:09 [1m[36m.[m[m
drwxr-xr-x  11 rsilvei  813930708   352B Jun 25 14:10 [1m[36m..[m[m
-rw-r--r--   1 rsilvei  813930708   1.2G Jun 25 14:10 books.csv
-rw-r--r--   1 rsilvei  813930708   8.8G Jun 10 19:01 reviews_Books_5.json
-rw-r--r--   1 rsilvei  813930708   1.4G Jun 10 18:37 reviews_Electronics_5.json
-rw-r--r--@  1 rsilvei  813930708   473M Jun 25 13:59 reviews_Electronics_5.json.gz


In [8]:
def get_category_lvl(category_list: list, lvl=0) -> str:
    try:
        return category_list[lvl]
    except IndexError:
        return 'NA_VALUE'


def get_categories(df: pd.DataFrame) -> pd.DataFrame:
    df['category_lvl_1'] = df['categories'].apply(get_category_lvl, args=(0,))
    df['category_lvl_2'] = df['categories'].apply(get_category_lvl, args=(1,))
    df['category_lvl_3'] = df['categories'].apply(get_category_lvl, args=(2,))
    df['category_lvl_4'] = df['categories'].apply(get_category_lvl, args=(3,))
    logger.info('Categories lvl 1 - 4 prepared')

    return df


def get_meta(df: pd.DataFrame) -> pd.DataFrame:
    # Update to reflect if relationship exist
    df['related'] = np.where(df['related'].isnull(), 0, 1)

    # Prep categories
    df['categories'] = df['categories'].apply(eval)
    df['categories'] = df['categories'].apply(lambda x: x[0])  # Get first category only
    df = get_categories(df)

    # Prep title and description
    # TODO: Add cleaning of title and description

    return df

In [12]:
META_COLS = ['asin', 'categories', 'title', 'description', 'price', 'brand', 'related']
df = pd.read_csv(write_books, error_bad_lines=False, warn_bad_lines=True,
                 dtype={'asin': 'str', 'title': 'str', 'brand': 'str'},
                 usecols=META_COLS)
logger.info('DF shape: {}'.format(df.shape))

meta_df = get_meta(df)

meta_df.to_csv(args.write_path, index=False)

ValueError: Usecols do not match columns, columns expected but not found: ['description', 'title', 'categories', 'price', 'related', 'brand']