# Prepare for SLiRec, SASRec

#### Inputs
- all_reviews_20.csv, all_meta_20.csv

#### Outputs
- train_data, valid_data, test_data
- valid_data and test_data include negative samples (4 per positive line in valid_data, 49 per positive line in test_data)
- user_vocab.pkl, item_vocab.pkl, category_vocab.pkl
- reviews.output (also used in the Jupyter notebook for the SASRec model, which does NOT have any other preprocessing requirement)

In [1]:
# Root folder holding the input CSV files; the SLi-Rec files are written
# to per-dataset sub-folders of DATA_DIR_SLIREC.
DATA_DIR = '/home/shiv/Documents/DataScience/Capstone/Data/'
DATA_DIR_SLIREC = DATA_DIR + 'slirec/'

In [2]:
import pandas as pd
import os
import _pickle as cPickle
import random

In [1]:
# Notebook-wide constants.
MIN_REVIEWS = 20     # suffix of the input CSV names (all_reviews_20.csv, all_meta_20.csv)
TEST_NUM_NGS = 49    # negatives sampled for every positive line of test_data
VALID_NUM_NGS = 4    # negatives sampled for every positive line of valid_data

# The ratings CSV has no header row, so the column names are attached after loading.
ratings_df = pd.read_csv(f'{DATA_DIR}all_reviews_{MIN_REVIEWS}.csv', header=None)
ratings_df.columns = [
    'reviewerID',
    'asin',
    'rating',
    'unixTimeStamp',
]

In [2]:
# Item metadata; the CSV has no header row, so the column names are attached after loading.
items_df = pd.read_csv(DATA_DIR + f'all_meta_{MIN_REVIEWS}.csv', header=None)
items_df.columns = ['asin', 'price', 'title', 'main_cat', 'category']
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: that form is chained assignment, which newer pandas
# versions warn about and which leaves items_df unchanged under Copy-on-Write.
items_df['category'] = items_df['category'].fillna('')
items_df['price'] = items_df['price'].fillna('$$$')

In [3]:
# Number of distinct main categories; counting does not require sorting them first.
len(items_df['main_cat'].unique())

45

In [5]:
# Helper functions

def write_reviews(data_dir, ratings_filtered_df):
    """Write the ratings as tab-separated ``user, item, timestamp`` lines.

    Args:
        data_dir: Existing directory in which ``reviews.output`` is created.
        ratings_filtered_df: DataFrame with ``reviewerID``, ``asin`` and
            ``unixTimeStamp`` columns; one line is written per row, in row order.

    Returns:
        Path of the file that was written.
    """
    reviews_writefile = data_dir + '/reviews.output'
    # Walk the three needed columns in lockstep rather than using iterrows(),
    # which builds a full Series object for every row.
    rows = zip(
        ratings_filtered_df["reviewerID"],
        ratings_filtered_df["asin"],
        ratings_filtered_df["unixTimeStamp"],
    )
    # The context manager closes the file even if a row fails to serialise.
    with open(reviews_writefile, 'w') as reviews_w:
        for reviewer_id, asin, unix_ts in rows:
            reviews_w.write(reviewer_id + "\t" + asin + "\t" + str(unix_ts) + "\n")
    return reviews_writefile

def write_meta(data_dir, items_df):
    """Write the item metadata as tab-separated ``item, main category`` lines.

    Args:
        data_dir: Existing directory in which ``meta.output`` is created.
        items_df: DataFrame with ``asin`` and ``main_cat`` columns; one line
            is written per row, in row order.

    Returns:
        Path of the file that was written.
    """
    meta_writefile = data_dir + "/meta.output"
    # The context manager closes the file even if a row fails to serialise;
    # zipping the two columns avoids the per-row Series that iterrows() builds.
    with open(meta_writefile, "w") as meta_w:
        for asin, main_cat in zip(items_df["asin"], items_df["main_cat"]):
            meta_w.write(asin + "\t" + main_cat + "\n")
    return meta_writefile

def write_instance_output(reviews_file, meta_file):
    """Join reviews with item categories into ``instance.output``.

    Every review becomes one positive line ``1 <tab> user <tab> item <tab>
    timestamp <tab> category``. Lines are grouped per user (users in order of
    first appearance in ``reviews_file``) and, within a user, sorted by
    ascending timestamp; reviews with equal timestamps keep their file order
    because the sort is stable.

    Args:
        reviews_file: Tab-separated ``user, item, timestamp`` file.
        meta_file: Tab-separated ``item, category`` file; when an item occurs
            more than once, the first category wins.

    Returns:
        Path of ``instance.output``, created next to ``reviews_file``.

    Raises:
        AssertionError: If any reviewed item has no entry in ``meta_file``.
            The output file has already been written at that point, with
            ``default_cat`` as the category of those items.
    """
    print("start create instances...")
    dirs, _ = os.path.split(reviews_file)
    output_file = os.path.join(dirs, "instance.output")

    # user_id -> [(raw review line, timestamp), ...] in file order
    user_dict = {}
    with open(reviews_file, "r") as f_reviews:
        for line in f_reviews:
            line = line.strip()
            reviews_things = line.split("\t")  # user_id asin unix_ts
            user_dict.setdefault(reviews_things[0], []).append(
                (line, float(reviews_things[-1]))
            )

    # asin -> category
    meta_dict = {}
    with open(meta_file, "r") as f_meta:
        for line in f_meta:
            meta_things = line.strip().split("\t")  # asin category
            if meta_things[0] not in meta_dict:
                meta_dict[meta_things[0]] = meta_things[1]

    num_default_cat = 0
    with open(output_file, "w") as f_output:
        for user_behavior in user_dict.values():
            for line, _ in sorted(user_behavior, key=lambda x: x[1]):
                asin = line.split("\t")[1]  # user_id, asin, ts
                category = meta_dict.get(asin)
                if category is None:
                    num_default_cat += 1
                    category = "default_cat"
                f_output.write("1" + "\t" + line + "\t" + category + "\n")  # positive

    print("Num default categories:", num_default_cat)
    # Raised explicitly (instead of a bare assert) so the check still runs
    # when Python is started with -O.
    if num_default_cat != 0:
        raise AssertionError(
            f"{num_default_cat} review(s) reference an item missing from {meta_file}"
        )
    return output_file

def write_preprocessed_output(sampled_instance_file):
    """Tag every instance line with the split it belongs to.

    For each run of consecutive lines of the same user, the last line is
    tagged ``test``, the one before it ``valid`` and all earlier ones
    ``train``. The tag is prepended to the line as a new first column.

    The position inside a run is compared with the user's total line count in
    the whole file, so the lines of one user are expected to be contiguous.

    Args:
        sampled_instance_file: Tab-separated file whose second column is the
            user id (``label, user_id, item_id, timestamp, cate_id``).

    Returns:
        Path of ``preprocessed.output``, created next to the input file.
    """
    print("start data processing...")
    dirs, _ = os.path.split(sampled_instance_file)
    output_file = os.path.join(dirs, "preprocessed.output")

    # Both files are closed (and the output flushed) before returning, so the
    # next step can read the result safely.
    with open(sampled_instance_file, "r") as f_input, open(output_file, "w") as f_output:
        # First pass: number of lines per user.
        user_count = {}
        for line in f_input:
            user = line.strip().split("\t")[1]
            user_count[user] = user_count.get(user, 0) + 1

        # Second pass: assign the split from the position inside the user's run.
        f_input.seek(0)
        i = 0
        last_user = None
        for line in f_input:
            line = line.strip()
            user = line.split("\t")[1]
            if user != last_user:
                last_user = user
                i = 0
            if i < user_count[user] - 2:
                split = "train"
            elif i < user_count[user] - 1:
                split = "valid"
            else:
                split = "test"
            f_output.write(split + "\t" + line + "\n")
            i += 1
    return output_file

def write_train_valid_test_data(data_dir, output_file):
    """Split ``preprocessed.output`` into train/valid/test files with history.

    Each written line has the tab-separated columns ``label, user, item,
    category, timestamp, history items, history categories, history
    timestamps``; the three history columns are comma-separated lists of the
    user's earlier positive lines. The first line of every run of a user has
    no history and is therefore not written.

    Args:
        data_dir: Directory in which ``train_data``, ``valid_data`` and
            ``test_data`` are created.
        output_file: Path of ``preprocessed.output`` (columns ``split, label,
            user, item, timestamp, category``), lines grouped per user.

    Returns:
        Tuple ``(train_file, valid_file, test_file)`` of the written paths.

    Raises:
        ValueError: If a line carries a split tag other than ``train``,
            ``valid`` or ``test``.
    """
    train_file = os.path.join(data_dir, r'train_data')
    valid_file = os.path.join(data_dir, r'valid_data')
    test_file = os.path.join(data_dir, r'test_data')
    min_sequence = 1  # minimum history length required to write a line

    print("train, valid, test positive data generating...")
    # All four files are closed (and flushed) on exit, including on errors.
    with open(output_file, "r") as f_input, \
            open(train_file, "w") as f_train, \
            open(valid_file, "w") as f_valid, \
            open(test_file, "w") as f_test:
        writers = {"train": f_train, "valid": f_valid, "test": f_test}
        last_user_id = None
        item_id_list, cate_list, dt_list = [], [], []

        for line in f_input:
            line_split = line.strip().split("\t")
            tfile = line_split[0]        # train/valid/test
            label = int(line_split[1])   # label 1
            user_id = line_split[2]      # user
            item_id = line_split[3]      # asin
            date_time = line_split[4]    # ts
            category = line_split[5]     # category

            # Fail loudly on an unknown tag rather than silently reusing the
            # file chosen for the previous line.
            if tfile not in writers:
                raise ValueError(f"unknown split tag {tfile!r} in {output_file}")
            fo = writers[tfile]

            if user_id != last_user_id:
                # New user: start an empty history; nothing is written yet.
                item_id_list, cate_list, dt_list = [], [], []
            elif len(item_id_list) >= min_sequence:
                fo.write(
                    "\t".join(
                        [
                            line_split[1],
                            user_id,
                            item_id,
                            category,
                            date_time,
                            ",".join(item_id_list),
                            ",".join(cate_list),
                            ",".join(dt_list),
                        ]
                    )
                    + "\n"
                )

            last_user_id = user_id
            if label:
                # Only positive lines extend the history.
                item_id_list.append(item_id)
                cate_list.append(category)
                dt_list.append(date_time)
    return train_file, valid_file, test_file

In [6]:
def _index_by_frequency(counts, reserved_token=None):
    """Map keys to consecutive ids by descending count; id 0 goes to ``reserved_token`` if given."""
    vocab = {}
    if reserved_token is not None:
        vocab[reserved_token] = 0
    first_index = len(vocab)
    # sorted() is stable, so keys with equal counts keep their first-seen order.
    ranked = sorted(counts.items(), key=lambda x: x[1], reverse=True)
    for index, (key, _) in enumerate(ranked, first_index):
        vocab[key] = index
    return vocab


def gen_vocab_pkl_files(data_dir, train_file):
    """Build the user, item and category vocabularies and pickle them.

    Only ``train_file`` is read. Users are counted once per line; items and
    categories are counted for the target of each line and for every entry of
    its history lists. Ids are assigned by descending count. Id 0 of the item
    and category vocabularies is reserved for ``default_mid`` and
    ``default_cat``; user ids start at 0.

    Args:
        data_dir: Directory in which ``user_vocab.pkl``, ``item_vocab.pkl``
            and ``category_vocab.pkl`` are written.
        train_file: Tab-separated training file with the columns ``label, uid,
            asin, category, ts, asin_list, cat_list, ts_list``.
    """
    user_vocab = os.path.join(data_dir, r'user_vocab.pkl')
    item_vocab = os.path.join(data_dir, r'item_vocab.pkl')
    cate_vocab = os.path.join(data_dir, r'category_vocab.pkl')

    user_dict = {}
    item_dict = {}
    cat_dict = {}

    print("vocab generating...")
    with open(train_file, "r") as f_train:  # NOTE: only train_file
        for line in f_train:
            arr = line.strip("\n").split("\t")
            uid, iid, cat = arr[1], arr[2], arr[3]
            iid_list, cat_list = arr[5], arr[6]

            user_dict[uid] = user_dict.get(uid, 0) + 1
            item_dict[iid] = item_dict.get(iid, 0) + 1
            cat_dict[cat] = cat_dict.get(cat, 0) + 1
            if len(iid_list) == 0:
                # A line without history contributes only its target item/category.
                print("No history", uid)
                continue
            for m in iid_list.split(","):
                item_dict[m] = item_dict.get(m, 0) + 1
            for c in cat_list.split(","):
                cat_dict[c] = cat_dict.get(c, 0) + 1

    uid_voc = _index_by_frequency(user_dict)
    iid_voc = _index_by_frequency(item_dict, "default_mid")
    cat_voc = _index_by_frequency(cat_dict, "default_cat")

    # Close every pickle file explicitly so the data is flushed to disk
    # before the function returns.
    for path, vocab in (
        (user_vocab, uid_voc),
        (item_vocab, iid_voc),
        (cate_vocab, cat_voc),
    ):
        with open(path, "wb") as f_vocab:
            cPickle.dump(vocab, f_vocab)

In [7]:
def _append_negative_samples(data_file, num_negatives, candidate_items, item2cate):
    """Rewrite ``data_file`` so each line is followed by ``num_negatives`` sampled negative lines."""
    with open(data_file, "r") as f:
        lines = f.readlines()

    # Rejection sampling below never terminates when a line has fewer than
    # num_negatives distinct candidates other than its positive item. That is
    # only possible when the candidate pool is this small, so verify it here,
    # before the file is truncated for writing.
    distinct_items = set(candidate_items)
    if len(distinct_items) <= num_negatives:
        for line in lines:
            positive_item = line.strip().split("\t")[2]
            available = len(distinct_items) - (positive_item in distinct_items)
            if available < num_negatives:
                raise ValueError(
                    f"cannot draw {num_negatives} distinct negatives for {data_file}: "
                    f"only {available} candidate item(s) differ from {positive_item!r}"
                )

    with open(data_file, "w") as f_out:
        for line in lines:
            f_out.write(line)  # the positive line is kept unchanged
            words = line.strip().split("\t")
            positive_item = words[2]
            neg_items = set()
            while len(neg_items) < num_negatives:
                neg_item = random.choice(candidate_items)
                if neg_item == positive_item or neg_item in neg_items:
                    continue
                neg_items.add(neg_item)
                # A negative line is the positive line with label, item and
                # category replaced; timestamp and history are kept.
                words[0] = "0"
                words[2] = neg_item
                words[3] = item2cate[neg_item]
                f_out.write("\t".join(words) + "\n")


def gen_neg_samples(sampled_instance_file, item2cate, valid_file, test_file):
    """Add sampled negative lines to the validation and test files, in place.

    After every existing line, ``VALID_NUM_NGS`` (validation) or
    ``TEST_NUM_NGS`` (test) negative lines are written. Negatives are drawn
    with ``random.choice`` from the item column of ``sampled_instance_file``
    with duplicates kept, so an item is picked in proportion to how often it
    occurs there. Within one group the negatives are distinct and differ from
    the positive item.

    Args:
        sampled_instance_file: Tab-separated instance file with the columns
            ``label, user_id, item_id, timestamp, cate_id``.
        item2cate: Mapping from item id to category, used to fill the
            category column of the negative lines.
        valid_file: Validation file to rewrite.
        test_file: Test file to rewrite.

    Raises:
        ValueError: If there are too few distinct candidate items to draw the
            requested number of negatives for some line.
    """
    columns = ["label", "user_id", "item_id", "timestamp", "cate_id"]
    # Only the item column is needed. Reading it as str keeps ids that look
    # numeric from being parsed as integers.
    ns_df = pd.read_csv(
        sampled_instance_file,
        sep="\t",
        names=columns,
        usecols=["item_id"],
        dtype={"item_id": str},
    )
    items_with_popular = list(ns_df["item_id"])

    # valid negative sampling
    print("start valid negative sampling")
    _append_negative_samples(valid_file, VALID_NUM_NGS, items_with_popular, item2cate)

    # test negative sampling
    print("start test negative sampling")
    _append_negative_samples(test_file, TEST_NUM_NGS, items_with_popular, item2cate)

### Steps for preparing the outputs required by the SLi-Rec model

1. filter the items dataframe for the main categories under "Electronics"
2. filter the ratings dataframe to only include reviews for the main categories
3. write the reviews output file (reviews preprocessing)
4. write the meta data output file (meta preprocessing)
5. create the instance.output
6. create item2category dictionary
7. split into train (all records for a user except the last 2), validation (second last), test (last), write preprocessed.output
8. generate vocabulary files
9. negative sampling (validation, test only); train is done inline

In [8]:
def prepare_dataset(lst_main_cat, lst_sub_cat, items_df, ratings_df, dataset):
    """Generate all SLi-Rec input files for one dataset.

    The files are written to ``DATA_DIR_SLIREC + dataset + '/'``, which is
    created if it does not exist yet.

    Args:
        lst_main_cat: Values of ``main_cat`` whose items are kept.
        lst_sub_cat: Accepted for interface compatibility; not used here.
        items_df: Item metadata with at least ``asin`` and ``main_cat``.
        ratings_df: Ratings with ``reviewerID``, ``asin`` and ``unixTimeStamp``.
        dataset: Name of the output sub-directory (also printed as a header).
    """
    data_dir = DATA_DIR_SLIREC + dataset + '/'
    # The writers below open their files with mode 'w', which fails when the
    # directory is missing, so make sure it exists first.
    os.makedirs(data_dir, exist_ok=True)

    # Step 1: filter the items dataframe for the main categories under "Electronics"
    items_filtered_df = items_df[items_df['main_cat'].isin(lst_main_cat)].copy()
    items_filtered_ids = items_filtered_df['asin'].values

    # Step 2: filter the ratings dataframe
    ratings_filtered_df = ratings_df[ratings_df['asin'].isin(items_filtered_ids)].copy()

    print(dataset)
    print("Num ratings:", ratings_filtered_df.shape[0])
    print("users", ratings_filtered_df['reviewerID'].nunique(), "items", ratings_filtered_df['asin'].nunique())

    # Step 3: write the reviews output file (reviews preprocessing)
    reviews_writefile = write_reviews(data_dir, ratings_filtered_df)

    # Step 4: write the meta output file (meta preprocessing)
    # NOTE: the unfiltered items_df is written, so meta.output covers every item.
    meta_writefile = write_meta(data_dir, items_df)

    # Step 5: create the instance.output
    instance_output = write_instance_output(reviews_writefile, meta_writefile)

    # Step 6: create item2category dictionary
    # Ids and categories are read as str so that numeric-looking values are
    # not parsed as integers.
    instance_df = pd.read_csv(
        instance_output,
        sep="\t",
        names=["label", "user_id", "item_id", "timestamp", "cate_id"],
        dtype={"item_id": str, "cate_id": str},
    )
    item2cate = instance_df.set_index("item_id")["cate_id"].to_dict()  # item_id/asin: category
    print(instance_df.shape, instance_df['item_id'].nunique(), len(item2cate))
    del instance_df

    # everything in the instance.output is either train, valid or test
    sampled_instance_file = instance_output

    # Step 7: split into train (all records for a user except the last 2),
    # validation (second last), test (last)
    output_file = write_preprocessed_output(sampled_instance_file)
    train_file, valid_file, test_file = write_train_valid_test_data(data_dir, output_file)

    # Step 8: generate vocabulary files
    gen_vocab_pkl_files(data_dir, train_file)

    # Step 9: negative sampling (validation, test only); train is done inline
    gen_neg_samples(sampled_instance_file, item2cate, valid_file, test_file)

## Electronics

- Refer to the amzn_gen_input_wide_deep notebook on how the sub categories are picked.

In [9]:
# Values of main_cat that make up the "Electronics" dataset.
lst_main_cat = [
    'All Electronics',
    'Amazon Devices',
    'Apple Products',
    'Camera & Photo',
    'Car Electronics',
    'Cell Phones & Accessories',
    'Computers',
    'Electronics',
    'GPS & Navigation',
    'Home Audio & Theater',
    'Industrial & Scientific',
    'Portable Audio & Accessories',
]

# Sub-categories passed along as the second argument of prepare_dataset.
lst_sub_cat = [
    'Accessories',
    'Computers & Accessories',
    'Office Products',
    'Video Games',
    'Accessories & Supplies',
    'Tools & Home Improvement',
    'Computer Accessories & Peripherals',
    'Audio & Video Accessories',
    'Automotive',
    'Office & School Supplies',
    'Car & Vehicle Electronics',
    'Industrial & Scientific',
    'Sports & Outdoors',
    'Office Electronics',
    'Home & Kitchen',
    'Musical Instruments',
    'Portable Audio & Video',
    'Electrical',
    'Clothing, Shoes & Jewelry',
    'Toys & Games',
    'Laptop Accessories',
    'Home Audio',
    'Controllers',
    'Computer Components',
    'Sports & Fitness',
]

# Build every SLi-Rec file for the dataset in the 'Electronics' sub-folder.
prepare_dataset(
    lst_main_cat,
    lst_sub_cat,
    items_df,
    ratings_df,
    'Electronics',
)

Electronics
Num ratings: 5613183
users 830668 items 63725
start create instances...
Num default categories: 0
(5613183, 5) 63725 63725
start data processing...
train, valid, test positive data generating...
vocab generating...
start valid negative sampling
start test negative sampling
