# Prepare for wide_deep

Prepare Electronics, Home and Games datasets for wide & deep model.

#### Input
- all_reviews_20.csv, all_meta_20.csv

#### Output (includes positive & negative samples for NDCG, Hit Rate calculation)
- users_e_20.pkl, items_e_20.pkl
- wide_deep_amzn_e_20.csv, wide_deep_amzn_e_20_train.csv, wide_deep_amzn_e_20_test.csv
- amzn_e_tst_w_neg[0-5].txt

In [1]:
import sys
sys.path.append('../../recommenders') # if needed, adjust the path to Microsoft Recommenders clone

In [2]:
DATA_DIR = '/home/shiv/Documents/DataScience/Capstone/Data/'
DATA_DIR_WIDE_DEEP = '/home/shiv/Documents/DataScience/Capstone/Data/wide_deep/'

In [3]:
import pandas as pd
from tqdm import tqdm
import pickle
from recommenders.datasets.python_splitters import python_chrono_split

In [1]:
MIN_REVIEWS = 20
NUM_NEG_SAMPLES = 50

# read the ratings dataframe
ratings_df = pd.read_csv(DATA_DIR + f'all_reviews_{MIN_REVIEWS}.csv', header=None)
ratings_df.columns = ['reviewerID', 'asin', 'rating', 'unixTimeStamp']     

In [2]:
# read the items dataframe
items_df = pd.read_csv(DATA_DIR + f'all_meta_{MIN_REVIEWS}.csv', header=None)
# items_df.head()

In [3]:
items_df.columns=['asin','price','title','main_cat','category']
items_df['category'].fillna('', inplace=True)
items_df['price'].fillna('$$$', inplace=True)

In [4]:
# Sample to show how to drop a main category

# mag_subs = items_df[items_df['main_cat']=='Magazine Subscriptions']['asin'].values
# ratings_df=ratings_df[~ratings_df['asin'].isin(mag_subs)]
# items_df=items_df[~items_df['asin'].isin(mag_subs)]
# ratings_df.reset_index(inplace=True,drop=True)
# items_df.reset_index(inplace=True,drop=True)

In [5]:
len(sorted(items_df['main_cat'].unique()))

45

<a id="electronics"></a>
# Prepare Electronics, Home, Games datasets

Steps:

1. Filter the items dataframe to remove all items whose main_cat is not in the list of main_categories for each umbrella category (e.g. Electronics)
2. Filter the rating dataframe to remove all reviews that do not belong to the list of item IDs left in the items dataframe after step 1.
3. Convert hexadecimal reviewerID in the ratings dataframe to userID (0 based)
4. Save the reviewerID dict in a pkl file
5. Left merge ratings and items filtered dataframe on asin (hexadecimal)
6. Convert hexadecimal asin to itemID (0 based)
7. Save the itemID dict (for UI) in a pkl file
8. Add genre; currently, consider the main categories or if an item in category matches the sub category list
9. Save the prepared dataset for use by modeling

In [21]:
def prepare_dataset(lst_main_cat, lst_sub_cat, items_df, ratings_df, dataset):
    data_dir = DATA_DIR_WIDE_DEEP + dataset + '/'
    
    # Step 1: filter the items dataframe
    items_filtered_df = items_df[items_df['main_cat'].isin(lst_main_cat)].copy()
    items_filtered_ids = items_filtered_df['asin'].values

    # Step 2: filter the ratings dataframe
    ratings_filtered_df = ratings_df[ratings_df['asin'].isin(items_filtered_ids)].copy()
    
    print(dataset)
    print("Num ratings:", ratings_filtered_df.shape[0])
    print("users", ratings_filtered_df['reviewerID'].nunique(), "items", ratings_filtered_df['asin'].nunique())
    
    # Step 3: convert reviewerID to userID (0 based)
    reviewers_dict = {}
    reviewer_id = 0
    for _, row in ratings_filtered_df.iterrows():
        if row['reviewerID'] not in reviewers_dict:
            reviewers_dict[row['reviewerID']] = reviewer_id
            reviewer_id += 1
            
    ratings_filtered_df['userID'] = ratings_filtered_df['reviewerID'].apply(lambda x: reviewers_dict[x])

    # Step 4: save the reviewerID dict (perhaps for UI)
    users_pkl_path = data_dir + f'users_{dataset[0].lower()}_{MIN_REVIEWS}.pkl'

    with open(users_pkl_path, 'wb') as handle:
        pickle.dump(reviewers_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

    ratings_filtered_df.drop(columns=['reviewerID'], inplace=True) # we will henceforth use userID

    # Step 5: left merge the ratings with items on asin
    data = ratings_filtered_df.merge(items_filtered_df, on=['asin'], how='left')
    print("merged dataframe", data.shape)
    data['category'] = data['category'].astype('string')
    
    # Step 6: convert asin to itemID (0 based)
    items_dict = {}
    item_id = 0

    for _, row in data.iterrows():
        if row['asin'] not in items_dict:
            items_dict[row['asin']] = item_id
            item_id += 1
    data['itemID'] = data['asin'].apply(lambda x: items_dict[x])
    
    # Step 7: save the itemID dict (for UI)
    items_pkl_path = data_dir + f'items_{dataset[0].lower()}_{MIN_REVIEWS}.pkl'

    with open(items_pkl_path, 'wb') as handle:
        pickle.dump(items_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)        
    data.drop(columns=['asin'], inplace=True) # we will henceforth use itemID
    
    # Step 8: add genre; 
    # currently, consider the main categories or if an item in category matches the sub category list
    data['genre'] = ''

    def update_category(row):
        categories = row['category']
        new_cat = []
        new_cat.append(row['main_cat'])
        # print(categories)
        for cat in categories.split('|'):
            if cat.strip() != '' and ((cat in lst_main_cat) or (cat in lst_sub_cat)):
                new_cat.append(cat)
        row['genre'] = '|'.join(new_cat)
        return row
    data = data.apply(update_category, axis=1)
    
    # Step 9: save the prepared dataset to be used by wide_deep model
    # data.drop(columns=['category'], inplace=True)
    data.reset_index(inplace=True, drop=True)
    data = data[['userID','itemID', 'rating','genre','unixTimeStamp','title','price','main_cat','category']]
    data.to_csv(data_dir + f'wide_deep_amzn_{dataset[0].lower()}_{MIN_REVIEWS}.csv', 
                 header=False, index=False)
    
    del items_filtered_df   # Save on RAM memory!
    del ratings_filtered_df
    del data

# Code to discover relevant sub categories to build the lst_sub_cat

In [None]:
from collections import defaultdict

ratings_g_df = pd.read_csv(DATA_DIR + 'wide_deep_amzn_g_20.csv', header=None, low_memory=False)
ratings_g_df.columns=['userID','itemID', 'rating','genre','unixTimeStamp','title','price','main_cat','category']
ratings_g_df['category'].fillna('', inplace=True)
ratings_g_df['price'].fillna('$$$', inplace=True)
    
lst = list(ratings_g_df['category'].unique())
word_count = defaultdict(int)
for l in lst:
    for w in l.split('|'):
        word_count[w] += 1
dict(sorted(word_count.items(), key=lambda item: item[1], reverse=True))

### Electronics

In [22]:
# main_cat = 'Electronics'
lst_main_cat = ['All Electronics', 'Amazon Devices', 'Apple Products', 
                 'Camera & Photo', 'Car Electronics', 'Cell Phones & Accessories', 'Computers',
                 'Electronics', 'GPS & Navigation', 'Home Audio & Theater', 'Industrial & Scientific',
                 'Portable Audio & Accessories']

lst_sub_cat = ['Accessories','Computers & Accessories','Office Products','Video Games',
               'Accessories & Supplies','Tools & Home Improvement','Computer Accessories & Peripherals',
               'Audio & Video Accessories', 'Automotive', 'Office & School Supplies',
               'Car & Vehicle Electronics', 'Industrial & Scientific','Sports & Outdoors','Office Electronics',
               'Home & Kitchen','Musical Instruments','Portable Audio & Video','Electrical',
               'Clothing, Shoes & Jewelry','Toys & Games','Laptop Accessories','Home Audio',
               'Controllers','Computer Components','Sports & Fitness']
prepare_dataset(lst_main_cat, lst_sub_cat, items_df, ratings_df, "Electronics")

Electronics
Num ratings: 5613183
users 830668 items 63725
merged dataframe (5613183, 8)


### Train, test split using python_chrono_split

**Run it once! It takes a very long time!!**
If you have already saved the two csv files, go to the next cell!

In [None]:
ratings_df = pd.read_csv(DATA_DIR_WIDE_DEEP + 'wide_deep_amzn_e_20.csv', header=None, low_memory=False)

train, test = python_chrono_split(ratings_df, ratio=0.9, col_timestamp='unixTimeStamp')

print("{} train samples and {} test samples".format(len(train), len(test)))

train.sort_values('unixTimeStamp', inplace=True)
train.reset_index(inplace=True, drop=True)
train.to_csv(DATA_DIR_WIDE_DEEP + 'wide_deep_amzn_e_20_train.csv', header=None, index=False)
test.to_csv(DATA_DIR_WIDE_DEEP + 'wide_deep_amzn_e_20_test.csv', header=None, index=False)

In [None]:
train = pd.read_csv(DATA_DIR_WIDE_DEEP + 'wide_deep_amzn_e_20_train.csv', header=None,
                    converters={3: lambda x: x.strip("[]").split(", ")})
train.columns = ['userID','itemID','rating','genre','unixTimeStamp']

test = pd.read_csv(DATA_DIR_WIDE_DEEP + 'wide_deep_amzn_e_20_test.csv', header=None,
                   converters={3: lambda x: x.strip("[]").split(", ")})
test.columns = ['userID','itemID','rating','genre','unixTimeStamp']

print("{} train samples and {} test samples".format(len(train), len(test)))

### Prepare to get ndcg@10, hit@10 for wide_n_deep 

In [None]:
train.drop(columns=['unixTimeStamp'], inplace=True)
test.drop(columns=['unixTimeStamp'], inplace=True)

users_grp = train[[USER_COL, ITEM_COL]].groupby([USER_COL]).agg(list)

items_df = train.drop_duplicates([ITEM_COL]).copy()
items_df.drop(columns=[USER_COL, RATING_COL], inplace=True)
items_df.set_index(ITEM_COL, inplace=True)

In [None]:
from tqdm import tqdm
from multiprocessing import Process, Queue
import random

items_set = set(train[ITEM_COL].unique())

### Process for generating the test data positive and negative samples

Note that the entire process takes time even when using multiprocessing module.

- Each worker takes a part of the test dataframe, so if there are 5 workers, each get 1/5th of the dataframe; worker # 5 gets to work a bit more to handle the remaining rows in the end
- Once a review is selected, that becomes the positive sample. NUM_NEG_SAMPLES are then found for this user.
- Each of the negative sample is unique and not seen by the user that wrote the review. Both positive and negative samples are written using the required csv format (userID, itemID, rating, genre)

In [None]:
N_WORKERS = 5

def sample_function(test, users_grp, items_df, i, num_workers, seed, data_dir):
    nrows = test.shape[0]
    each = nrows // num_workers
    start = i*each
    end = nrows if i == num_workers - 1 else (i+1)*each
    sym = ['!','@','#','$','%']
    
    # print(i, start, end, test.iloc[start:end].shape)
    random.seed(seed)
    tst_w_neg_samples_path = data_dir + f'amzn_e_tst_w_neg{i}.txt'
    
    user_col = []
    item_col = []
    rating_col = []
    feat_col = []
    for j, row in test.iloc[start:end].iterrows():
        u = row[USER_COL]
        positive_item = row[ITEM_COL]
        tmp_df = users_grp.loc[u]
        assert(tmp_df.shape[0] != 0)
        
        items_seen_set = set(tmp_df[ITEM_COL])
        items_not_seen_set = list(items_set - items_seen_set)
        user_col.append(u)
        item_col.append(positive_item)
        rating_col.append(row[RATING_COL])
        feat_col.append([int(f) for f in row[ITEM_FEAT_COL]])
        
        cnt = 0
        neg_items = set()
        while cnt < NUM_NEG_SAMPLES:
            neg_item = random.choice(list(items_not_seen_set))
            if neg_item == positive_item or neg_item in neg_items:
                continue
                
            cnt += 1
            tmp_df = items_df.loc[neg_item]
            assert(tmp_df.shape[0] != 0)
            
            user_col.append(u)
            item_col.append(neg_item)
            rating_col.append(5.0) # unused
            feat_col.append([int(f) for f in tmp_df[ITEM_FEAT_COL]])   
            
        if j % 10_000 == 0:
            print(sym[i], end='')

    X_test = pd.DataFrame({USER_COL: user_col, ITEM_COL: item_col, 
                           RATING_COL: rating_col, ITEM_FEAT_COL: feat_col})
    X_test.to_csv(tst_w_neg_samples_path, header=False, index=False)    

processors = []
for i in range(N_WORKERS):
    processors.append(
        Process(
            target = sample_function,
            args = (test, users_grp, items_df, i, N_WORKERS, RANDOM_SEED, DATA_DIR_WIDE_DEEP)
        ))
    # processors[-1].daemon = True
    processors[-1].start()

for i in range(N_WORKERS):
    processors[i].join()

### Home

In [24]:
# main_cat = 'Home'
lst_main_cat = ['Amazon Home', 'Appliances', 'Home & Kitchen', 
                'Patio, Lawn & Garden', 'Tools & Home Improvement']

lst_sub_cat = ['Kitchen & Dining', 'Industrial & Scientific', 'Power & Hand Tools',
               'Automotive', 'Arts, Crafts & Sewing', 'Office Products',
               'Electronics', 'Sports & Outdoors', 'Home Dcor', 'Accessories', 'Hand Tools',
               'Office & School Supplies', 'Gardening & Lawn Care',
               'Hardware', 'Storage & Organization', 'Lighting & Ceiling Fans',
               'Kitchen Utensils & Gadgets', 'Electrical', 'Furniture', 'Pet Supplies',
               'Building Supplies', 'Bedding', 'Sports & Fitness', 'Safety & Security',
               'Outdoor Recreation', 'Power Tool Parts & Accessories', 'Kitchen & Bath Fixtures',
               'Parts & Accessories', 'Small Appliances', 'Replacement Parts',
               'Crafting', 'Sewing', 'Tools & Equipment', 'Outdoor Dcor', 'Patio Furniture & Accessories',
               'Grills & Outdoor Cooking', 'Power Tools', 'Rough Plumbing', 'Bath',
               'Bakeware', 'Accessories & Supplies', 'Heating, Cooling & Air Quality',
               'Outdoor Power Tools', 'Outdoor Lighting', 'Paint, Wall Treatments & Supplies',
               'Home Dcor Accents', 'Pools, Hot Tubs & Supplies', 'Bathroom Fixtures',
               'Test, Measure & Inspect', 'Bathroom Accessories', 'Personal Protective Equipment',
               'Fasteners', 'Vacuums & Floor Care', 'Clothing & Closet Storage', 'Exterior Accessories',
               'Replacement Parts & Accessories', 'Desk Accessories & Workspace Organizers',
               'Outdoor Cooking Tools & Accessories', 'HVAC']
prepare_dataset(lst_main_cat, lst_sub_cat, items_df, ratings_df, "Home")

Home
Num ratings: 7575829
users 882963 items 92562
merged dataframe (7575829, 8)


### Games

In [29]:
# main_cat = 'Games'
lst_main_cat = ['Sports & Outdoors', 'Toys & Games', 'Video Games']
lst_sub_cat = ['Sports & Fitness', 'Accessories', 'Clothing, Shoes & Jewelry', 'Home & Kitchen',
               'Outdoor Recreation', 'Clothing', 'Electronics', 'Games', 
               'Sports & Outdoor Play', 'Cycling', 'Exercise & Fitness', 'Camping & Hiking',
               'Leisure Sports & Game Room', 'Hunting & Fishing', 'Retro Gaming & Microconsoles',
               'Costumes & Accessories', 'Dress Up & Pretend Play', 'Shoes', 'Golf', 'Hobbies',
               'Water Sports', 'Controllers', 'Xbox One', 'PlayStation 3', 'Motorcycle & Powersports',
               'Arts & Crafts', 'Replacement Parts', 'Xbox 360', 'Other Sports', 'Crafting',
               'Wii', 'Learning & Education', 'Sports', 'Active', 'Consoles',
               'Painting, Drawing & Art Supplies', 'Audio & Video Accessories', 'Nintendo 3DS & 2DS',
               'Athletic', 'Skates, Skateboards & Scooters', 'Building Toys', 'Building Sets']
prepare_dataset(lst_main_cat, lst_sub_cat, items_df, ratings_df, "Games")

Games
Num ratings: 3735507
users 748773 items 56219
merged dataframe (3735507, 8)


# Additional datasets: Books, Health & Personal Care, Fashion

In [None]:
# main_cat = 'Books'
lst_main_cat = ['Books', 'Kindle', 'Audible audiobooks']

In [None]:
# main_cat = 'Health & Personal Care'
lst_main_cat = ['All Beauty', 'Health & Personal Care', 'Luxury Beauty']

# main_cat = 'Fashion'
lst_main_cat = ['Amazon Fashion', 'Clothing, Shoes & Jewelry']

<a id="all_main_cat"></a>
## Use all the main categories; if only specific main categories are needed, jump to [Electronics](#electronics)

In [6]:
# Step 3: convert reviewerID to userID (0 based)
print("Converting reviewerID to userID")
reviewers_dict = {}
reviewer_id = 0
with tqdm(total=ratings_df.shape[0]) as pbar:
    for _, row in ratings_df.iterrows():
        if row['reviewerID'] not in reviewers_dict:
            reviewers_dict[row['reviewerID']] = reviewer_id
            reviewer_id += 1
        pbar.update(1)

ratings_df['userID'] = ratings_df['reviewerID'].apply(lambda x: reviewers_dict[x])

# Step 4: save the reviewerID dict (perhaps for UI)
users_pkl_path = DATA_DIR + f'users_all_{MIN_REVIEWS}.pkl'

with open(users_pkl_path, 'wb') as handle:
    pickle.dump(reviewers_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

del reviewers_dict
ratings_df.drop(columns=['reviewerID'], inplace=True) # we will henceforth use userID

# Step 5: left merge the ratings with items on asin
data = ratings_df.merge(items_df, on=['asin'], how='left')
print("merged dataframe", data.shape)
data['category'] = data['category'].astype('string')

del ratings_df

# Step 6: convert asin to itemID (0 based)
print("Converting asin to itemID")
items_dict = {}
item_id = 0

with tqdm(total=items_df.shape[0]) as pbar:
    for _, row in items_df.iterrows():
        if row['asin'] not in items_dict:
            items_dict[row['asin']] = item_id
            item_id += 1
        pbar.update(1)
data['itemID'] = data['asin'].apply(lambda x: items_dict[x])

# Step 7: save the itemID dict (for UI)
items_pkl_path = DATA_DIR + f'items_all_{MIN_REVIEWS}.pkl'

with open(items_pkl_path, 'wb') as handle:
    pickle.dump(items_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

del items_dict
data.drop(columns=['asin'], inplace=True) # we will henceforth use itemID

tqdm.pandas()
lst_main_cat = items_df.main_cat.unique()

# Step 8: add genre; 
# currently, consider the main categories or if an item in category matches the main category list
data['genre'] = ''

def update_category(row):
    categories = row['category']
    new_cat = []
    new_cat.append(row['main_cat'])
    # print(categories)
    for cat in categories.split('|'):
        if cat.strip() != '' and (cat in lst_main_cat):
            new_cat.append(cat)
    row['genre'] = '|'.join(new_cat)
    return row
data = data.progress_apply(update_category, axis=1)

# Step 9: save the prepared dataset to be used by wide_deep model
data.drop(columns=['category'], inplace=True)
data.sort_values('unixTimeStamp', inplace=True)
data.reset_index(inplace=True, drop=True)
data = data[['userID','itemID', 'rating','genre','unixTimeStamp','title','price','main_cat','category']]
data.to_csv(DATA_DIR + f'wide_deep_amzn_all_{MIN_REVIEWS}.csv', 
             header=False, index=False)

Converting reviewerID to userID


100%|███████████████████████████████████████████████████████████████████████████████████| 38007219/38007219 [07:52<00:00, 80368.65it/s]


merged dataframe (38007219, 8)
Converting asin to itemID


100%|███████████████████████████████████████████████████████████████████████████████████████| 484062/484062 [00:07<00:00, 62080.04it/s]
