# Prepare for LightGBM

Prepare the Amazon Electronics dataset for the LightGBM model.

#### Input
- wide_deep/Electronics/wide_deep_amzn_e_20.csv, xdeepfm/lst_genres.pkl

#### Output (includes positive & negative samples for NDCG, Hit Rate calculation)
- train_e.csv, valid_e.csv, test_e.csv
- amzn_e_tst_w_neg{i}.txt (one file per worker process, i = 0 … N_WORKERS-1)

In [1]:
import sys
# Add the local clone of Microsoft Recommenders to the module search path so
# the `recommenders` imports below can be resolved.
sys.path.append('../../recommenders') # if needed, adjust the path to Microsoft Recommenders clone

In [2]:
import os

# Root folder that holds the input files and receives the generated ones.
# Set the AMZN_DATA_DIR environment variable to point the notebook at another
# location; the fallback is the original author's local path.
# os.path.join(<dir>, '') guarantees a trailing separator, which matters
# because later cells build paths with plain `DATA_DIR + 'sub/dir/file'`.
DATA_DIR = os.path.join(
    os.environ.get('AMZN_DATA_DIR', '/home/shiv/Documents/DataScience/Capstone/Data/'),
    '',
)

In [3]:
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm

from recommenders.datasets.python_splitters import python_chrono_split
from recommenders.utils.constants import (
    DEFAULT_USER_COL as USER_COL,
    DEFAULT_ITEM_COL as ITEM_COL,
    DEFAULT_RATING_COL as RATING_COL,
    DEFAULT_GENRE_COL as ITEM_FEAT_COL,
    DEFAULT_PREDICTION_COL as PREDICT_COL,
    DEFAULT_K,
    DEFAULT_THRESHOLD
)

In [4]:
# Names for the nine columns of the raw review file (the CSV has no header row).
raw_columns = [
    USER_COL, ITEM_COL, RATING_COL, ITEM_FEAT_COL,
    'unixTimeStamp', 'title', 'price', 'main_cat', 'category',
]

all_data = pd.read_csv(
    DATA_DIR + 'wide_deep/Electronics/wide_deep_amzn_e_20.csv',
    header=None,
    low_memory=False,
)
all_data.columns = raw_columns

# Order the reviews by timestamp and renumber the rows from 0.
all_data = all_data.sort_values('unixTimeStamp').reset_index(drop=True)

### Split data into train, validation, and test

<b>This only needs to be done once</b>, and it takes a long time!

First, we cut the original data into three sets: train_data (the first 80%), valid_data (the middle 10%) and test_data (the last 10%). <br>
Because review data is a kind of time-series streaming data, which is very common in recommendation scenarios, we split the data in chronological order.

In [5]:
# Chronological split, ordered by 'unixTimeStamp', with ratios 0.8 / 0.1 / 0.1.
train, valid, test = python_chrono_split(
    all_data, [0.8, 0.1, 0.1], col_timestamp='unixTimeStamp'
)

# Keep only the label, the two id columns and the raw genre string; work on
# independent copies so later column additions do not touch `all_data`.
model_cols = [RATING_COL, USER_COL, ITEM_COL, ITEM_FEAT_COL]
train, valid, test = (part[model_cols].copy() for part in (train, valid, test))

In [7]:
# Load the list of genre names that was pickled earlier.
with open(DATA_DIR + 'xdeepfm/lst_genres.pkl', 'rb') as genre_file:
    lst_genres = pickle.load(genre_file)

# Feature columns: the two id columns first (numerical), then one entry per
# genre (categorical).
cols = [USER_COL, ITEM_COL]
cols.extend(lst_genres)

In [8]:
# We cannot use column names with spaces in lightGBM, so every genre column is
# renamed to a short alias C1, C2, ... in the order the genres appear in `cols`.
cate_cols = [f'C{n}' for n in range(1, len(cols) - 1)]

# alias -> original genre name
rev_cols_dict = dict(zip(cate_cols, cols[2:]))

# original name -> name used in the LightGBM frames (id columns map to themselves)
cols_dict = {USER_COL: USER_COL, ITEM_COL: ITEM_COL}
cols_dict.update(zip(cols[2:], cate_cols))

### Expand genre into categorical columns

In [10]:
# Add one empty (NaN) column per genre alias to each split; the cells are set
# to 1 later for the genres an item actually has.
for frame in (train, valid, test):
    for col in cols[2:]:
        frame[cols_dict[col]] = np.nan

In [12]:
# Register `progress_apply` on pandas objects so the row-wise passes below
# display a progress bar.
tqdm.pandas()

def expand_genres(row):
    """Flag every genre listed in a row's genre string.

    The ITEM_FEAT_COL value is a '|'-separated list of genre names. For each
    name, the matching renamed column (looked up in ``cols_dict``) is set to 1;
    columns of genres that are not listed are left untouched.

    Args:
        row: one DataFrame row (a Series); it is modified in place.

    Returns:
        The same row, so the function can be used with ``apply(axis=1)``.

    Raises:
        KeyError: if a genre name has no entry in ``cols_dict``.
    """
    for genre_name in row[ITEM_FEAT_COL].split('|'):
        target_col = cols_dict[genre_name]
        row[target_col] = 1
    return row

# Row-wise pass over each split (slow: one Python call per review).
train, valid, test = (
    frame.progress_apply(expand_genres, axis=1) for frame in (train, valid, test)
)

# The raw genre string is no longer needed once it has been expanded.
for frame in (train, valid, test):
    frame.drop(columns=[ITEM_FEAT_COL], inplace=True)

# 2 numeric features + 36 categorical features + label
all_data.shape, train.shape, valid.shape, test.shape

100%|██████████████████████████████| 4519730/4519730 [02:37<00:00, 28719.04it/s]
100%|████████████████████████████████| 562550/562550 [00:17<00:00, 31785.65it/s]
100%|████████████████████████████████| 530903/530903 [00:19<00:00, 27531.34it/s]


((5613183, 9), (4519730, 39), (562550, 39), (530903, 39))

In [None]:
import os

# `to_csv` does not create missing folders, so make sure the output directory
# exists before writing (otherwise the export fails after the long
# preprocessing above).
os.makedirs(DATA_DIR + 'lightgbm', exist_ok=True)

# Write each split without header row and without the index column.
for split_name, split_df in (('train', train), ('valid', valid), ('test', test)):
    split_df.to_csv(DATA_DIR + f'lightgbm/{split_name}_e.csv', header=False, index=False)

### Prepare to get ndcg@10, hit@10 for LightGBM

In [None]:
from multiprocessing import Process, Queue
import random

RANDOM_SEED = 42

# NOTE(review): N_WORKERS and NUM_NEG_SAMPLES are used by the sampling cell
# below but were not defined anywhere in this notebook, so that cell failed
# with a NameError.
# N_WORKERS = 5 matches the "5 workers" example in the text and the five
# progress symbols used by the workers.
# NUM_NEG_SAMPLES = 50 is an assumed value -- TODO confirm it against the
# number of negatives used when evaluating the other models, otherwise
# NDCG@10 / Hit@10 are not comparable.
N_WORKERS = 5
NUM_NEG_SAMPLES = 50

In [None]:
# One row per user; the ITEM_COL cell holds the list of items that user has in
# the training split.
user_item_pairs = train[[USER_COL, ITEM_COL]]
train_grp = user_item_pairs.groupby(USER_COL).agg(list)

# One row per distinct training item (its first occurrence), indexed by item
# id and reduced to the feature columns.
items_df = (
    train.drop_duplicates([ITEM_COL])
         .drop(columns=[USER_COL, RATING_COL])
         .set_index(ITEM_COL)
)

### Process for generating the test data positive and negative samples

Note that the entire process takes time even when using the multiprocessing module.

- Each worker takes a part of the test dataframe, so if there are 5 workers, each gets 1/5th of the dataframe; the last worker does a bit more work because it also handles the remaining rows at the end.
- Each selected review becomes the positive sample. NUM_NEG_SAMPLES negative samples are then drawn for that user.
- Each negative sample is unique and is an item not seen by the user who wrote the review. Both positive and negative samples are written in the required format.

In [None]:
# Every distinct item id that occurs in the training split.
items_set = set(train[ITEM_COL].unique())

# Handles of the worker processes; filled when the workers are launched.
processors = []

def sample_function(train_grp, test, items_df, cate_cols, i, num_workers, seed, data_dir,
                    num_neg_samples=None):
    """Write worker ``i``'s share of the test set, with sampled negatives, to a CSV file.

    The test frame is cut into ``num_workers`` contiguous slices; worker ``i``
    handles slice ``i`` and the last worker also takes the remainder rows.
    For every test row the positive (user, item) record is emitted first,
    followed by ``num_neg_samples`` *distinct* negative items drawn uniformly
    from the training items the user does not have in the training split
    (the positive item itself is excluded as well).

    Args:
        train_grp: DataFrame indexed by user id whose ITEM_COL cell holds the
            list of items that user has in the training split.
        test: test DataFrame with RATING_COL, USER_COL, ITEM_COL and every
            column named in ``cate_cols``.
        items_df: DataFrame indexed by item id that holds the ``cate_cols``
            features of each training item. Its index is the pool negatives
            are sampled from.
        cate_cols: names of the categorical feature columns to copy.
        i: zero-based index of this worker.
        num_workers: total number of workers.
        seed: base random seed. Every worker derives its own stream from
            ``seed`` and ``i``; ``None`` leaves the generator unseeded.
        data_dir: output root; the file is written to
            ``data_dir + 'lightgbm/amzn_e_tst_w_neg{i}.txt'``.
        num_neg_samples: negatives per positive. Defaults to the notebook-level
            ``NUM_NEG_SAMPLES``.

    Raises:
        ValueError: if a user has fewer than ``num_neg_samples`` eligible
            items (the sampling loop could otherwise never finish).
        KeyError: if a test user is missing from ``train_grp``.
    """
    if num_neg_samples is None:
        num_neg_samples = NUM_NEG_SAMPLES

    nrows = test.shape[0]
    each = nrows // num_workers
    start = i * each
    end = nrows if i == num_workers - 1 else (i + 1) * each

    # Progress marker of this worker; wrap around so that more than
    # len(sym) workers do not raise an IndexError.
    sym = ['!', '@', '#', '$', '%']
    marker = sym[i % len(sym)]

    # A private generator per worker: seeding every process identically would
    # make all workers consume the very same random stream.
    rng = random.Random() if seed is None else random.Random(f'{seed}:{i}')
    tst_w_neg_samples_path = data_dir + f'lightgbm/amzn_e_tst_w_neg{i}.txt'

    # Candidate pool, built once instead of once per test row. Sorting gives a
    # reproducible order for a given seed.
    item_pool = set(items_df.index)
    all_items = sorted(item_pool)

    user_col = []
    item_col = []
    rating_col = []
    feat_cols = [[] for _ in cate_cols]

    for j, row in test.iloc[start:end].iterrows():
        u = row[USER_COL]
        positive_item = row[ITEM_COL]

        # Items that must never be used as a negative for this row.
        excluded = set(train_grp.loc[u][ITEM_COL])
        excluded.add(positive_item)

        n_eligible = len(all_items) - len(excluded & item_pool)
        if n_eligible < num_neg_samples:
            raise ValueError(
                f'user {u} has only {n_eligible} eligible negative items, '
                f'{num_neg_samples} requested'
            )

        # Positive sample.
        user_col.append(int(u))
        item_col.append(int(positive_item))
        rating_col.append(row[RATING_COL])
        for feats, c in zip(feat_cols, cate_cols):
            feats.append(float(row[c]))

        # Negative samples: rejection sampling from the shared pool.
        neg_items = set()
        while len(neg_items) < num_neg_samples:
            neg_item = rng.choice(all_items)
            if neg_item in excluded or neg_item in neg_items:
                continue
            # Remember the draw so that the same item is not emitted twice.
            neg_items.add(neg_item)

            item_feats = items_df.loc[neg_item]
            user_col.append(int(u))
            item_col.append(int(neg_item))
            rating_col.append(5.0) # unused
            for feats, c in zip(feat_cols, cate_cols):
                feats.append(float(item_feats[c]))

        if j % 10_000 == 0:
            print(marker, end='')

    test_dict = {USER_COL: user_col, ITEM_COL: item_col, RATING_COL: rating_col}
    for feats, c in zip(feat_cols, cate_cols):
        test_dict[c] = feats
    X_test = pd.DataFrame(test_dict)
    X_test.to_csv(tst_w_neg_samples_path, header=False, index=False)

# Start one worker process per slice of the test set.
for worker_idx in range(N_WORKERS):
    worker = Process(
        target=sample_function,
        args=(train_grp, test, items_df, cate_cols, worker_idx, N_WORKERS, RANDOM_SEED, DATA_DIR),
    )
    worker.start()
    processors.append(worker)

# Wait until every worker has finished.
for worker in processors:
    worker.join()

# A crashed worker leaves its output file missing; report that here instead
# of letting the notebook continue silently.
failed = [(idx, proc.exitcode) for idx, proc in enumerate(processors) if proc.exitcode != 0]
if failed:
    raise RuntimeError(f'sampling workers failed (worker index, exit code): {failed}')