# Prepare dataset for xDeepFM

<b>xDeepFM</b> uses the Field-aware Factorization Machine (FFM) format as data input where each row in the dataset has the following: `<label> <field_index_id>:<feature_index_id>:<feature_value>`  
Each line represents one instance: `<label>` is the rating value, and it is followed by the numerical features (userID, itemID) and the categorical features, each encoded as a `<field_index_id>:<feature_index_id>:<feature_value>` triple.<br>

Features are divided into fields. For example, a user's gender is a field with three possible values: male, female and unknown. Occupation can be another field, which contains many more possible values than the gender field. <b>Both the field index and the feature index start from 1</b>.

In [2]:
import sys
sys.path.append('../../')

## Read the reviews dataset (same one as used by wide_n_deep model)

In [3]:
import pandas as pd
from recommenders.utils.constants import (
    DEFAULT_USER_COL as USER_COL,
    DEFAULT_ITEM_COL as ITEM_COL,
    DEFAULT_RATING_COL as RATING_COL,
    DEFAULT_GENRE_COL as ITEM_FEAT_COL,
)

import os

# Root directory of the data. The original location stays the default, but it can be overridden
# with the XDEEPFM_DATA_DIR environment variable so the notebook also runs on other machines.
DATA_DIR = os.environ.get('XDEEPFM_DATA_DIR', '/home/shiv/Documents/DataScience/Capstone/Data/')
# Later cells build paths as `DATA_DIR + 'xdeepfm/...'`, so make sure the separator is there.
if not DATA_DIR.endswith('/'):
    DATA_DIR += '/'

# The CSV is read without a header row; the column names are assigned explicitly below.
data = pd.read_csv(DATA_DIR + 'wide_deep/Electronics/wide_deep_amzn_e_20.csv', header=None, low_memory=False)
data.columns = [USER_COL, ITEM_COL, RATING_COL, ITEM_FEAT_COL,
                'unixTimeStamp', 'title', 'price', 'main_cat', 'category']

# All nine columns are kept: 'unixTimeStamp' is needed for the chronological sort and split below.

In [4]:
# Order the reviews by timestamp (ascending). The earlier alternative was a random shuffle:
# data.sample(frac=1., random_state=42).
data = data.sort_values(by='unixTimeStamp')

In [5]:
# Renumber the rows 0..n-1 after the sort; the previous index is discarded.
data = data.reset_index(drop=True)

In [6]:
# Show the frame's dimensions together with its first rows.
data.shape, data.head()

((5613183, 9),
    userID  itemID  rating                                              genre  \
 0   81212    5424     5.0  Car Electronics|Electronics|Portable Audio & V...   
 1  277508    5407     5.0  All Electronics|Electronics|Computers & Access...   
 2  281405    5462     4.0  All Electronics|Electronics|Computers & Access...   
 3  277481    5404     4.0  Home Audio & Theater|Electronics|Portable Audi...   
 4  215430    5463     3.0  All Electronics|Electronics|Computers & Access...   
 
    unixTimeStamp                                              title   price  \
 0      947462400  Motorola 53725 SLK Headset with Swivel Boom Mi...     $$$   
 1      958262400           Belkin CAT5e 3-Feet Cat 5E Network Cable   $5.07   
 2      959299200  Cisco-Linksys BEFSR41 EtherFast Cable/DSL Rout...   $2.46   
 3      959299200  GE 72887 Superadio III Portable AM/FM Radio (D...     $$$   
 4      961200000  D-Link DFE-530TX+ 10/100 Fast Ethernet Desktop...  $12.99   
 
               

## Find out all the genres used (for categorical columns)

- USER_COL, ITEM_COL are numeric columns

In [6]:
from tqdm import tqdm

# FFM layout: the label is the rating; the fields are USER_COL, ITEM_COL and then one field per
# genre (the genre fields are appended once the distinct genres are known).
cols = [USER_COL, ITEM_COL]
genres_set = set()

# Only the genre column is needed, so iterate over it directly instead of building a Series for
# every row with DataFrame.iterrows().
for genres in tqdm(data[ITEM_FEAT_COL], total=data.shape[0]):
    genres = genres.strip()
    assert genres != '', 'every row is expected to carry at least one genre'
    # Genres are '|'-separated within a row.
    genres_set.update(genres.split('|'))

100%|█████████████████████████████████████████████████████████████████████████████████████| 5613183/5613183 [01:11<00:00, 79034.27it/s]


In [7]:
# Sort the genres so the order of the genre fields is deterministic.
lst_genres = sorted(genres_set)

# Rebuild `cols` from scratch rather than appending to the existing list: appending would add
# the genre fields a second time whenever this cell is re-run.
cols = [USER_COL, ITEM_COL] + lst_genres
print(cols)

['userID', 'itemID', 'Accessories', 'Accessories & Supplies', 'All Electronics', 'Amazon Devices', 'Apple Products', 'Audio & Video Accessories', 'Automotive', 'Camera & Photo', 'Car & Vehicle Electronics', 'Car Electronics', 'Cell Phones & Accessories', 'Clothing, Shoes & Jewelry', 'Computer Accessories & Peripherals', 'Computer Components', 'Computers', 'Computers & Accessories', 'Controllers', 'Electrical', 'Electronics', 'GPS & Navigation', 'Home & Kitchen', 'Home Audio', 'Home Audio & Theater', 'Industrial & Scientific', 'Laptop Accessories', 'Musical Instruments', 'Office & School Supplies', 'Office Electronics', 'Office Products', 'Portable Audio & Accessories', 'Portable Audio & Video', 'Sports & Fitness', 'Sports & Outdoors', 'Tools & Home Improvement', 'Toys & Games', 'Video Games']


In [8]:
import pickle

# Persist the genre list so it can be reloaded without rescanning the dataset.
lst_genres_path = DATA_DIR + 'xdeepfm/lst_genres.pkl'
with open(lst_genres_path, 'wb') as pkl_file:
    pickle.dump(lst_genres, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

In [16]:
import pickle

# Reload the genre list and rebuild the field columns from it.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
lst_genres_path = DATA_DIR + 'xdeepfm/lst_genres.pkl'
with open(lst_genres_path, 'rb') as pkl_file:
    lst_genres = pickle.load(pkl_file)

# Fields: user, item, then one field per genre.
cols = [USER_COL, ITEM_COL]
cols.extend(lst_genres)

## Prepare the feature set

- Ref: https://github.com/Leavingseason/xDeepFM/tree/master/exdeepfm

In [9]:
from collections import Counter, defaultdict

# Count, for each genre, the number of rows that list it. Only the genre column is needed, so it
# is iterated directly instead of going through DataFrame.iterrows().
genre_row_cnt = Counter()
for genres in tqdm(data[ITEM_FEAT_COL], total=data.shape[0]):
    genres = genres.strip()
    assert genres != '', 'every row is expected to carry at least one genre'
    # set(): a genre repeated within one row still counts as a single row for that genre.
    genre_row_cnt.update(set(genres.split('|')))

# cat_feat_cnt maps '<genre>#1' / '<genre>#absent' to the number of rows in which the genre is
# present / missing. A key is only created when its count is non-zero, exactly as when the
# counters are incremented row by row.
cat_feat_cnt = defaultdict(int)
n_rows = data.shape[0]
for col in cols[2:]:
    n_present = genre_row_cnt[col]  # Counter returns 0 for a genre that was never seen
    if n_present > 0:
        cat_feat_cnt[col + '#1'] = n_present
    if n_rows > n_present:
        cat_feat_cnt[col + '#absent'] = n_rows - n_present

100%|█████████████████████████████████████████████████████████████████████████████████████| 5613183/5613183 [01:42<00:00, 55009.23it/s]


In [10]:
import math

feat_set = set()
# A "present" genre feature must occur in more than T rows; the assert below verifies that no
# infrequent feature exists, so none needs special handling.
T = 4

# Every distinct user and every distinct item is a feature of its own.
feat_set.update('user#' + str(user) for user in data[USER_COL].unique())
feat_set.update('item#' + str(item) for item in data[ITEM_COL].unique())

# Each genre field contributes '<genre>#1' when some row lists the genre and '<genre>#absent'
# when some row does not. Only the genre column is read, so iterate over it directly.
genre_cols = cols[2:]
for genres in tqdm(data[ITEM_FEAT_COL], total=data.shape[0]):
    genres = genres.strip()
    assert genres != '', 'every row is expected to carry at least one genre'
    present = set(genres.split('|'))  # set gives O(1) membership tests in the loop below

    for col in genre_cols:
        if col in present:
            feat = col + '#1'
            assert cat_feat_cnt[feat] > T, f'feature {feat!r} occurs at most {T} times'
            feat_set.add(feat)
        else:
            feat_set.add(col + '#absent')

100%|█████████████████████████████████████████████████████████████████████████████████████| 5613183/5613183 [02:02<00:00, 45844.17it/s]


In [11]:
# Number of distinct features collected in feat_set.
len(feat_set)

894465

In [12]:
import pickle

# Persist the feature set so it can be reloaded without rescanning the dataset.
feat_set_path = DATA_DIR + 'xdeepfm/feat_set.pkl'
with open(feat_set_path, 'wb') as pkl_file:
    pickle.dump(feat_set, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

In [8]:
# Reload the previously saved feature set.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
feat_set_path = DATA_DIR + 'xdeepfm/feat_set.pkl'
with open(feat_set_path, 'rb') as pkl_file:
    feat_set = pickle.load(pkl_file)

## Split into train, valid, test using python_chrono_split

In [10]:
from recommenders.datasets.python_splitters import python_chrono_split

In [11]:
# Split ratios for train / valid / test; the split is ordered by the 'unixTimeStamp' column.
split_ratios = [0.8, 0.1, 0.1]
train, valid, test = python_chrono_split(data, split_ratios, col_timestamp='unixTimeStamp')

In [12]:
# Dimensions of the three splits.
train.shape, valid.shape, test.shape

((4519730, 9), (562550, 9), (530903, 9))

#### Using python_chrono_split gives a better distribution of users and items across the splits

- Note that we don't have enough reviewers that have given more than two reviews.

In [16]:
# Number of distinct users in each split.
tuple(split_df['userID'].nunique() for split_df in (train, valid, test))

(830668, 426100, 418307)

In [17]:
# Number of distinct items in each split.
tuple(split_df['itemID'].nunique() for split_df in (train, valid, test))

(63725, 57569, 53917)

#### When compared to simply chopping off the datasets into 80:10:10 split

In [18]:
# Row boundaries of a positional 80:10:10 split: rows before train_line_cnt, rows from
# train_line_cnt up to val_line_cnt, and the remaining rows.
total_line_cnt = len(data)
train_line_cnt, val_line_cnt = (int(frac * total_line_cnt) for frac in (0.8, 0.9))
print("total", total_line_cnt, train_line_cnt, val_line_cnt)

total 5613183 4490546 5051864


In [20]:
# Distinct users / items in the first train_line_cnt rows.
naive_train = data.iloc[:train_line_cnt]
naive_train['userID'].nunique(), naive_train['itemID'].nunique()

(759281, 63314)

In [22]:
# Distinct users / items in the rows between train_line_cnt and val_line_cnt.
naive_valid = data.iloc[train_line_cnt:val_line_cnt]
naive_valid['userID'].nunique(), naive_valid['itemID'].nunique()

(248228, 48176)

In [23]:
# Distinct users / items in the rows from val_line_cnt onwards.
naive_test = data.iloc[val_line_cnt:]
naive_test['userID'].nunique(), naive_test['itemID'].nunique()

(250479, 44203)

### Back to building the FFM format for train, valid, test

In [18]:
# Assign 1-based feature ids in sorted feature order. Iterating the set directly would make the
# ids depend on set iteration order, which for strings is not stable across Python processes
# (string hashing is randomised per process unless PYTHONHASHSEED is fixed), so a rebuilt index
# would not match FFM files written in an earlier session.
feat_index = {feat: index for index, feat in enumerate(sorted(feat_set), start=1)}
print('feat dict num:', len(feat_index))

feat dict num: 894465


In [19]:
# 1-based field ids: one field per entry of `cols`, in list order.
field_list = cols
field_index = {field: position for position, field in enumerate(field_list, start=1)}
print('field dict num:', len(field_index))

field dict num: 38


In [20]:
import pickle

# Save the two lookup dictionaries (feature -> id, field -> id) for model serving.
for rel_path, mapping in (('xdeepfm/feat_index.pkl', feat_index),
                          ('xdeepfm/field_index.pkl', field_index)):
    with open(DATA_DIR + rel_path, 'wb') as pkl_file:
        pickle.dump(mapping, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

In [9]:
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
    with open(path, 'rb') as pkl_file:
        return pickle.load(pkl_file)

# Reload the two lookup dictionaries saved earlier.
feat_index = _load_pickle(DATA_DIR + 'xdeepfm/feat_index.pkl')
field_index = _load_pickle(DATA_DIR + 'xdeepfm/field_index.pkl')

In [21]:
# Output files, one per split.
dst_train_path, dst_valid_path, dst_test_path = (
    DATA_DIR + rel_path
    for rel_path in ('xdeepfm/amzn_e_train.txt',
                     'xdeepfm/amzn_e_valid.txt',
                     'xdeepfm/amzn_e_test.txt')
)

def gen_ffm_data(path, df):
    """Write `df` to `path` in FFM format, one text line per row.

    Each line is ``<label> <field>:<feature>:1 ...``: the integer rating, then the user field,
    the item field and one entry per genre field (``<genre>#1`` when the row lists the genre,
    ``<genre>#absent`` otherwise).

    Reads the notebook-level `field_index`, `feat_index` and `cols`.

    Args:
        path: destination text file; overwritten if it already exists.
        df: frame with the RATING_COL, USER_COL, ITEM_COL and ITEM_FEAT_COL columns.

    Raises:
        KeyError: if a user, item or genre feature is missing from `feat_index`.
    """
    genre_cols = cols[2:]
    user_field = str(field_index[USER_COL])
    item_field = str(field_index[ITEM_COL])

    # Only four columns are needed, so zip them instead of building a Series per row with
    # DataFrame.iterrows().
    rows = zip(df[RATING_COL], df[USER_COL], df[ITEM_COL], df[ITEM_FEAT_COL])

    # `with` closes the file even if a lookup fails part-way through.
    with open(path, 'w') as out_f:
        for rating, user, item, genres in tqdm(rows, total=df.shape[0]):
            feats = [
                str(int(rating)),
                user_field + ':' + str(feat_index['user#' + str(user)]) + ':1',
                item_field + ':' + str(feat_index['item#' + str(item)]) + ':1',
            ]

            genres = genres.strip()
            assert genres != '', 'every row is expected to carry at least one genre'
            present = set(genres.split('|'))  # O(1) membership tests below

            for col in genre_cols:
                feat = col + '#1' if col in present else col + '#absent'
                feats.append(str(field_index[col]) + ':' + str(feat_index[feat]) + ':1')

            out_f.write(' '.join(feats) + '\n')
# Write each split to its own FFM file.
for dst_path, split_df in ((dst_train_path, train),
                           (dst_valid_path, valid),
                           (dst_test_path, test)):
    gen_ffm_data(dst_path, split_df)

100%|█████████████████████████████████████████████████████████████████████████████████████| 4519730/4519730 [02:25<00:00, 31025.90it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 562550/562550 [00:18<00:00, 30613.89it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 530903/530903 [00:17<00:00, 30781.72it/s]


## Generate the test data samples for ndcg, hit rate

In [13]:
# Items each user interacted with in the training split: userID -> list of itemIDs.
train_grp = train[['userID', 'itemID']].groupby('userID').agg(list)
# Every item id that appears anywhere in the dataset.
all_items_set = set(data['itemID'].unique())

# Genre lookup, itemID -> genre string (first occurrence of each item is kept). It is built from
# `data` rather than `train` so that it covers every id in `all_items_set`; an item that never
# appears in the training split would otherwise raise a KeyError when looked up.
items_df = (
    data[[ITEM_COL, ITEM_FEAT_COL]]
    .drop_duplicates([ITEM_COL])
    .set_index(ITEM_COL)
)

In [14]:
from multiprocessing import Process, Queue
import random

# Number of negative items sampled for each test row.
NUM_NEG_SAMPLES = 50
# Seed handed to the sampling workers.
RANDOM_SEED = 42
# Number of worker processes the test set is divided between.
N_WORKERS = 5

In [18]:
# Handles of the worker processes started further down in this cell.
processors = []

def sample_function(train_grp, test, items_df, field_index, feat_index, cols, i, num_workers, seed, data_dir,
                    all_items=None, num_neg_samples=None):
    """Write worker `i`'s share of the test rows, each followed by sampled negatives, in FFM format.

    The test rows are divided into `num_workers` contiguous shards; the last shard also takes
    the remainder. For every row of the shard one positive line (label = integer rating) is
    written, followed by `num_neg_samples` lines with label 0 for distinct items that the user
    has not interacted with in training and that differ from the positive item. Output goes to
    ``<data_dir>xdeepfm/amzn_e_tst_w_neg<i>.txt``.

    Args:
        train_grp: frame indexed by user id whose ITEM_COL cell is the list of training items.
        test: test split with the RATING_COL, USER_COL, ITEM_COL and ITEM_FEAT_COL columns.
        items_df: frame indexed by item id with an ITEM_FEAT_COL column.
        field_index: field name -> field id.
        feat_index: feature name -> feature id.
        cols: field names; entries from position 2 onwards are the genre fields.
        i: zero-based index of this worker.
        num_workers: total number of workers.
        seed: base random seed; worker `i` seeds its own generator with ``seed + i``.
        data_dir: output root, ending with a path separator.
        all_items: candidate item ids for negatives; defaults to the notebook-level
            `all_items_set`.
        num_neg_samples: negatives per test row; defaults to the notebook-level
            `NUM_NEG_SAMPLES`.

    Raises:
        ValueError: if a user has fewer than `num_neg_samples` eligible negative items.
        KeyError: if a user, item or feature is missing from one of the lookup structures.
    """
    if all_items is None:
        all_items = all_items_set
    if num_neg_samples is None:
        num_neg_samples = NUM_NEG_SAMPLES

    genre_cols = cols[2:]
    user_field = str(field_index[USER_COL])
    item_field = str(field_index[ITEM_COL])

    def ffm_line(label, user, item, genres):
        """Return one FFM text line (newline included) for a user/item pair."""
        genres = genres.strip()
        assert genres != '', 'every item is expected to carry at least one genre'
        present = set(genres.split('|'))
        feats = [
            label,
            user_field + ':' + str(feat_index['user#' + str(user)]) + ':1',
            item_field + ':' + str(feat_index['item#' + str(item)]) + ':1',
        ]
        for col in genre_cols:
            feat = col + '#1' if col in present else col + '#absent'
            feats.append(str(field_index[col]) + ':' + str(feat_index[feat]) + ':1')
        return ' '.join(feats) + '\n'

    nrows = test.shape[0]
    each = nrows // num_workers
    start = i * each
    end = nrows if i == num_workers - 1 else (i + 1) * each
    sym = ['!', '@', '#', '$', '%']
    shard = test.iloc[start:end]
    print(i, start, end, shard.shape)

    # A private generator seeded per worker: seeding every worker identically would make them
    # all draw the same sequence of random choices.
    rng = random.Random(seed + i)

    # Build the candidate pool once, in a fixed order, and reject unsuitable draws below instead
    # of rebuilding a list of all unseen items for every test row.
    candidates = sorted(set(all_items))
    candidate_set = set(candidates)

    tst_w_neg_samples_path = data_dir + f'xdeepfm/amzn_e_tst_w_neg{i}.txt'
    rows = zip(shard[RATING_COL], shard[USER_COL], shard[ITEM_COL], shard[ITEM_FEAT_COL])

    j = 0
    line_cnt = 0
    # `with` closes the file even if a lookup fails part-way through.
    with open(tst_w_neg_samples_path, 'w') as out_f:
        for rating, u, positive_item, genres in rows:
            out_f.write(ffm_line(str(int(rating)), u, positive_item, genres))
            line_cnt += 1

            # Items that must not be used as negatives for this row.
            excluded = set(train_grp.loc[u][ITEM_COL])
            excluded.add(positive_item)
            n_available = len(candidates) - len(excluded & candidate_set)
            if n_available < num_neg_samples:
                # Without this check the rejection loop below would never terminate.
                raise ValueError(
                    f'user {u} has only {n_available} eligible negative items, '
                    f'{num_neg_samples} requested'
                )

            neg_items = set()
            while len(neg_items) < num_neg_samples:
                neg_item = rng.choice(candidates)
                if neg_item in excluded or neg_item in neg_items:
                    continue
                neg_items.add(neg_item)
                out_f.write(ffm_line('0', u, neg_item, items_df.at[neg_item, ITEM_FEAT_COL]))
                line_cnt += 1

            j += 1
            if j % 10_000 == 0:
                # One positive plus num_neg_samples negatives per processed test row.
                assert line_cnt == j * (num_neg_samples + 1)
                # Per-worker progress marker; wrap around when there are more workers than symbols.
                print(sym[i % len(sym)], end='')
    
    
# Start one worker process per shard of the test set.
for i in range(N_WORKERS):
    processors.append(
        Process(
            target = sample_function,
            args = (train_grp, test, items_df,
                    field_index, feat_index, cols, i, N_WORKERS, RANDOM_SEED, DATA_DIR)
        ))
    processors[-1].start()

# Wait for all workers to finish.
for proc in processors:
    proc.join()

# An exception inside a worker only ends that process, leaving an incomplete output file
# behind, so surface failures here instead of continuing silently.
failed_workers = [idx for idx, proc in enumerate(processors) if proc.exitcode != 0]
if failed_workers:
    raise RuntimeError(f'negative-sampling workers failed: {failed_workers}')

0 10  106180106180 (106180, 9) 2
 212360 3212360(106180, 9)  
318540318540  (106180, 9)424720
4  (106180, 9)424720 
530903 (106183, 9)
$@%#!%#@!$#%!@$#@%!$#@!$%!#@$%!#@$%#!@$%!#@$%!#@$%

In [55]:
# Dimensions of the test split.
test.shape

(530903, 9)

# Old code!!!

In [16]:
# WARNING: legacy cell. It writes a purely positional split (first train_line_cnt rows, then up
# to val_line_cnt, then the rest) to the SAME three paths that gen_ffm_data() writes the
# python_chrono_split output to, so running it overwrites those files.
line_cnt = 0

dst_train_path = DATA_DIR + 'xdeepfm/amzn_e_train.txt'
dst_valid_path = DATA_DIR + 'xdeepfm/amzn_e_valid.txt'
dst_test_path =  DATA_DIR + 'xdeepfm/amzn_e_test.txt'

# `with` closes all three files even if a lookup fails part-way through.
with open(dst_train_path, 'w') as out_train, \
        open(dst_valid_path, 'w') as out_valid, \
        open(dst_test_path, 'w') as out_test:
    out = out_train  # file currently being written; switched at the split boundaries below

    for _, row in data.iterrows():
        feats = []
        feats.append(str(int(row['rating'])))

        feats.append(str(field_index[USER_COL]) + ':' + str(feat_index['user#' + str(row[USER_COL])]) + ':1')
        feats.append(str(field_index[ITEM_COL]) + ':' + str(feat_index['item#' + str(row[ITEM_COL])]) + ':1')

        genres = row[ITEM_FEAT_COL]
        genres = genres.strip()
        assert genres != '', 'every row is expected to carry at least one genre'
        genres = set(genres.split('|'))

        for col in cols[2:]:
            if col not in genres:
                feat = col + '#absent'
            else:
                feat = col + '#1'
            feats.append(str(field_index[col]) + ':' + str(feat_index[feat]) + ':1')

        out.write(' '.join(feats) + '\n')

        line_cnt += 1
        # Progress marker: '*' every 100,000 lines, '!' every 1,000,000 lines.
        if line_cnt % 100000 == 0:
            if line_cnt % 1000000 == 0:
                print('!', end='')
            else:
                print('*', end='')
        # Move on to the next output file once the current split has received all its lines.
        if (line_cnt >= train_line_cnt) and (out is out_train):
            out = out_valid
        elif (line_cnt >= val_line_cnt) and (out is out_valid):
            out = out_test

*********!*********!*********!*********!*********!******

In [18]:
# One row per distinct user id (first occurrence), renumbered from 0.
users = data[[USER_COL]].drop_duplicates(USER_COL).reset_index(drop=True)
users.shape

(830668, 1)

In [19]:
# One row per distinct item id (first occurrence) with its genre string, renumbered from 0.
items = data[[ITEM_COL, ITEM_FEAT_COL]].drop_duplicates(ITEM_COL).reset_index(drop=True)
items.shape

(63725, 2)

## Test set for ranking (only for movielens)

In [16]:
# One row per distinct user id (first occurrence), renumbered from 0.
users = data[[USER_COL]].drop_duplicates(USER_COL).reset_index(drop=True)
users.shape

(943, 1)

In [17]:
# One row per distinct item id (first occurrence) with its genre string, renumbered from 0.
items = data[[ITEM_COL, ITEM_FEAT_COL]].drop_duplicates(ITEM_COL).reset_index(drop=True)
items.shape

(1682, 2)

### The following merge will fail for the Electronics dataset (a cross join of 830,668 users × 63,725 items is about 5.3 × 10^10 rows)!

In [19]:
# Cross join: pair every user with every item by merging on a constant helper column, which is
# removed again afterwards. `users` and `items` themselves are left without the helper column.
users_items = (
    users.assign(key=1)
    .merge(items.assign(key=1), on="key")
    .drop(columns="key")
)

users_items

Unnamed: 0,userID,itemID,genre
0,31,682,Horror|Mystery|Thriller
1,31,15,Drama
2,31,317,Drama
3,31,678,Drama|Thriller
4,31,66,Comedy|Romance
...,...,...,...
1586121,107,1579,Thriller
1586122,107,830,Action|Mystery|Thriller
1586123,107,1569,Comedy|Drama
1586124,107,1236,Drama


In [20]:
# Keep only the (user, item) pairs that do not already occur in `data`.
observed_pairs = data.set_index([USER_COL, ITEM_COL]).index
candidate_pairs = users_items.set_index([USER_COL, ITEM_COL]).index
users_items = users_items.loc[~candidate_pairs.isin(observed_pairs)]

In [21]:
# Display the user/item pairs that remain after the filter above.
users_items

Unnamed: 0,userID,itemID,genre
1,31,15,Drama
2,31,317,Drama
3,31,678,Drama|Thriller
4,31,66,Comedy|Romance
5,31,88,Comedy|Romance
...,...,...,...
1586121,107,1579,Thriller
1586122,107,830,Action|Mystery|Thriller
1586123,107,1569,Comedy|Drama
1586124,107,1236,Drama


In [23]:
dst_test_nolabel = DATA_DIR + 'movielens_100k_test_no_label.txt'

# Start the counter here so this cell does not depend on a `line_cnt` left over from another cell.
line_cnt = 0

# Write one label-free FFM line per (user, item) pair: the user field, the item field and one
# entry per genre field. `with` closes the file even if a lookup fails part-way through.
with open(dst_test_nolabel, 'w') as out:
    for _, row in users_items.iterrows():
        feats = []

        feats.append(str(field_index[USER_COL]) + ':' + str(feat_index['user#' + str(row[USER_COL])]) + ':1')
        feats.append(str(field_index[ITEM_COL]) + ':' + str(feat_index['item#' + str(row[ITEM_COL])]) + ':1')

        genres = row[ITEM_FEAT_COL]
        genres = genres.strip()
        assert genres != '', 'every item is expected to carry at least one genre'
        genres = set(genres.split('|'))

        for col in cols[2:]:
            if col not in genres:
                feat = col + '#absent'
            else:
                feat = col + '#1'
            feats.append(str(field_index[col]) + ':' + str(feat_index[feat]) + ':1')

        out.write(' '.join(feats) + '\n')

        line_cnt += 1  # number of lines written so far
