In [2]:
import numpy as np
import pandas as pd 
import pickle
import gzip
# gzip for reading the compressed file
from time import time
from pathlib import Path
from scipy.sparse import save_npz
# save_npz for saving the sparse matrix
from joblib import Parallel, delayed
# Parallel for parallel computing
import os

In [3]:
# Root directory that holds the raw download and every file written below.
data_path = Path("data")
print(data_path)

data


In [4]:
# Path of the gzipped review dump inside the data directory.
reviews = data_path.joinpath('reviews_Movies_and_TV_5.json.gz')
print(reviews)

data/reviews_Movies_and_TV_5.json.gz


In [14]:
# Read the line-delimited JSON reviews, keep only the four columns used
# downstream and give them short, generic names.
keep_cols = ['reviewerID', 'asin', 'unixReviewTime', 'overall']
new_colnames = ['user', 'item', 'timestamp', 'rating']
df = (
    pd.read_json(reviews, lines=True)
    .loc[:, keep_cols]
    .rename(columns=dict(zip(keep_cols, new_colnames)))
)
df.head()

Unnamed: 0,user,item,timestamp,rating
0,ADZPIG9QOCDG5,5019281,1203984000,4
1,A35947ZP82G7JH,5019281,1388361600,3
2,A3UORV8A9D5L2E,5019281,1388361600,3
3,A1VKW06X1O2X7V,5019281,1202860800,5
4,A3R27T4HADWFFJ,5019281,1387670400,4


In [15]:
# Number of reviews per rating value.
df["rating"].value_counts()

rating
5    906608
4    382994
3    201302
1    104219
2    102410
Name: count, dtype: int64

In [16]:
# Per-user chronological position of each review: the earliest timestamp
# gets rank 1, and reviews sharing a timestamp share a rank ('dense').
user_timestamps = df.groupby("user")["timestamp"]
df['rank'] = user_timestamps.rank(method='dense', ascending=True)
# Only the rank is kept from here on, so the raw timestamp is dropped.
df = df.drop(columns="timestamp")
df.head()

Unnamed: 0,user,item,rating,rank
0,ADZPIG9QOCDG5,5019281,4,2.0
1,A35947ZP82G7JH,5019281,3,1.0
2,A3UORV8A9D5L2E,5019281,3,3.0
3,A1VKW06X1O2X7V,5019281,5,1.0
4,A3R27T4HADWFFJ,5019281,4,2.0


In [17]:
# Wider peek at the frame: the first 20 rows.
df.iloc[:20]

Unnamed: 0,user,item,rating,rank
0,ADZPIG9QOCDG5,5019281,4,2.0
1,A35947ZP82G7JH,5019281,3,1.0
2,A3UORV8A9D5L2E,5019281,3,3.0
3,A1VKW06X1O2X7V,5019281,5,1.0
4,A3R27T4HADWFFJ,5019281,4,2.0
5,A2L0G56BNOTX6S,5019281,5,4.0
6,A5NYUBEKXFLX5,5019281,5,7.0
7,A2DJ8B8GE4V2VD,5019281,5,2.0
8,AWF2S3UNW9UA0,5019281,5,23.0
9,A3O4UUT83DG3OU,5019281,5,2.0


In [18]:
# Re-encode user and item ids as contiguous integers (0..n-1), numbered in
# order of first appearance in the frame.
unique_users = df['user'].unique()
unique_items = df['item'].unique()
user_mappings = dict(zip(unique_users, range(len(unique_users))))
item_mappings = dict(zip(unique_items, range(len(unique_items))))
df = df.assign(
    user=df['user'].map(user_mappings),
    item=df['item'].map(item_mappings),
)[['user', 'item', 'rank', 'rating']].astype(np.int64)
n_users = df['user'].nunique()
n_items = df['item'].nunique()
df.head()

Unnamed: 0,user,item,rank,rating
0,0,0,2,4
1,1,0,1,3
2,2,0,3,3
3,3,0,1,5
4,4,0,2,4


In [19]:
# Work on a copy ordered by user and, within each user, by temporal rank.
dfc = (
    df.copy()
    .sort_values(['user', 'rank'], ascending=[True, True])
    .reset_index(drop=True)
)

# Leave-one-out split: the last row of every user goes to the test set and
# every other row to the training set.
test = dfc.groupby('user').tail(1)
# After the outer merge, rows without a match in `test` have a null
# `rating_y`; those are exactly the training rows.
merged = dfc.merge(test, on=['user', 'item'], how='outer', suffixes=('', '_y'))
train = merged.loc[merged['rating_y'].isnull(), ['user', 'item', 'rating']]
test = test.loc[:, ['user', 'item', 'rating']]
print(train.shape, test.shape)

(1573573, 3) (123960, 3)


In [20]:
# Inputs for negative sampling: the full item catalogue, plus one list per
# user (in ascending user-id order) of the items that user has rated.
all_items = dfc['item'].unique()
items_by_user = dfc.groupby("user")['item'].apply(list)
rated_items = items_by_user.tolist()

def sample_not_rated(item_list, rseed=1, n=99, candidates=None):
    """Draw ``n`` items that do not appear in ``item_list``.

    Parameters
    ----------
    item_list : array-like
        Items already rated by one user; they are excluded from the draw.
    rseed : int, default 1
        Non-negative base seed. It is combined with a checksum of
        ``item_list`` so that repeated calls with the same arguments return
        the same sample, while users with different rating histories get
        different random streams even when they share ``rseed``.
    n : int, default 99
        Number of items to draw.
    candidates : array-like, optional
        Pool of items to sample from. When omitted, the notebook-level
        ``all_items`` array is used.

    Returns
    -------
    numpy.ndarray
        ``n`` items from the pool that are not in ``item_list``. The items
        are distinct whenever at least ``n`` unrated items exist; otherwise
        sampling falls back to drawing with replacement so the result still
        has length ``n``.

    Raises
    ------
    ValueError
        If no unrated item is left to sample from.
    """
    import zlib  # local import: only needed to derive the per-call seed

    pool = all_items if candidates is None else candidates
    rated = np.asarray(item_list)
    not_rated = np.setdiff1d(pool, rated)

    # The previous `np.random.seed = rseed` overwrote NumPy's seeding function
    # instead of calling it. A private generator gives reproducible draws
    # without touching (or depending on) the global random state.
    history_key = zlib.crc32(rated.tobytes())
    rng = np.random.default_rng([rseed, history_key])

    # Avoid duplicate negatives unless the pool is too small to supply n
    # distinct items.
    return rng.choice(not_rated, n, replace=not_rated.size < n)

print("sampling not rated items...")
start = time()
sampling_jobs = (delayed(sample_not_rated)(ri) for ri in rated_items)
non_rated_items = Parallel(n_jobs=4)(sampling_jobs)
end = time() - start
print("sampling took {} min".format(round(end/60,2)))

# Long-format frame of the sampled negatives: the position of each sample in
# `non_rated_items` is used as the user id, and every negative gets rating 0.
negative = pd.DataFrame({
    'user': np.repeat(np.arange(len(non_rated_items), dtype=np.int64), 99),
    'item': np.asarray(non_rated_items).reshape(-1),
    'rating': 0,
})
assert negative.shape[0] == len(non_rated_items)*99

# Stack the held-out positives on top of the negatives, then order each
# user's rows by descending rating so the rated item comes first within
# every group of 100. The evaluation relies on that layout.
test_negative = (
    pd.concat([test, negative])
    .sort_values('user', ascending=True)
    .reset_index(drop=True)
    .sort_values(['user', 'rating'], ascending=[True, False])
)
first_of_each_user = test_negative['rating'].to_numpy()[0::100]
assert np.all(first_of_each_user != 0)

sampling not rated items...
sampling took 1.81 min


In [21]:
# Spot-check one randomly chosen user: none of the sampled negatives may be
# an item the user actually rated, in either the train or the test split.
# np.random.randint excludes `high`, so n_users (not n_users-1) is passed to
# make every user id in 0..n_users-1 eligible, including the last one.
user_id = np.random.randint(0, n_users)
user_rows = test_negative[test_negative.user == user_id]
items_rated = user_rows[user_rows.rating != 0]['item'].tolist()
items_rated += train[train.user == user_id]['item'].tolist()
items_never_rated = user_rows[user_rows.rating == 0]['item'].tolist()
assert len(np.intersect1d(items_rated, items_never_rated)) == 0

In [24]:
import scipy.sparse as sp

In [25]:
def array2mtx(interactions):
    """Build a sparse user x item rating matrix from interaction rows.

    Parameters
    ----------
    interactions : array-like of shape (n, 3)
        One ``(user, item, rating)`` row per interaction. User and item ids
        are expected to be non-negative integers (integer-valued floats are
        accepted and cast).

    Returns
    -------
    scipy.sparse.csr_matrix
        ``float32`` matrix of shape ``(max_user + 1, max_item + 1)`` with
        ``mat[user, item] = rating``. If a ``(user, item)`` pair occurs more
        than once the last row wins, and a rating of 0 leaves the cell
        empty. This matches element-by-element assignment into a DOK matrix.

    Raises
    ------
    ValueError
        If ``interactions`` contains no rows.
    """
    interactions = np.asarray(interactions)
    if interactions.size == 0:
        raise ValueError("interactions must contain at least one (user, item, rating) row")

    users = interactions[:, 0].astype(np.int64)
    items = interactions[:, 1].astype(np.int64)
    ratings = interactions[:, 2].astype(np.float32)
    n_rows = int(users.max()) + 1
    n_cols = int(items.max()) + 1

    # The CSR constructor sums duplicate coordinates, so keep only the last
    # occurrence of each (user, item) pair: np.unique on the reversed keys
    # returns the first hit from the end, i.e. the last one in input order.
    flat_keys = users * n_cols + items
    _, first_in_reversed = np.unique(flat_keys[::-1], return_index=True)
    keep = flat_keys.size - 1 - first_in_reversed
    # Zero ratings are not stored as explicit entries.
    keep = keep[ratings[keep] != 0]

    # One vectorised construction replaces a Python-level loop that assigned
    # every interaction into a dict-backed DOK matrix one at a time.
    return sp.csr_matrix(
        (ratings[keep], (users[keep], items[keep])),
        shape=(n_rows, n_cols),
        dtype=np.float32,
    )

In [26]:
# Convert the training interactions into a sparse user x item matrix.
print("saving training set as sparse matrix...")
train_mtx = array2mtx(train.to_numpy())

saving training set as sparse matrix...


In [27]:
# Persist the split as a single .npz archive: the dense train/test arrays,
# the test set padded with negatives, the raw negative samples and the
# user/item counts.
split_arrays = dict(
    train=train.to_numpy(),
    test=test.to_numpy(),
    test_negative=test_negative.to_numpy(),
    negatives=np.array(non_rated_items),
    n_users=n_users,
    n_items=n_items,
)
np.savez(data_path/"neuralcf_split.npz", **split_arrays)

# Also store the training interactions as a sparse matrix.
print("saving training set as sparse matrix...")
train_mtx = array2mtx(train.to_numpy())
save_npz(data_path/"neuralcf_train_sparse.npz", train_mtx)

saving training set as sparse matrix...
