In [122]:
import os
import shutil
import sys

import numpy as np
from scipy import sparse


import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sn
sn.set()

import pandas as pd
import implicit

#   Data preprocessing

In [118]:
# Just choose the name of the dataset directory
dataset  = 'ml-20m'
DATA_DIR = '/Users/tomas/Documents/FEUP/Tese/data/' + dataset

Code for movie lens

In [119]:
raw_data = pd.read_csv(os.path.join(DATA_DIR, 'ratings.csv'), header=0)

In [120]:
# binarize the data (only keep ratings >= 4)
raw_data = raw_data[raw_data['rating'] > 3.5]

In [121]:
raw_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
6,1,151,4.0,1094785734
7,1,223,4.0,1112485573
8,1,253,4.0,1112484940
9,1,260,4.0,1112484826
10,1,293,4.0,1112484703


## Data spliting

In [123]:
# Count the number of unique items/users
# returns id, count 
def get_count(tp, id):
    playcount_groupbyid = tp[[id]].groupby(id, as_index=False)
    count = playcount_groupbyid.size()
    return count

In [126]:
# Triplet: user_id, item_id, rating

def filter_triplets(tp, min_uc=5, min_sc=5):
    
    # Only keep the triplets for items which were clicked on by at least min_sc users (5). 
    if min_sc > 0:
        itemcount = get_count(tp, 'movieId')
        tp = tp[tp['movieId'].isin(itemcount.index[itemcount >= min_sc])]
    
    # Only keep the triplets for users who clicked on at least min_uc items (5)
    # After doing this, some of the items will have less than min_uc users, but should only be a small proportion
    if min_uc > 0:
        usercount = get_count(tp, 'userId')
        tp = tp[tp['userId'].isin(usercount.index[usercount >= min_uc])]
    
    # Update both usercount and itemcount after filtering
    usercount, itemcount = get_count(tp, 'userId'), get_count(tp, 'movieId') 
    return tp, usercount, itemcount

In [127]:
raw_data, user_activity, item_popularity = filter_triplets(raw_data)

In [128]:
sparsity = 1. * raw_data.shape[0] / (user_activity.shape[0] * item_popularity.shape[0])

print("After filtering, there are %d watching events from %d users and %d movies (sparsity: %.3f%%)" % 
      (raw_data.shape[0], user_activity.shape[0], item_popularity.shape[0], sparsity * 100))

After filtering, there are 9977455 watching events from 136674 users and 13681 movies (sparsity: 0.534%)


In [132]:
unique_uid

Int64Index([ 76157,  11200,  62088,  76768,  90791,  52511,  76921,   1617,
             58162,  93045,
            ...
            121719, 110463,  20837,  37745, 117130,   2082,  38310, 128557,
             70001, 136726],
           dtype='int64', name='userId', length=136674)

In [131]:
#randomize the users

unique_uid = user_activity.index

np.random.seed(98765)
idx_perm = np.random.permutation(unique_uid.size)
unique_uid = unique_uid[idx_perm]

In [18]:
unique_uid.size

136674

In [19]:
# create train/validation/test users
n_users = unique_uid.size
n_heldout_users = 10000   

tr_users = unique_uid[:(n_users - n_heldout_users * 2)]
vd_users = unique_uid[(n_users - n_heldout_users * 2): (n_users - n_heldout_users)]
te_users = unique_uid[(n_users - n_heldout_users):]

In [20]:
# Train triplets
train_plays = raw_data.loc[raw_data['userId'].isin(tr_users)]

In [21]:
#movies id
unique_sid = pd.unique(train_plays['movieId'])

In [22]:
show2id = dict((sid, i) for (i, sid) in enumerate(unique_sid))
profile2id = dict((pid, i) for (i, pid) in enumerate(unique_uid))

In [23]:
pro_dir = os.path.join(DATA_DIR, 'pro_sg')

if not os.path.exists(pro_dir):
    os.makedirs(pro_dir)

with open(os.path.join(pro_dir, 'unique_sid.txt'), 'w') as f:
    for sid in unique_sid:
        f.write('%s\n' % sid)

In [24]:
def split_train_test_proportion(data, test_prop=0.2):
    data_grouped_by_user = data.groupby('userId')
    tr_list, te_list = list(), list()

    np.random.seed(98765)

    for i, (_, group) in enumerate(data_grouped_by_user):
        n_items_u = len(group)

        if n_items_u >= 5:
            idx = np.zeros(n_items_u, dtype='bool')
            idx[np.random.choice(n_items_u, size=int(test_prop * n_items_u), replace=False).astype('int64')] = True

            tr_list.append(group[np.logical_not(idx)])
            te_list.append(group[idx])
        else:
            tr_list.append(group)

        if i % 1000 == 0:
            print("%d users sampled" % i)
            sys.stdout.flush()

    data_tr = pd.concat(tr_list)
    data_te = pd.concat(te_list)
    
    return data_tr, data_te

In [25]:
#validation triplets
vad_plays = raw_data.loc[raw_data['userId'].isin(vd_users)]
vad_plays = vad_plays.loc[vad_plays['movieId'].isin(unique_sid)]

In [26]:
vad_plays_tr, vad_plays_te = split_train_test_proportion(vad_plays)

0 users sampled
1000 users sampled
2000 users sampled
3000 users sampled
4000 users sampled
5000 users sampled
6000 users sampled
7000 users sampled
8000 users sampled
9000 users sampled


In [27]:
test_plays = raw_data.loc[raw_data['userId'].isin(te_users)]
test_plays = test_plays.loc[test_plays['movieId'].isin(unique_sid)]

In [28]:
test_plays_tr, test_plays_te = split_train_test_proportion(test_plays)

0 users sampled
1000 users sampled
2000 users sampled
3000 users sampled
4000 users sampled
5000 users sampled
6000 users sampled
7000 users sampled
8000 users sampled
9000 users sampled


## Save the data into (user_index, item_index) format

In [29]:
def numerize(tp):
    uid = list(map(lambda x: profile2id[x], tp['userId']))
    sid = list(map(lambda x: show2id[x], tp['movieId']))
    return pd.DataFrame(data={'uid': uid, 'sid': sid}, columns=['uid', 'sid'])

In [30]:
train_data = numerize(train_plays)
train_data.to_csv(os.path.join(pro_dir, 'train.csv'), index=False)

In [31]:
vad_data_tr = numerize(vad_plays_tr)
vad_data_tr.to_csv(os.path.join(pro_dir, 'validation_tr.csv'), index=False)

In [32]:
vad_data_te = numerize(vad_plays_te)
vad_data_te.to_csv(os.path.join(pro_dir, 'validation_te.csv'), index=False)

In [33]:
test_data_tr = numerize(test_plays_tr)
test_data_tr.to_csv(os.path.join(pro_dir, 'test_tr.csv'), index=False)

In [34]:
test_data_te = numerize(test_plays_te)
test_data_te.to_csv(os.path.join(pro_dir, 'test_te.csv'), index=False)

# Load Data

In [35]:
unique_sid = list()
with open(os.path.join(pro_dir, 'unique_sid.txt'), 'r') as f:
    for line in f:
        unique_sid.append(line.strip())

n_items = len(unique_sid)

In [36]:
def load_csv_data(csv_file):
    tp = pd.read_csv(csv_file)
    n_users = tp['uid'].max() + 1

    rows, cols = tp['uid'], tp['sid']
    data = sparse.csr_matrix((np.ones_like(rows),
                             (rows, cols)), dtype='float64',
                             shape=(n_users, n_items))
    return data

In [37]:
#user - items
train_data = load_train_data(os.path.join(pro_dir, 'train.csv'))

In [38]:
def load_tr_te_data(csv_file_tr, csv_file_te):
    tp_tr = pd.read_csv(csv_file_tr)
    tp_te = pd.read_csv(csv_file_te)

    start_idx = min(tp_tr['uid'].min(), tp_te['uid'].min())
    end_idx = max(tp_tr['uid'].max(), tp_te['uid'].max())
    
    print(start_idx, end_idx)
    
    rows_tr, cols_tr = tp_tr['uid'] - start_idx, tp_tr['sid']
    rows_te, cols_te = tp_te['uid'] - start_idx, tp_te['sid']
    
    data_tr = sparse.csr_matrix((np.ones_like(rows_tr),
                             (rows_tr, cols_tr)), dtype='float64', shape=(end_idx - start_idx + 1, n_items))
    data_te = sparse.csr_matrix((np.ones_like(rows_te),
                             (rows_te, cols_te)), dtype='float64', shape=(end_idx - start_idx + 1, n_items))
    
    users_mapping = {(v - start_idx): v  for v in range(start_idx, end_idx + 1)}
                     
    return data_tr, data_te, users_mapping

In [39]:
vad_data_tr, vad_data_te, vad_users_mapping = load_tr_te_data(os.path.join(pro_dir, 'validation_tr.csv'),
                                           os.path.join(pro_dir, 'validation_te.csv'))

116674 126673


# Modeling

In [40]:
K = 10
NUM_THREADS = 0
TOP_N = 30

In [None]:
train_data

In [41]:
model = implicit.als.AlternatingLeastSquares(factors=50)



In [50]:
model.fit(train_data.T)

HBox(children=(IntProgress(value=0, max=15), HTML(value='')))




In [113]:
pos = 10
test = sparse.vstack([train_data, vad_data_tr[pos,:]]) 

In [114]:
user_id = 116674
start_idx = 116674
end_idx = 126673
mean = 0

for  
    cont = 0
    recommendations = model.recommend(user_id, 
                                test,
                                N=TOP_N, 
                                filter_already_liked_items=False,
                                recalculate_user=True)
    for r in recommendations:
        cont = cont + vad_data_te[pos,r[0]]
    

In [115]:
vad_data_te[0,]

<1x13681 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [116]:
cont = 0
for r in recommendations:
    cont = cont + vad_data_te[pos,r[0]] 
print(cont)

3.0


In [117]:
recommendations

[(3, 0.5182517805525257),
 (97, 0.40441010234276853),
 (262, 0.31173574727114584),
 (367, 0.30080538447986216),
 (391, 0.24908743362438052),
 (100, 0.24377269530021356),
 (89, 0.22470212589669541),
 (12, 0.20217921991250645),
 (261, 0.20051677230520004),
 (366, 0.2003145969581941),
 (125, 0.1968702544681089),
 (1384, 0.19202950170576377),
 (387, 0.18697548630300487),
 (1385, 0.1823624824414341),
 (263, 0.1728205282495481),
 (384, 0.1706882152961071),
 (236, 0.16026899419574492),
 (390, 0.15937410354100273),
 (895, 0.15725353079709453),
 (632, 0.1561288358919237),
 (270, 0.14485180568833392),
 (1002, 0.14220442142769252),
 (141, 0.1406154104205687),
 (1349, 0.13611346412621764),
 (630, 0.132893455677968),
 (388, 0.13169382757034176),
 (135, 0.1207610753054647),
 (525, 0.12017194864224941),
 (376, 0.1196884986329884),
 (892, 0.1118182512960151)]

In [101]:
ranked = model.rank_items(116674, 
                    test,
                    selected_items=unique_sid,
                    recalculate_user=True)

TypeError: '>=' not supported between instances of 'str' and 'int'

In [57]:
vad_data_tr[1,:]

<1x13681 sparse matrix of type '<class 'numpy.float64'>'
	with 43 stored elements in Compressed Sparse Row format>

In [105]:
train_data.T

<13681x116674 sparse matrix of type '<class 'numpy.float64'>'
	with 8512951 stored elements in Compressed Sparse Column format>

In [106]:
cos.fit(train_data.T)

HBox(children=(IntProgress(value=0, max=13681), HTML(value='')))




In [108]:
test = sparse.vstack([train_data, vad_data_tr[2,:]]) 
test.shape

(116675, 13681)

In [131]:
cos.fit(test.T)

HBox(children=(IntProgress(value=0, max=116675), HTML(value='')))




In [130]:
 cos.recommend(0, 
                vad_data_tr[1,:],
                N=TOP_N, 
                filter_already_liked_items=True,
                recalculate_user=False)

[(5, 6.013750837856843),
 (387, 3.6931728167290436),
 (6, 3.3008976572795548),
 (7, 3.12785700325313),
 (13, 2.7170667931871346),
 (347, 2.4758767533003256),
 (154, 2.4143602125064363),
 (278, 2.0510347275739),
 (12, 1.9560754474338888),
 (131, 1.8244069315699227),
 (268, 1.7844022668235096),
 (151, 1.7376265068360448),
 (539, 1.7007045516851056),
 (94, 1.6976795563412812),
 (908, 1.5861102228799977),
 (272, 1.5703297045272064),
 (157, 1.4426008693337415),
 (15, 1.4218464639916404),
 (1786, 1.39906069576888),
 (95, 1.363323248734571),
 (503, 1.2622189790994196),
 (156, 1.195059348447018),
 (137, 1.022684646786681),
 (391, 0.9688042100068874),
 (17, 0.9151679081280699),
 (2785, 0.8923030399139336),
 (46, 0.8833626777620167),
 (269, 0.8811720959463196),
 (442, 0.8736795210608341),
 (98, 0.8681776051124851)]

In [119]:
cos.recommend(116674, test.T)

[(5, 6.013750837856843),
 (387, 3.6931728167290436),
 (6, 3.3008976572795548),
 (7, 3.12785700325313),
 (13, 2.7170667931871346),
 (347, 2.4758767533003256),
 (154, 2.4143602125064363),
 (278, 2.0510347275739),
 (12, 1.9560754474338888),
 (131, 1.8244069315699227)]