In [11]:
import os
import shutil
import sys

import json

import numpy as np
from scipy import sparse
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sn
sn.set()

import pandas as pd
import implicit

import import_ipynb
import evaluation
from tqdm.notebook import tqdm

#   Data preprocessing

In [None]:
# Dataset selection: `dataset` is the name of the sub-directory holding the raw files.
dataset  = 'ml-20m'
# Root folder that contains the dataset directories. Set the RECSYS_DATA_ROOT
# environment variable to run on another machine; when it is unset the original
# local path is used, so existing runs are unaffected.
DATA_ROOT = os.environ.get('RECSYS_DATA_ROOT', '/Users/tomas/Documents/FEUP/Tese/data/')
DATA_DIR = os.path.join(DATA_ROOT, dataset)

In [None]:
# Load ratings.csv from the selected dataset directory; header=0 makes the
# first row of the file supply the column names.
raw_data = pd.read_csv(
    filepath_or_buffer=os.path.join(DATA_DIR, 'ratings.csv'),
    header=0,
)

In [None]:
# Turn explicit ratings into implicit feedback: keep only the rows whose rating
# is strictly above 3.5 (described in the original note as "ratings >= 4").
raw_data = raw_data.loc[raw_data['rating'].gt(3.5)]

In [None]:
# Preview the first rows of raw_data as a quick sanity check.
raw_data.head()

#   Data preprocessing

To ensure a good amount of feedback, only items with at least 10 clicks and users with at least 10 ratings are kept.
The data is then split per user: 80% of each user's interactions are used for training, 10% for validation and 10% for testing. This gives a solid foundation to build on in future work.

In [8]:
def get_count(tp, id):
    """Count how many rows of ``tp`` belong to each distinct value of column ``id``.

    Parameters
    ----------
    tp : pandas.DataFrame
        Table that contains the column named by ``id``.
    id : str
        Name of the column to group by (the notebook uses ``'userId'`` and
        ``'movieId'``).

    Returns
    -------
    pandas.Series
        Number of rows per distinct value, indexed by that value.
    """
    # Group with the key as the index. `groupby(..., as_index=False).size()`
    # returns a DataFrame with a positional index on newer pandas releases
    # (changed in pandas 1.1), which breaks callers that read the ids from
    # `count.index` and compare `count` against a threshold.
    return tp.groupby(id).size()

In [9]:
# Triplet: user_id, item_id, rating

def filter_triplets(tp, min_uc=10, min_sc=10):
    """Drop rarely-seen items and low-activity users from an interaction table.

    Parameters
    ----------
    tp : pandas.DataFrame
        Interactions with at least the columns ``'userId'`` and ``'movieId'``.
    min_uc : int
        Minimum number of rows a user must have to be kept; the user filter is
        skipped when this is not positive.
    min_sc : int
        Minimum number of rows an item must have to be kept; the item filter is
        skipped when this is not positive.

    Returns
    -------
    tuple
        ``(tp, usercount, itemcount)``: the filtered table and the results of
        ``get_count`` for ``'userId'`` and ``'movieId'`` computed on it.
    """
    # Item pass: keep rows whose movie reaches the min_sc threshold.
    if min_sc > 0:
        rows_per_item = get_count(tp, 'movieId')
        frequent_items = rows_per_item.index[rows_per_item >= min_sc]
        item_mask = tp['movieId'].isin(frequent_items)
        tp = tp[item_mask]

    # User pass, applied to the already item-filtered rows. Removing users can
    # push a few items back below min_sc; the filter is not re-run, so that
    # small remainder is accepted.
    if min_uc > 0:
        rows_per_user = get_count(tp, 'userId')
        active_users = rows_per_user.index[rows_per_user >= min_uc]
        user_mask = tp['userId'].isin(active_users)
        tp = tp[user_mask]

    # Recompute both count tables so they describe the filtered data.
    usercount = get_count(tp, 'userId')
    itemcount = get_count(tp, 'movieId')
    return tp, usercount, itemcount

In [10]:
# Apply the default thresholds (min_uc=10, min_sc=10). raw_data is overwritten
# with the filtered table; the two count tables are kept for the statistics
# and id lists built in the following cells.
raw_data, user_activity, item_popularity = filter_triplets(raw_data)

In [None]:
# Share of the (users x items) grid that holds an interaction, reported below
# as a percentage under the label "sparsity".
sparsity = float(len(raw_data)) / (len(user_activity) * len(item_popularity))

print("After filtering, there are %d watching events from %d users and %d movies (sparsity: %.3f%%)" %
      (len(raw_data), len(user_activity), len(item_popularity), sparsity * 100))

In [None]:
# The index of each count table is taken as the list of raw ids that remain
# (presumably user ids and movie ids respectively; depends on get_count).
unique_uid, unique_sid = user_activity.index, item_popularity.index

In [None]:
# Raw id -> contiguous 0-based position, following the order of the unique id lists.
map_item = {sid: i for i, sid in enumerate(unique_sid)}
map_user = {uid: i for i, uid in enumerate(unique_uid)}

In [None]:
def split_train_val_test_proportion(data, val_prop=0.1 ,test_prop=0.1):
    """Split every user's interactions into train / validation / test parts.

    Rows are ordered by ``userId`` and then ``timestamp``; for each user the
    leading rows go to training and the trailing rows are held out, first for
    validation and then for testing (no shuffling is performed).

    Parameters
    ----------
    data : pandas.DataFrame
        Interactions with at least the columns ``'userId'`` and ``'timestamp'``.
    val_prop : float
        Fraction of each user's rows reserved for validation.
    test_prop : float
        Fraction of each user's rows reserved for testing.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(data_tr, data_val, data_te)``, each the concatenation of the
        per-user parts.

    Raises
    ------
    ValueError
        If the proportions are not both positive with a sum below 1.
    """
    # Previously the two proportions were accepted but ignored (0.2 / 0.5 were
    # hard-coded). They now drive the split; the defaults give the same 0.2 / 0.5.
    if not (val_prop > 0 and test_prop > 0 and val_prop + test_prop < 1):
        raise ValueError(
            "val_prop and test_prop must both be > 0 and sum to less than 1 "
            "(got val_prop=%r, test_prop=%r)" % (val_prop, test_prop))

    holdout_prop = val_prop + test_prop            # share split off from training
    test_share_of_holdout = test_prop / holdout_prop  # share of the hold-out given to test

    # Sort by id and timestamp --> divide
    data = data.sort_values(['userId', 'timestamp'], ascending=[True, True])

    data_grouped_by_user = data.groupby('userId')
    tr_list, val_list, te_list = [], [], []

    # Kept for backward compatibility with later cells that may rely on the
    # global NumPy seed; the split itself uses shuffle=False and draws no
    # random numbers.
    np.random.seed(98765)

    for i, (_, group) in enumerate(data_grouped_by_user):
        # First cut: training rows vs. held-out tail.
        train, pre_test = train_test_split(group, test_size=holdout_prop, shuffle=False)
        # Second cut: divide the held-out tail into validation and test.
        val, test = train_test_split(pre_test, test_size=test_share_of_holdout, shuffle=False)

        tr_list.append(train)
        val_list.append(val)
        te_list.append(test)

        # Progress report every 10000 users.
        if i % 10000 == 0:
            print("%d users sampled" % i)
            sys.stdout.flush()

    data_tr = pd.concat(tr_list)
    data_val = pd.concat(val_list)
    data_te = pd.concat(te_list)

    return data_tr, data_val, data_te

In [None]:
# Split the filtered interactions with the function's default proportions
# (val_prop=0.1, test_prop=0.1).
train, val, test = split_train_val_test_proportion(raw_data)

In [None]:
# Folder that receives the processed split files and id mappings.
PARSE_DATA_DIR = os.path.join(DATA_DIR, 'processed')

# exist_ok=True creates the folder (and missing parents) when needed and is a
# no-op when it already exists, avoiding the check-then-create race of
# testing os.path.exists first.
os.makedirs(PARSE_DATA_DIR, exist_ok=True)

In [None]:
def numerize(tp):
    """Translate raw ids into the integer codes stored in the global id maps.

    Parameters
    ----------
    tp : pandas.DataFrame
        Table with the columns ``'userId'`` and ``'movieId'``.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``'uid'`` and ``'sid'``, holding ``map_user[userId]`` and
        ``map_item[movieId]`` for every input row, in input order and with a
        fresh default index.

    Raises
    ------
    KeyError
        If an id in ``tp`` is missing from ``map_user`` / ``map_item``.
    """
    user_codes = [map_user[raw_uid] for raw_uid in tp['userId']]
    item_codes = [map_item[raw_sid] for raw_sid in tp['movieId']]
    return pd.DataFrame({'uid': user_codes, 'sid': item_codes}, columns=['uid', 'sid'])

In [None]:
# Convert the training rows to integer codes and store them as train.csv
# (index=False: the row index is not written).
train_data = numerize(train)
train_data.to_csv(
    path_or_buf=os.path.join(PARSE_DATA_DIR, 'train.csv'),
    index=False,
)

In [None]:
# Convert the validation rows to integer codes and store them as validation.csv
# (index=False: the row index is not written).
val_data = numerize(val)
val_data.to_csv(
    path_or_buf=os.path.join(PARSE_DATA_DIR, 'validation.csv'),
    index=False,
)

In [None]:
# Convert the test rows to integer codes and store them as test.csv
# (index=False: the row index is not written).
test_data = numerize(test)
test_data.to_csv(
    path_or_buf=os.path.join(PARSE_DATA_DIR, 'test.csv'),
    index=False,
)

In [None]:
# Persist the raw-id -> code mapping for users so the codes can be translated
# back later. JSON object keys are always strings, so the keys need converting
# again when the file is reloaded.
with open(os.path.join(PARSE_DATA_DIR, 'map_user.json'), mode='w') as user_map_file:
    json.dump(obj=map_user, fp=user_map_file)

In [None]:
# Persist the raw-id -> code mapping for items. JSON object keys are always
# strings, so the keys need converting again when the file is reloaded.
with open(os.path.join(PARSE_DATA_DIR, 'map_item.json'), mode='w') as item_map_file:
    json.dump(obj=map_item, fp=item_map_file)

In [None]:
# Write the entries of unique_sid, one per line, in list order (the same order
# used to build map_item).
with open(os.path.join(PARSE_DATA_DIR, 'unique_sid.txt'), mode='w') as sid_file:
    sid_file.writelines('%s\n' % item_id for item_id in unique_sid)

In [None]:
# Write the entries of unique_uid, one per line, in list order (the same order
# used to build map_user).
with open(os.path.join(PARSE_DATA_DIR, 'unique_uid.txt'), mode='w') as uid_file:
    uid_file.writelines('%s\n' % user_id for user_id in unique_uid)