This notebook is used to create the sparse training, validation and test datasets.

In [10]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse
import os
import sys
import numpy as np
from scipy import sparse
import pandas as pd
import pickle

def save_weights_pkl(fname, weights):
    """Pickle ``weights`` into the file ``fname``.

    The file is opened in binary write mode (an existing file is
    overwritten) and the object is serialized with
    ``pickle.HIGHEST_PROTOCOL``.

    Args:
        fname: Path of the file to write.
        weights: Any picklable object.
    """
    with open(fname, mode='wb') as sink:
        pickle.Pickler(sink, pickle.HIGHEST_PROTOCOL).dump(weights)

def load_weights_pkl(fname):
    """Read back an object that was pickled into ``fname``.

    Args:
        fname: Path of the pickle file to read.

    Returns:
        The unpickled object.

    Note:
        Unpickling can execute arbitrary code; only load trusted files.
    """
    with open(fname, mode='rb') as source:
        return pickle.load(source)


# --- Input ratings file and output folder -----------------------------------
rating_file = 'ml-20m/ratings.csv'
out_data_dir = 'ml_dataset'

# --- Movie-embedding files ---------------------------------------------------
# `movie_emb_path` is a folder prefix (note the trailing slash) that is
# concatenated directly with a file name.
movie_emb_path = 'ml_dataset/'
movie_emb_f = 'movie_emb_multilingual.pkl'
new_movie_emb_path = ''.join((movie_emb_path, 'movie_emb_multilingual.dic'))


In [2]:
# Make sure the output folder exists, then load the raw ratings table.
os.makedirs(out_data_dir, exist_ok=True)
raw_data_orig = pd.read_csv(rating_file, sep=',', header=0)

# Give every distinct movie / user a new id: the row index of its first
# occurrence in the ratings table (unique, but not contiguous).
# NOTE(review): this intermediate re-indexing seems unnecessary -- confirm before removing.
new_movie_id = (
    raw_data_orig['movieId']
    .drop_duplicates()
    .reset_index()
    .rename(columns={"index": "new_movieId"})
)
new_user_id = (
    raw_data_orig['userId']
    .drop_duplicates()
    .reset_index()
    .rename(columns={"index": "new_userId"})
)

In [3]:
# Inspect the lookup between the original movieId and the re-indexed new_movieId.
new_movie_id

Unnamed: 0,new_movieId,movieId
0,0,2
1,1,29
2,2,32
3,3,47
4,4,50
...,...,...
26739,19975144,121017
26740,19975145,121019
26741,19975146,121021
26742,19989073,110167


In [3]:
# Attach the re-indexed user and movie ids to every rating row.
raw_data_full = (
    raw_data_orig
    .merge(new_user_id, on='userId', how='left')
    .merge(new_movie_id, on='movieId', how='left')
)

# Lookup tables between the original ids and the re-indexed ones.
movie_id_map = raw_data_full[['movieId', 'new_movieId']].drop_duplicates()  # old <-> new movieId
user_id_map = raw_data_full[['userId', 'new_userId']].drop_duplicates()  # old <-> new userId

# Keep only the rating and the new ids, exposed under the original column names.
raw_data_full = (
    raw_data_full
    .drop(labels=['userId', 'movieId', 'timestamp'], axis=1)
    .rename(columns={"new_userId": "userId", "new_movieId": "movieId"})
)
raw_data_full.head()

Unnamed: 0,rating,userId,movieId
0,3.5,0,0
1,3.5,0,1
2,3.5,0,2
3,3.5,0,3
4,3.5,0,4


In [4]:
def get_count(tp, id):
    """Count how many rows of ``tp`` carry each distinct value of column ``id``.

    Args:
        tp: DataFrame that contains the column named by ``id``.
        id: Name of the column to group on.

    Returns:
        A Series indexed by the distinct values of ``tp[id]`` whose values
        are the number of rows in each group.
    """
    return tp[[id]].groupby(id, as_index=True).size()

In [5]:
def filter_triplets(tp, min_uc=5, min_sc=0):
    """Drop rarely-rated movies and low-activity users from ``tp``.

    Args:
        tp: DataFrame with at least the columns ``userId`` and ``movieId``.
        min_uc: Keep only users with at least this many rows (skipped when <= 0).
        min_sc: Keep only movies with at least this many rows (skipped when <= 0).

    Returns:
        A tuple ``(tp, usercount, itemcount)``: the filtered frame plus the
        per-user and per-movie row counts recomputed on the filtered frame.
    """
    # Movies are filtered first ...
    if min_sc > 0:
        per_item = get_count(tp, 'movieId')
        kept_items = per_item.index[per_item >= min_sc]
        tp = tp[tp['movieId'].isin(kept_items)]

    # ... then users. Because rows are removed here, a few movies may fall
    # back below ``min_sc``; the movie filter is not re-applied.
    if min_uc > 0:
        per_user = get_count(tp, 'userId')
        kept_users = per_user.index[per_user >= min_uc]
        tp = tp[tp['userId'].isin(kept_users)]

    # Recompute both counts on what is left after filtering.
    usercount = get_count(tp, 'userId')
    itemcount = get_count(tp, 'movieId')
    return tp, usercount, itemcount

In [6]:
# Keep users with at least 5 ratings; movies are not filtered (min_sc=0).
raw_data, user_activity, item_popularity = filter_triplets(raw_data_full, min_uc=5, min_sc=0)

# Fraction of the user x movie grid that actually holds a rating.
sparsity = 1. * len(raw_data) / (len(user_activity) * len(item_popularity))
print("After filtering, there are %d watching events from %d users and %d movies (sparsity: %.3f%%)" %
      (len(raw_data), len(user_activity), len(item_popularity), sparsity * 100))

# Shuffle the user ids with a fixed seed so the split into
# train / validation / test users is reproducible.
np.random.seed(98765)
unique_uid = user_activity.index
idx_perm = np.random.permutation(unique_uid.size)
unique_uid = unique_uid[idx_perm]

After filtering, there are 20000263 watching events from 138493 users and 26744 movies (sparsity: 0.540%)


In [7]:
# Split the shuffled users: the last 10000 are test users, the 10000 before
# them are validation users, everything else is used for training.
n_heldout_users = 10000
n_users = unique_uid.size

te_users = unique_uid[(n_users - n_heldout_users):]
vd_users = unique_uid[(n_users - n_heldout_users * 2): (n_users - n_heldout_users)]
tr_users = unique_uid[:(n_users - n_heldout_users * 2)]

# Rating rows that belong to training users.
train_mask = raw_data['userId'].isin(tr_users)
train_plays = raw_data.loc[train_mask]

# Remember each row's index label (added after `train_plays` was taken).
raw_data["orig_index"] = raw_data.index.values


Only movie ids found in the training data are used (not ones that appear only for the held-out, cold-start users).



In [8]:
# Distinct movie ids that occur in the training rows, in order of first appearance.
unique_sid = pd.unique(train_plays['movieId'])
# movieId -> sid, where sid is the position of the movie in `unique_sid`.
unique_sid_df = pd.DataFrame(unique_sid, columns=["movieId"])
unique_sid_df["sid"] = range(unique_sid_df.shape[0])
# userId -> uid, where uid is the position of the user in the shuffled `unique_uid`.
unique_uid_df = pd.DataFrame(unique_uid, columns=["userId"]) # userid, uid
unique_uid_df["uid"] = range(unique_uid_df.shape[0])

# Write the training movie ids to disk, one per line.
with open(os.path.join(out_data_dir, 'unique_sid.txt'), 'w') as f:
    for sid in unique_sid:
        f.write('%s\n' % sid)
    
# The "movieId" column of `unique_sid_df` holds the re-indexed id, so rename it
# to "new_movieId" and join with `movie_id_map` to recover the original movieId.
latest_movie_id = unique_sid_df.rename(columns={"movieId": "new_movieId"})
# Presumably "3_and_1" refers to the third id space (sid) and the first (original movieId).
map_bwteen_3_and_1 = latest_movie_id.merge(movie_id_map, on='new_movieId')[["sid", "movieId"]]
map_bwteen_3_and_1.index = map_bwteen_3_and_1["movieId"]
# `reverse` maps original movieId -> sid.
reverse = dict(map_bwteen_3_and_1['sid'])

In [10]:
# Store the `reverse` lookup so other scripts can reuse it.
with open('ml-20m/mapping_ml.pkl', mode='wb') as f:
    pickle.Pickler(f).dump(reverse)

Use the movie embeddings generated with MPNet to encode users; the embeddings are re-keyed using the new ids created above.

In [11]:
# Load the pre-computed movie embeddings.
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
emb_filpath = ''.join([movie_emb_path, movie_emb_f])
with open(emb_filpath, mode='rb') as f:
    movie_embedding_dict = pickle.load(f)

# Re-key the embeddings through `reverse`; keys with no entry in `reverse`
# are dropped.
new_movie_embedding_dict = {
    reverse[k]: movie_embedding_dict[k]
    for k in movie_embedding_dict
    if k in reverse
}

In [12]:
# Largest key among the re-keyed embeddings (iterating a dict yields its keys).
max(new_movie_embedding_dict)

26163

In [13]:
# Persist the re-keyed movie embeddings to `new_movie_emb_path`.
# (The previous code opened `new_movie_emb`, a name that is never defined in
# this notebook, so the cell raised NameError on a fresh kernel.)
with open(new_movie_emb_path, 'wb') as f:
    pickle.dump(new_movie_embedding_dict, f)

KeyboardInterrupt: 

In [18]:
def split_train_test_proportion(data, test_prop=0.2, min_items=5, seed=98765):
    """Split every user's rows into a fold-in part and a held-out part.

    For each user with at least ``min_items`` rows, ``int(test_prop * n)`` of
    the user's ``n`` rows are drawn at random (without replacement) into the
    held-out frame; the remaining rows go to the fold-in frame. Users with
    fewer rows are kept entirely in the fold-in frame.

    Args:
        data: DataFrame with a ``userId`` column; all columns are preserved.
        test_prop: Fraction of each eligible user's rows to hold out.
        min_items: Minimum number of rows a user needs to be split.
        seed: Seed passed to ``np.random.seed`` before sampling. Note that
            this resets NumPy's global random state on every call.

    Returns:
        A tuple ``(data_tr, data_te)`` of DataFrames. When no rows land on one
        side (e.g. no user reaches ``min_items``, or ``data`` is empty), that
        side is an empty frame with the columns of ``data`` instead of
        ``pd.concat`` raising ``ValueError`` on an empty list.
    """
    tr_list, te_list = [], []

    np.random.seed(seed)

    for i, (_, group) in enumerate(data.groupby('userId')):
        n_items_u = len(group)  # number of records for this user

        if n_items_u >= min_items:
            # Boolean mask over the user's rows: True marks a held-out row.
            held_out = np.zeros(n_items_u, dtype='bool')
            chosen = np.random.choice(n_items_u, size=int(test_prop * n_items_u), replace=False).astype('int64')
            held_out[chosen] = True

            tr_list.append(group[~held_out])
            te_list.append(group[held_out])
        else:
            tr_list.append(group)

        if i % 1000 == 0:
            print("%d users sampled" % i)
            sys.stdout.flush()

    # pd.concat refuses an empty list, so fall back to an empty slice of the
    # input, which keeps the column layout.
    empty = data.iloc[0:0]
    data_tr = pd.concat(tr_list) if tr_list else empty
    data_te = pd.concat(te_list) if te_list else empty

    return data_tr, data_te



In [19]:
# --- Training users: attach the contiguous uid / sid columns ---
# NOTE(review): `train_plays` is overwritten with its own merge result, so this
# cell is not idempotent -- re-running it merges `uid` / `sid` a second time.
train_plays = train_plays.merge(unique_uid_df, on='userId', how='left')
train_plays = train_plays.merge(unique_sid_df, on='movieId', how='left')
train_plays["uid_fm0"] = train_plays["uid"] # add the same column as valid and test dataset

# One row per training user, keeping only columns whose name starts with u, g, a or c.
train_plays_profile = train_plays.drop_duplicates(subset="uid_fm0").filter(regex="^[ugac].*") # unique data for each user
assert train_plays_profile['uid'].shape[0] == n_users - n_heldout_users * 2

# --- Validation users: keep only movies seen in training, then split per user ---
vad_plays = raw_data.loc[raw_data['userId'].isin(vd_users)]
vad_plays = vad_plays.loc[vad_plays['movieId'].isin(unique_sid)]

vad_plays = vad_plays.merge(unique_uid_df, on='userId', how='left')
vad_plays = vad_plays.merge(unique_sid_df, on='movieId', how='left')

vad_plays_tr, vad_plays_te = split_train_test_proportion(vad_plays)
vad_plays_tr.reset_index(drop=True, inplace=True)
vad_plays_te.reset_index(drop=True, inplace=True)

# --- Test users: same treatment as the validation users ---
test_plays = raw_data.loc[raw_data['userId'].isin(te_users)]
test_plays = test_plays.loc[test_plays['movieId'].isin(unique_sid)]

test_plays = test_plays.merge(unique_uid_df, on='userId', how='left')
test_plays = test_plays.merge(unique_sid_df, on='movieId', how='left')

test_plays_tr, test_plays_te = split_train_test_proportion(test_plays)
test_plays_tr.reset_index(drop=True, inplace=True)
test_plays_te.reset_index(drop=True, inplace=True)

# --- Write the (uid, sid) pairs for training and validation ---
# The test CSV files are not written in this cell.
train_data = train_plays.filter(items=["uid", "sid"], axis=1)
train_data.to_csv(os.path.join(out_data_dir, 'train.csv'), index=False)

vad_data_tr = vad_plays_tr.filter(items=["uid", "sid"], axis=1)
vad_data_tr.to_csv(os.path.join(out_data_dir, 'validation_tr.csv'), index=False)

vad_data_te = vad_plays_te.filter(items=["uid", "sid"], axis=1)
vad_data_te.to_csv(os.path.join(out_data_dir, 'validation_te.csv'), index=False)

0 users sampled
1000 users sampled
2000 users sampled
3000 users sampled
4000 users sampled
5000 users sampled
6000 users sampled
7000 users sampled
8000 users sampled
9000 users sampled
0 users sampled
1000 users sampled
2000 users sampled
3000 users sampled
4000 users sampled
5000 users sampled
6000 users sampled
7000 users sampled
8000 users sampled
9000 users sampled


In [20]:
# Validation users: add `uid_fm0`, the uid shifted so the smallest uid becomes 0.
# `end_idx` is computed here but not used in this cell.
start_idx = min(vad_plays_tr['uid'].min(), vad_plays_te['uid'].min())
end_idx = max(vad_plays_tr['uid'].max(), vad_plays_te['uid'].max())
vad_plays_tr['uid_fm0'] = vad_plays_tr['uid'] - start_idx
vad_plays_te['uid_fm0'] = vad_plays_te['uid'] - start_idx

# Write the (uid, sid) pairs of the test users.
test_data_tr = test_plays_tr.filter(items=["uid", "sid"], axis=1)
test_data_tr.to_csv(os.path.join(out_data_dir, 'test_tr.csv'), index=False)

test_data_te = test_plays_te.filter(items=["uid", "sid"], axis=1)
test_data_te.to_csv(os.path.join(out_data_dir, 'test_te.csv'), index=False)

# Test users: same zero-based `uid_fm0` column (start_idx / end_idx are overwritten).
start_idx = min(test_plays_tr['uid'].min(), test_plays_te['uid'].min())
end_idx = max(test_plays_tr['uid'].max(), test_plays_te['uid'].max())
test_plays_tr['uid_fm0'] = test_plays_tr['uid'] - start_idx
test_plays_te['uid_fm0'] = test_plays_te['uid'] - start_idx

# Re-read the movie ids from unique_sid.txt to get the number of items.
# NOTE(review): this rebinds `unique_sid` to a list of strings, so cells that
# call `.isin(unique_sid)` on the movieId column should not be re-run after
# this one without rebuilding `unique_sid` first.
unique_sid = list()
with open(os.path.join(out_data_dir, 'unique_sid.txt'), 'r') as f:
    for line in f:
        unique_sid.append(line.strip())

n_items = len(unique_sid)

Create the sparse (CSR) datasets.

In [21]:
def load_train_data(csv_file):
    """Build the user x item interaction matrix from a ``uid,sid`` CSV file.

    Args:
        csv_file: Path to a CSV file with the columns ``uid`` and ``sid``.

    Returns:
        A ``float64`` ``scipy.sparse.csr_matrix`` with ``max(uid) + 1`` rows
        and ``n_items`` columns (``n_items`` is read from module scope). Each
        CSV row contributes a 1 at position ``(uid, sid)``.
    """
    interactions = pd.read_csv(csv_file)
    user_idx = interactions['uid']
    item_idx = interactions['sid']
    row_count = user_idx.max() + 1

    ones = np.ones_like(user_idx)
    return sparse.csr_matrix((ones, (user_idx, item_idx)), dtype='float64', shape=(row_count, n_items))

# Sparse interaction matrix of the training users, read back from train.csv.
train_data_csr = load_train_data(os.path.join(out_data_dir, 'train.csv'))

def load_tr_te_data(csv_file_tr, csv_file_te):
    """Build the fold-in and held-out matrices for one group of users.

    Both CSV files must have the columns ``uid`` and ``sid``. The uids are
    shifted so that the smallest uid found in either file becomes row 0.

    Args:
        csv_file_tr: Path to the CSV file with the fold-in interactions.
        csv_file_te: Path to the CSV file with the held-out interactions.

    Returns:
        A tuple ``(data_tr, data_te)`` of ``float64`` CSR matrices, each of
        shape ``(max_uid - min_uid + 1, n_items)`` (``n_items`` is read from
        module scope).

    Raises:
        AssertionError: If either file does not contain every uid of the
            contiguous range ``min_uid .. max_uid``.
    """
    fold_in = pd.read_csv(csv_file_tr)
    held_out = pd.read_csv(csv_file_te)

    first_uid = min(fold_in['uid'].min(), held_out['uid'].min())
    last_uid = max(fold_in['uid'].max(), held_out['uid'].max())
    row_count = last_uid - first_uid + 1

    # Every user in the range must appear in both files.
    assert pd.unique(fold_in["uid"]).shape[0] == row_count
    assert pd.unique(held_out["uid"]).shape[0] == row_count

    def _as_csr(frame):
        # Rows are zero-based user positions, columns are item ids.
        rows = frame['uid'] - first_uid
        cols = frame['sid']
        return sparse.csr_matrix((np.ones_like(rows), (rows, cols)), dtype='float64', shape=(row_count, n_items))

    return _as_csr(fold_in), _as_csr(held_out)

# Fold-in / held-out matrices for the validation users.
vad_data_tr_csr, vad_data_te_csr = load_tr_te_data(
    os.path.join(out_data_dir, 'validation_tr.csv'),
    os.path.join(out_data_dir, 'validation_te.csv'),
)

# Fold-in / held-out matrices for the test users.
test_data_tr_csr, test_data_te_csr = load_tr_te_data(
    os.path.join(out_data_dir, 'test_tr.csv'),
    os.path.join(out_data_dir, 'test_te.csv'),
)

# Bundle all five matrices into a single pickle file.
datas = [
    train_data_csr,
    vad_data_tr_csr,
    vad_data_te_csr,
    test_data_tr_csr,
    test_data_te_csr,
]
fname = os.path.join(out_data_dir, 'data_csr.pkl')
save_weights_pkl(fname, datas)
