In [1]:
import pandas as pd
import numpy as np
import random
# Fixed seed so the random.sample-based negative sampling further down is repeatable.
# Only Python's built-in `random` module is seeded here; NumPy's RNG is not.
seed = 123
random.seed(seed)

## Load data

In [2]:
# Read the raw ratings file: '::'-delimited, no header row.
# A multi-character separator is only supported by the python parser engine.
data_path = '../data/ml-1m/ratings.dat'
ratings = pd.read_csv(
    data_path,
    sep='::',
    header=None,
    names=['userId', 'movieId', 'rating', 'timestamp'],
    engine='python',
)

In [3]:
# Activity thresholds used by the filters below.
MIN_RATINGS_PER_USER = 20
MIN_RATINGS_PER_MOVIE = 10

# Ratings per user. groupby(sort=False) keeps users in order of first
# appearance, matching what a row-by-row counting loop would produce.
user_id_count = ratings.groupby('userId', sort=False).size().to_dict()
user_reject_list = [userID for userID, count in user_id_count.items() if count < MIN_RATINGS_PER_USER]
# Drop every rating that belongs to a user below the activity threshold.
ratings = ratings[~ratings['userId'].isin(user_reject_list)]

# Ratings per movie, counted on the already user-filtered frame.
movie_id_count = ratings.groupby('movieId', sort=False).size().to_dict()

# NOTE(review): rarely rated movies are only collected here; they are not
# removed from `ratings` in this cell.
count_10_movies = [movieID for movieID, count in movie_id_count.items() if count < MIN_RATINGS_PER_MOVIE]

## Re-index user/movie IDs and build lookup tables

In [4]:
# Dense 0-based indices, assigned in order of first appearance in `ratings`.
# NOTE: this cell overwrites the id columns of `ratings`; re-running it would
# rebuild the maps from the already re-indexed values, so run it once per kernel.
unique_user_ids = ratings['userId'].unique()
unique_movie_ids = ratings['movieId'].unique()
user2idx = {userID: i for i, userID in enumerate(unique_user_ids)}
movie2idx = {movieID: i for i, movieID in enumerate(unique_movie_ids)}
idx2movie = {i: movieID for i, movieID in enumerate(unique_movie_ids)}
# assign() builds a new frame instead of writing into the filtered slice,
# which avoids pandas' SettingWithCopyWarning.
ratings = ratings.assign(
    userId=ratings['userId'].map(user2idx),
    movieId=ratings['movieId'].map(movie2idx),
)
# Translate the rarely-rated movie ids into the new index space as well.
count_10_movies = [movie2idx[movieID] for movieID in count_10_movies]

In [5]:
# Report how many users, movies and ratings remain after filtering / re-indexing.
size_report = (
    ('Number of users:', len(user2idx)),
    ('Number of movies:', len(movie2idx)),
    ('Number of ratings:', len(ratings)),
)
for label, size in size_report:
    print(label, size)

Number of users: 6040
Number of movies: 3706
Number of ratings: 1000209


## Per-user interaction lists with timestamps

In [6]:
# Group interactions by user: userId -> list of (movieId, timestamp) pairs in
# row order, and collect the set of every movie id that occurs.
implicit_rating_matrix_dict = {}
movieid_set = set()
interaction_rows = zip(ratings['userId'], ratings['movieId'], ratings['timestamp'])
for uid, mid, ts in interaction_rows:
    implicit_rating_matrix_dict.setdefault(uid, []).append((mid, ts))
    movieid_set.add(mid)


In [7]:
# Cross-check: these counts should agree with the sizes of the id maps.
interaction_report = (
    ('Number of users:', len(implicit_rating_matrix_dict)),
    ('Number of movies:', len(movieid_set)),
)
for label, size in interaction_report:
    print(label, size)

Number of users: 6040
Number of movies: 3706


## Test / Val / Train split

In [None]:
# Leave-last-two-out split on each user's history ordered by timestamp:
# last item -> test, second-to-last -> validation, everything earlier -> train.
# item_gt_list keeps the full ordered history.
user_list, item_train_list, item_test_list, item_valid_list, item_gt_list = [], [], [], [], []
for uid, timestamped_items in implicit_rating_matrix_dict.items():
    # sorted() is stable, so interactions sharing a timestamp keep their row order.
    chronological = [mid for mid, _ in sorted(timestamped_items, key=lambda pair: pair[1])]
    item_gt_list.append(chronological)
    user_list.append(uid)
    item_train_list.append(chronological[:-2])
    item_valid_list.append(chronological[-2])
    item_test_list.append(chronological[-1])

In [None]:
def adjacency2df(user_list, item_list_of_list):
    """Flatten per-user item lists into a long (userId, movieId) table.

    Args:
        user_list: user ids, one per position.
        item_list_of_list: item_list_of_list[k] holds the items of user_list[k].

    Returns:
        pandas.DataFrame with columns 'userId' and 'movieId', one row per
        (user, item) pair; users with an empty item list contribute no rows.
    """
    user_column = []
    movie_column = []
    for position, uid in enumerate(user_list):
        items = item_list_of_list[position]
        # Repeat the user id once for each of that user's items.
        user_column += [uid] * len(items)
        movie_column += items
    return pd.DataFrame({'userId': user_column, 'movieId': movie_column})

In [None]:
# Long-format interaction tables: training history and full history.
train_edges = adjacency2df(user_list, item_train_list)
train_edges.to_csv('../data/ml-1m/train.csv', index=False)
gt_edges = adjacency2df(user_list, item_gt_list)
gt_edges.to_csv('../data/ml-1m/gt.csv', index=False)
# One held-out item per user for validation and for test.
val_targets = pd.DataFrame({'userId': user_list, 'movieId': item_valid_list})
val_targets.to_csv('../data/ml-1m/val.csv', index=False)
test_targets = pd.DataFrame({'userId': user_list, 'movieId': item_test_list})
test_targets.to_csv('../data/ml-1m/test.csv', index=False)

In [None]:
# Persist the original-id -> dense-index lookup tables next to the splits.
user2id_df = pd.DataFrame({
    'original_userId': list(user2idx),
    'userId': [user2idx[original_id] for original_id in user2idx],
})
movie2id_df = pd.DataFrame({
    'original_movieId': list(movie2idx),
    'movieId': [movie2idx[original_id] for original_id in movie2idx],
})
user2id_df.to_csv('../data/ml-1m/user2id.csv', index=False)
movie2id_df.to_csv('../data/ml-1m/movie2id.csv', index=False)

## Generate negative samples for validation and test

In [None]:
def get_hundred_negatives(item_train_list, item_valid_list, item_test_list, user_list, movie2idx,
                          num_negatives=100, pool_size=500):
    """Sample disjoint negative candidates for each user's validation and test item.

    For every user a random pool of movie indices is drawn and every movie the
    user interacted with (train, validation or test) is filtered out. The first
    ``num_negatives`` survivors become the validation negatives and the next
    ``num_negatives`` the test negatives; the held-out positive is appended as
    the last element of each list.

    Exactly one entry is produced per user, in ``user_list`` order, so the
    results stay aligned with ``user_list``. If the filtered pool is too small,
    the negatives are drawn directly from the user's unseen movies instead of
    skipping the user.

    Args:
        item_train_list: per-user lists of training movie indices.
        item_valid_list: per-user held-out validation movie index.
        item_test_list: per-user held-out test movie index.
        user_list: user ids; position k corresponds to position k of the item lists.
        movie2idx: mapping whose size defines the movie index range 0..len(movie2idx)-1.
        num_negatives: number of negatives in each validation / test entry.
        pool_size: size of the initial random draw before filtering.

    Returns:
        (val_list, test_list): two lists with one entry per user; each entry is
        ``num_negatives`` negatives followed by the held-out positive item.

    Raises:
        ValueError: if a user has fewer than ``2 * num_negatives`` unseen movies.
    """
    num_movies = len(movie2idx)
    required = 2 * num_negatives
    val_list = []
    test_list = []
    for i, userID in enumerate(user_list):
        # A set gives O(1) membership tests while filtering the pool.
        seen = set(item_train_list[i])
        seen.add(item_valid_list[i])
        seen.add(item_test_list[i])
        pool = random.sample(range(num_movies), min(pool_size, num_movies))
        negatives = [movie for movie in pool if movie not in seen]
        if len(negatives) < required:
            # Heavy raters can exhaust the pool: draw from the unseen movies directly.
            unseen = [movie for movie in range(num_movies) if movie not in seen]
            if len(unseen) < required:
                raise ValueError(
                    f"user {userID!r} has only {len(unseen)} non-interacted movies; "
                    f"{required} are needed"
                )
            negatives = random.sample(unseen, required)
        val_list.append(negatives[:num_negatives] + [item_valid_list[i]])
        test_list.append(negatives[num_negatives:required] + [item_test_list[i]])
    return val_list, test_list

In [None]:
# Draw per-user negative candidates for validation and test. The draws come from
# Python's `random` module, so they depend on the seed set in the first cell.
val_list, test_list = get_hundred_negatives(item_train_list, item_valid_list, item_test_list, user_list, movie2idx)

In [None]:
def get_dict(x, y):
    """Map each key in ``y`` to the value at the same position in ``x``.

    Args:
        x: sequence of values.
        y: sequence of keys, positionally aligned with ``x``.

    Returns:
        dict mapping y[i] -> x[i].

    Raises:
        ValueError: if the two sequences differ in length. Pairing them anyway
            would silently attach values to the wrong keys.
    """
    if len(x) != len(y):
        raise ValueError(
            f"cannot pair {len(x)} values with {len(y)} keys: lengths must match"
        )
    return dict(zip(y, x))

In [None]:
val_dict = get_dict(val_list, user_list)
test_dict = get_dict(test_list, user_list)
# One column per user; each column holds that user's sampled negatives followed
# by the held-out item. to_csv() returns None when given a path, so keep the
# frame in the variable and write it in a separate step.
val_df = pd.DataFrame(val_dict)
val_df.to_csv('../data/ml-1m/val_101.csv', index=False)
test_df = pd.DataFrame(test_dict)
test_df.to_csv('../data/ml-1m/test_101.csv', index=False)

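## Duplicate pipeline (train / validation only, no test split): negative samples for validation

Note: the cells in this section redefine `adjacency2df` and `get_hundred_negatives`, and overwrite `train.csv`, `gt.csv`, `val.csv`, `user2id.csv`, `movie2id.csv` and `val_101.csv` written by the earlier sections.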

In [8]:
# Leave-last-one-out split on each user's history ordered by timestamp:
# last item -> validation, everything earlier -> train.
# item_gt_list keeps the full ordered history.
user_list, item_train_list, item_valid_list, item_gt_list = [], [], [], []
for uid, timestamped_items in implicit_rating_matrix_dict.items():
    # sorted() is stable, so interactions sharing a timestamp keep their row order.
    chronological = [mid for mid, _ in sorted(timestamped_items, key=lambda pair: pair[1])]
    item_gt_list.append(chronological)
    user_list.append(uid)
    item_train_list.append(chronological[:-1])
    item_valid_list.append(chronological[-1])

In [9]:
def adjacency2df(user_list, item_list_of_list):
    """Expand per-user item lists into one (userId, movieId) row per interaction.

    Args:
        user_list: user ids, one per position.
        item_list_of_list: item_list_of_list[k] holds the items of user_list[k].

    Returns:
        pandas.DataFrame with columns 'userId' and 'movieId'; users whose item
        list is empty contribute no rows.
    """
    user_column = []
    movie_column = []
    for position, uid in enumerate(user_list):
        items = item_list_of_list[position]
        # One copy of the user id per item keeps both columns the same length.
        user_column += [uid] * len(items)
        movie_column += items
    return pd.DataFrame({'userId': user_column, 'movieId': movie_column})

In [10]:
# Long-format interaction tables: training history and full history.
train_edges = adjacency2df(user_list, item_train_list)
train_edges.to_csv('../data/ml-1m/train.csv', index=False)
gt_edges = adjacency2df(user_list, item_gt_list)
gt_edges.to_csv('../data/ml-1m/gt.csv', index=False)
# One held-out validation item per user.
val_targets = pd.DataFrame({'userId': user_list, 'movieId': item_valid_list})
val_targets.to_csv('../data/ml-1m/val.csv', index=False)

In [11]:
# Persist the original-id -> dense-index lookup tables next to the splits.
user2id_df = pd.DataFrame({
    'original_userId': list(user2idx),
    'userId': [user2idx[original_id] for original_id in user2idx],
})
movie2id_df = pd.DataFrame({
    'original_movieId': list(movie2idx),
    'movieId': [movie2idx[original_id] for original_id in movie2idx],
})
user2id_df.to_csv('../data/ml-1m/user2id.csv', index=False)
movie2id_df.to_csv('../data/ml-1m/movie2id.csv', index=False)

In [12]:
def get_hundred_negatives(item_train_list, item_valid_list, user_list, movie2idx,
                          num_negatives=100, pool_size=300):
    """Sample negative candidates for each user's held-out validation item.

    For every user a random pool of movie indices is drawn and every movie the
    user interacted with (train or validation) is filtered out. The first
    ``num_negatives`` survivors are kept and the held-out positive is appended
    as the last element.

    Exactly one entry is produced per user, in ``user_list`` order, so the
    result stays aligned with ``user_list``. If the filtered pool is too small,
    the negatives are drawn directly from the user's unseen movies instead of
    skipping the user.

    Args:
        item_train_list: per-user lists of training movie indices.
        item_valid_list: per-user held-out validation movie index.
        user_list: user ids; position k corresponds to position k of the item lists.
        movie2idx: mapping whose size defines the movie index range 0..len(movie2idx)-1.
        num_negatives: number of negatives in each entry.
        pool_size: size of the initial random draw before filtering.

    Returns:
        val_list: one entry per user, each ``num_negatives`` negatives followed
        by the held-out validation item.

    Raises:
        ValueError: if a user has fewer than ``num_negatives`` unseen movies.
    """
    num_movies = len(movie2idx)
    val_list = []
    for i, userID in enumerate(user_list):
        # A set gives O(1) membership tests while filtering the pool.
        seen = set(item_train_list[i])
        seen.add(item_valid_list[i])
        pool = random.sample(range(num_movies), min(pool_size, num_movies))
        negatives = [movie for movie in pool if movie not in seen]
        if len(negatives) < num_negatives:
            # Heavy raters can exhaust the pool: draw from the unseen movies directly.
            unseen = [movie for movie in range(num_movies) if movie not in seen]
            if len(unseen) < num_negatives:
                raise ValueError(
                    f"user {userID!r} has only {len(unseen)} non-interacted movies; "
                    f"{num_negatives} are needed"
                )
            negatives = random.sample(unseen, num_negatives)
        val_list.append(negatives[:num_negatives] + [item_valid_list[i]])
    return val_list

In [13]:
val_list = get_hundred_negatives(item_train_list, item_valid_list, user_list, movie2idx)


def get_dict(x, y):
    """Map each key in ``y`` to the value at the same position in ``x``.

    Raises:
        ValueError: if the two sequences differ in length. Pairing them anyway
            would silently attach values to the wrong keys.
    """
    if len(x) != len(y):
        raise ValueError(
            f"cannot pair {len(x)} values with {len(y)} keys: lengths must match"
        )
    return dict(zip(y, x))


val_dict = get_dict(val_list, user_list)
# One column per user; each column holds that user's sampled negatives followed
# by the held-out validation item. to_csv() returns None when given a path, so
# keep the frame in the variable and write it in a separate step.
val_df = pd.DataFrame(val_dict)
val_df.to_csv('../data/ml-1m/val_101.csv', index=False)

In [None]:
# Sanity check: size of the first user's entry (sampled negatives plus the held-out validation item).
len(val_list[0])