In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

## Load Data

In [26]:
# Name of the dataset sub-directory; interpolated into the data paths used below.
dataset_name = 'ml-20m'

In [27]:
# Raw interaction table and movie catalogue for the selected dataset.
# NOTE(review): the paths are absolute and machine-specific; consider a configurable data directory.
ratings = pd.read_csv(f'/Users/ssdasgupta/research/set-based-collaborative-filtering/data/{dataset_name}/ratings.csv')
movies = pd.read_csv(f'/Users/ssdasgupta/research/set-based-collaborative-filtering/data/{dataset_name}/movies.csv')

## Filter interactions and re-index the items and users

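### Count filter. Design choice: drop users with fewer than 20 ratings; movies with fewer than 10 ratings are kept but excluded from the validation/test splits.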

In [28]:
# Activity thresholds: users below MIN_USER_RATINGS are dropped from `ratings`;
# movies below MIN_MOVIE_RATINGS are only collected in `count_10_movies`.
MIN_USER_RATINGS = 20
MIN_MOVIE_RATINGS = 10

# Ratings per user. sort=False keeps the keys in order of first appearance,
# matching the insertion order a row-by-row counting loop would produce.
user_id_count = ratings.groupby('userId', sort=False).size().to_dict()
user_reject_list = [userID for userID, count in user_id_count.items() if count < MIN_USER_RATINGS]
# .copy() detaches the filtered frame from the original so that later column
# assignments on `ratings` do not raise SettingWithCopyWarning.
ratings = ratings[~ratings['userId'].isin(user_reject_list)].copy()

# Ratings per movie, counted AFTER the user filter has been applied.
movie_id_count = ratings.groupby('movieId', sort=False).size().to_dict()

# Rarely rated movies (original MovieLens ids at this point).
count_10_movies = [movieID for movieID, count in movie_id_count.items() if count < MIN_MOVIE_RATINGS]
    

In [29]:
# Number of movies that have fewer than 10 ratings after the user filter.
len(count_10_movies)

11293

### Create User dict and item dict

In [30]:
# Dense 0-based ids, assigned in order of first appearance in `ratings`.
user2idx = {userID: i for i, userID in enumerate(ratings['userId'].unique())}
movie2idx = {movieID: i for i, movieID in enumerate(ratings['movieId'].unique())}
# Inverse of movie2idx; derived from it instead of recomputing unique().
idx2movie = {i: movieID for movieID, i in movie2idx.items()}
# assign() returns a new frame, so this never writes into a filtered slice
# (which is what triggers SettingWithCopyWarning).
ratings = ratings.assign(
    userId=ratings['userId'].map(user2idx),
    movieId=ratings['movieId'].map(movie2idx),
)
# Translate the rarely-rated movie list to the new dense ids as well.
# NOTE: this cell is not idempotent: it must run exactly once after the count filter.
count_10_movies = [movie2idx[movieID] for movieID in count_10_movies]

In [31]:
# Catalogue size minus the number of distinct movies that appear in `ratings`.
len(set(movies.movieId)) - len(set(ratings.movieId))

534

In [32]:
# Report the size of each id map and of the filtered interaction table.
for label, sized in (
    ('Number of users:', user2idx),
    ('Number of movies:', movie2idx),
    ('Number of ratings:', ratings),
):
    print(label, len(sized))

Number of users: 138493
Number of movies: 26744
Number of ratings: 20000263


## Get the interactions as an adjacency list for the leave-k-out split
Design choice: we record implicit feedback only (which movies a user rated, not the rating value).

In [33]:
# user id -> list of movie ids the user interacted with (rating values are ignored).
implicit_rating_matrix_dict = {}
# Every movie id seen in at least one interaction.
movieid_set = set()
for userID, movieID in zip(ratings['userId'], ratings['movieId']):
    implicit_rating_matrix_dict.setdefault(userID, []).append(movieID)
    movieid_set.add(movieID)

In [34]:
# Sanity check: these should match the sizes of user2idx / movie2idx.
n_users_in_adjacency = len(implicit_rating_matrix_dict)
n_movies_in_adjacency = len(movieid_set)
print('Number of users:', n_users_in_adjacency)
print('Number of movies:', n_movies_in_adjacency)

Number of users: 138493
Number of movies: 26744


## Do a leave-k-out style split

In [88]:
def leave_n_percent_out(data, n=10, seed=None, exclude_items=None):
    """Split every user's items into train / validation / test lists.

    For each user, ``k = len(items) * n // 100`` items go to validation and
    another ``k`` to test, drawn at random without replacement from the
    user's items that are not in ``exclude_items``. Everything else
    (including all excluded items) stays in train.

    Args:
        data: Mapping ``user -> list of item ids``.
        n: Percentage of a user's items held out for validation, and again
            for test.
        seed: Optional seed. When given, a dedicated ``np.random.default_rng``
            makes the split reproducible. When ``None``, the global
            ``np.random`` state is used, so ``np.random.seed`` still applies.
        exclude_items: Item ids that must never be placed in validation or
            test. Defaults to the module-level ``count_10_movies``.

    Returns:
        ``(user_list, item_gt_list, item_train_list, item_val_list,
        item_test_list)``: five parallel lists, one entry per user.

    Raises:
        ValueError: propagated from the sampler when a user has fewer than
            ``2 * k`` eligible (non-excluded) items.
    """
    if exclude_items is None:
        exclude_items = count_10_movies
    # Convert once; np.setdiff1d would otherwise re-convert the list per user.
    exclude_items = np.asarray(exclude_items)
    choose = np.random.choice if seed is None else np.random.default_rng(seed).choice

    user_list = []
    item_train_list = []
    item_val_list = []
    item_test_list = []
    item_gt_list = []
    movie_set = set()       # every item of every user
    movie_set_indv = set()  # items that ended up in some user's train split
    for user, item_list in data.items():
        item_gt_list.append(item_list)
        k = len(item_list) * n // 100
        if k == 0:
            # Validation and test will be empty for this user.
            print('user:', user, 'has too few items to hold out', n, 'percent')
        # Randomly select 2k eligible items: first k -> validation, rest -> test.
        candidates = np.setdiff1d(item_list, exclude_items)
        val_test_items = choose(candidates, size=2 * k, replace=False)
        val_items = list(val_test_items[:k])
        test_items = list(val_test_items[k:])
        # np.setdiff1d returns the remaining items sorted and de-duplicated.
        train_items = list(np.setdiff1d(item_list, val_test_items))
        user_list.append(user)
        item_train_list.append(train_items)
        item_val_list.append(val_items)
        item_test_list.append(test_items)
        movie_set.update(item_list)
        movie_set_indv.update(train_items)
    print('number of items:', len(movie_set))
    # NOTE: despite the label, this counts the distinct items present in train.
    print('number of items in total:', len(movie_set_indv))

    return user_list, item_gt_list, item_train_list, item_val_list, item_test_list

## Get the dataset splits
- Design choice: 90% train / 5% validation / 5% test, sampled at random for each user.

In [89]:
# Seed NumPy's global generator so the random split is reproducible across runs.
np.random.seed(42)
user_list, item_gt_list, item_train_list, item_val_list, item_test_list = leave_n_percent_out(implicit_rating_matrix_dict, n=5)

number of items: 26744
number of items in total: 26744


In [90]:
# Per-user invariants of the split. Sets make every check linear in the
# number of items, and each invariant is verified once.
for i, user in enumerate(user_list):
    train_items = set(item_train_list[i])
    val_items = set(item_val_list[i])
    test_items = set(item_test_list[i])

    # test + train + val = all items of the user
    assert train_items | val_items | test_items == set(implicit_rating_matrix_dict[user])

    # no overlap between train, val and test
    assert train_items.isdisjoint(val_items)
    assert train_items.isdisjoint(test_items)
    assert val_items.isdisjoint(test_items)

    # no empty train, val and test
    assert train_items
    assert val_items
    assert test_items

# Across all users, train + val + test together cover every movie id.
train_item_set = {item for items in item_train_list for item in items}
test_item_set = {item for items in item_test_list for item in items}
val_item_set = {item for items in item_val_list for item in items}
assert train_item_set.union(test_item_set).union(val_item_set) == set(movie2idx.values())

## Create a DataFrame out of the adjacency data

In [None]:
def adjacency2df(user_list, item_list_of_list):
    """Flatten a per-user adjacency list into a long (userId, movieId) table.

    ``item_list_of_list[i]`` holds the items of ``user_list[i]``; the result
    has one row per (user, item) pair, in the given order.
    """
    positions = range(len(user_list))
    user_column = [user_list[pos] for pos in positions for _ in range(len(item_list_of_list[pos]))]
    item_column = [entry for pos in positions for entry in item_list_of_list[pos]]
    return pd.DataFrame({'userId': user_column, 'movieId': item_column})

## Print/ Save test train val and id dict

In [41]:
# One long (userId, movieId) frame per split, plus the full ground truth.
train_df, val_df, test_df, gt_df = (
    adjacency2df(user_list, split_items)
    for split_items in (item_train_list, item_val_list, item_test_list, item_gt_list)
)

In [42]:
# Persist the ground truth and the three splits as CSV files (no index column)
# in the dataset directory.
gt_df.to_csv(f'/Users/ssdasgupta/research/set-based-collaborative-filtering/data/{dataset_name}/gt.csv', index=False)
train_df.to_csv(f'/Users/ssdasgupta/research/set-based-collaborative-filtering/data/{dataset_name}/train.csv', index=False)
val_df.to_csv(f'/Users/ssdasgupta/research/set-based-collaborative-filtering/data/{dataset_name}/val.csv', index=False)
test_df.to_csv(f'/Users/ssdasgupta/research/set-based-collaborative-filtering/data/{dataset_name}/test.csv', index=False)

In [46]:
# Original id -> dense id lookup table for users, written next to the splits.
user2id_df = pd.DataFrame({'original_userId': list(user2idx), 'userId': list(user2idx.values())})
user2id_df.to_csv(f'/Users/ssdasgupta/research/set-based-collaborative-filtering/data/{dataset_name}/user2id.csv', index=False)

# Same lookup table for movies.
movie2id_df = pd.DataFrame({'original_movieId': list(movie2idx), 'movieId': list(movie2idx.values())})
movie2id_df.to_csv(f'/Users/ssdasgupta/research/set-based-collaborative-filtering/data/{dataset_name}/movie2id.csv', index=False)

## Tags and genre integration with dataset

### Load the genre2movies
- Generate statistics on which movies are in genre2movies.
- Which movies are in the MovieLens catalog.
- Which movies are in the MovieLens ratings (not every catalog movie has ratings).

In [47]:
# Genre -> movie-title pairs.
genre2movies = pd.read_csv(f'/Users/ssdasgupta/research/set-based-collaborative-filtering/data/genre2movies/genre2movies.csv')
# Title -> MovieLens id lookup. Read from the selected dataset's catalogue
# rather than a hard-coded 'ml-20m', so it stays consistent with `ratings`/`movies`.
movie_name2lensid = pd.read_csv(f'/Users/ssdasgupta/research/set-based-collaborative-filtering/data/{dataset_name}/movies.csv')

In [49]:
# Title sets are computed once and reused for every statistic below.
catalog_titles = set(movies.title)
genre_titles = set(genre2movies.movie.unique())
rated_movie_ids = set(ratings.movieId)

print('Number of movies in movie lens catelog:', len(catalog_titles))
print('Number of movies in genre2movies:', len(genre_titles))
print('Number of movies in movielens ratings', len(rated_movie_ids))
print('Number of movies in genre2movies and in movielense catelog', len(catalog_titles & genre_titles))
print('Number of movies in genre2movies and not in movielense catelog', len(genre_titles - catalog_titles))
print('Number of movies in movielense catelog and not in genre2movies', len(catalog_titles - genre_titles))

Number of movies in movie lens catelog: 27262
Number of movies in genre2movies: 25877
Number of movies in movielens ratings 26744
Number of movies in genre2movies and in movielense catelog 25877
Number of movies in genre2movies and not in movielense catelog 0
Number of movies in movielense catelog and not in genre2movies 1385


In [50]:
# Catalogue movies that have a genre2movies entry but no rating in the
# filtered dataset. The title set is built once (instead of calling unique()
# and scanning it on every iteration), and the loop variable no longer
# shadows the builtin `id`.
genre_titles = set(genre2movies.movie.unique())
count = 0
for name, movie_id in zip(movies.title, movies.movieId):
    if movie_id not in movie2idx and name in genre_titles:
        count += 1
print('Number of movies in genre2movies and not in ratings', count)

Number of movies in genre2movies and not in ratings 473


### Map the movies from the genre2movies to the movielens

In [51]:
# Movie title -> dense movie id, restricted to movies present in `movie2idx`.
# `count` tallies catalogue entries that have no ratings and are skipped.
# The loop variable is named `lens_id` so the builtin `id` is not shadowed
# for the rest of the notebook.
movie_name2id = {}
count = 0
for movie, lens_id in zip(movie_name2lensid.title, movie_name2lensid.movieId):
    if lens_id not in movie2idx:
        count += 1
        continue
    # If two catalogue entries share a title, the later one wins.
    movie_name2id[movie] = movie2idx[lens_id]
print(count)

534


In [52]:
# genre -> list of dense movie ids, keeping only titles that resolve to a movie id.
genre2movieid = {}
movie_not_in_dataset = set()
movie_in_dataset = set()
for movie, genre in zip(genre2movies.movie, genre2movies.genre):
    if movie not in movie_name2id:
        movie_not_in_dataset.add(movie)
        continue
    genre2movieid.setdefault(genre, []).append(movie_name2id[movie])
    movie_in_dataset.add(movie)

print(f"# {len(movie_not_in_dataset)} Movie not found in the dataset:")
print(f"# {len(movie_in_dataset)} Movie found in the dataset:")

# 473 Movie not found in the dataset:
# 25404 Movie found in the dataset:


### Create Genre, user to movies

In [53]:
# Invert genre2movieid: dense movie id -> list of genres it belongs to.
movieid2genre = {}
for genre, movieid_list in genre2movieid.items():
    for movieid in movieid_list:
        movieid2genre.setdefault(movieid, []).append(genre)

In [54]:
# (genre, user) -> set of the user's movies that carry that genre.
genre_user2movieid = {}
movies_not_found = set()   # interacted movies without any genre entry
movies_found = set()       # interacted movies with at least one genre
total_movies = set()       # every interacted movie
for user, item_list in tqdm(implicit_rating_matrix_dict.items()):
    total_movies.update(item_list)
    for item in item_list:
        if item not in movieid2genre:
            movies_not_found.add(item)
            continue
        genre_list = movieid2genre[item]
        for genre in genre_list:
            genre_user2movieid.setdefault((genre, user), set()).add(item)
        movies_found.add(item)

100%|██████████| 138493/138493 [01:19<00:00, 1743.37it/s]


#### Design choice: only (genre, user) pairs with at least 20 movies are considered.

In [55]:
# Keep only the (genre, user) pairs that have at least 20 movies.
# A comprehension is used so that no loop variable leaks into the notebook
# namespace; a `movieid_set` loop variable would overwrite the global
# `movieid_set` that holds all movie ids.
genre_user2movieid_filtered = {
    genre_user: movie_ids
    for genre_user, movie_ids in genre_user2movieid.items()
    if len(movie_ids) >= 20
}
print(f"Number of (genre, user) tuple to evaluate on : {len(genre_user2movieid_filtered)}")

Number of (genre, user) tuple to evaluate on : 1040492


### Save the genre, user, item dataset

In [56]:
# Flatten the filtered mapping to one row per (user, genre, movie).
# Dedicated column-list names are used so the split's `user_list` is not
# overwritten; reusing that name makes the split and assertion cells fail
# if they are re-run after this one.
gui_users = []
gui_genres = []
gui_items = []
for (genre, user), items in genre_user2movieid_filtered.items():
    gui_users.extend([user] * len(items))
    gui_genres.extend([genre] * len(items))
    gui_items.extend(list(items))
genre_user_item_df = pd.DataFrame({'userId': gui_users, 'genre': gui_genres, 'movieId': gui_items})
genre_user_item_df.to_csv(f'/Users/ssdasgupta/research/set-based-collaborative-filtering/data/{dataset_name}/genre_user_item.csv', index=False)

### Get tag data from Movielens

In [57]:
# User-applied tags for the selected dataset; the cells below read its `movieId` and `tag` columns.
tags_df = pd.read_csv(f'/Users/ssdasgupta/research/set-based-collaborative-filtering/data/{dataset_name}/tags.csv')

In [81]:
def get_tag2movieid(tags_df, movie_index=None):
    """Group movies by lower-cased tag.

    Args:
        tags_df: Frame with ``movieId`` (original movie ids) and ``tag`` columns.
        movie_index: Mapping from original movie id to dense movie id.
            Defaults to the module-level ``movie2idx``. Rows whose movie is
            not in this mapping are ignored.

    Returns:
        dict mapping each lower-cased tag to the set of dense movie ids that
        carry it.
    """
    if movie_index is None:
        movie_index = movie2idx
    tag2movieid = {}
    for movieid, tag in zip(tags_df.movieId, tags_df.tag):
        # A missing tag would otherwise be stringified into a bogus 'nan' tag.
        if pd.isna(tag):
            continue
        if movieid not in movie_index:
            continue
        tag2movieid.setdefault(str(tag).lower(), set()).add(movie_index[movieid])
    return tag2movieid

In [82]:
# tag string -> set of dense movie ids
tag2movieid = get_tag2movieid(tags_df)
# Live view of the tag strings, in insertion order.
tag_vocab = tag2movieid.keys()
# Dense tag ids follow the vocabulary order.
tag2id = dict(zip(tag_vocab, range(len(tag_vocab))))
# tag id -> list of dense movie ids
tagid2movieid = {tag_id: list(tag2movieid[tag]) for tag, tag_id in tag2id.items()}

In [83]:
def dict2df(_dict, key_name='key', value_name='value'):
    """Explode a ``key -> collection`` mapping into a two-column long table.

    Every element of every collection becomes one row, paired with its key;
    keys with empty collections contribute no rows.
    """
    repeated_keys = [key for key, members in _dict.items() for _ in range(len(members))]
    flattened = [member for members in _dict.values() for member in list(members)]
    return pd.DataFrame({key_name: repeated_keys, value_name: flattened})


In [61]:
# Tag vocabulary table (tag string <-> tag id).
tag2id_df = pd.DataFrame({'tag': list(tag2id), 'tagId': list(tag2id.values())})
tag2id_df.to_csv(f'/Users/ssdasgupta/research/set-based-collaborative-filtering/data/{dataset_name}/tag2id.csv', index=False)

# Long (tagId, movieId) table: one row per tagged movie.
tagid2movieid_df = dict2df(tagid2movieid, key_name='tagId', value_name='movieId')
tagid2movieid_df.to_csv(f'/Users/ssdasgupta/research/set-based-collaborative-filtering/data/{dataset_name}/tag2movie.csv', index=False)

In [63]:
# Number of distinct tags with at least one movie. The frame built above is
# `tagid2movieid_df`; `tag2movieid_df` is never defined in this notebook.
tagid2movieid_df['tagId'].nunique()

34999

In [74]:
# Rebuild the tag -> movies mapping keyed by tag id while tracking coverage.
# NOTE: this rebinds `tag2movieid` from a string-keyed to an id-keyed dict.
tag2movieid = {}
missed_tag = set()      # tags seen on at least one movie absent from movie2idx
total_tags = set()      # every lower-cased tag in tags_df
included_tags = set()   # tags seen on at least one movie present in movie2idx
for movieid, tag in zip(tags_df.movieId, tags_df.tag):
    tag = str(tag).lower()
    total_tags.add(tag)
    if movieid not in movie2idx:
        missed_tag.add(tag)
        continue
    included_tags.add(tag)
    if tag not in tag2id:
        continue
    tagid = tag2id[tag]
    tag2movieid.setdefault(tagid, set()).add(movie2idx[movieid])

In [85]:
# Preview of the raw tag table.
tags_df

Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,1240597180
1,65,208,dark hero,1368150078
2,65,353,dark hero,1368150079
3,65,521,noir thriller,1368149983
4,65,592,dark hero,1368150078
...,...,...,...,...
465559,138446,55999,dragged,1358983772
465560,138446,55999,Jason Bateman,1358983778
465561,138446,55999,quirky,1358983778
465562,138446,55999,sad,1358983772


In [79]:
# Tags that occur only on movies absent from movie2idx (never on an included movie).
len(missed_tag.difference(included_tags))

174

In [70]:
# Vocabulary size minus the number of tag ids that received at least one movie.
# NOTE(review): depends on which definitions of tag_vocab / tag2movieid are live in the kernel; confirm after Restart & Run All.
len(tag_vocab) - len(tag2movieid.keys())

174

## Some dummy code

In [None]:
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader

def _pick_column(frame, candidates):
    """Return the first name in ``candidates`` that is a column of ``frame``."""
    for name in candidates:
        if name in frame.columns:
            return name
    raise KeyError(f'none of the columns {candidates} found in {list(frame.columns)}')


def load_data(data_path, batch_size, user_col=None, item_col=None):
    """Load a (user, item) interaction CSV into a shuffling ``DataLoader``.

    Args:
        data_path: Path of the CSV file to read.
        batch_size: Number of (user, item) pairs per batch.
        user_col: Name of the user-id column. When omitted, ``'user_id'`` is
            used if present, otherwise ``'userId'`` (the header this
            notebook's ``adjacency2df`` writes).
        item_col: Name of the item-id column. When omitted, ``'item_id'`` is
            used if present, otherwise ``'movieId'``.

    Returns:
        A ``DataLoader`` over a ``TensorDataset`` of two ``LongTensor``s
        (users, items), with ``shuffle=True``.

    Raises:
        KeyError: if no suitable user or item column exists in the file.
    """
    data = pd.read_csv(data_path)
    user_col = user_col or _pick_column(data, ('user_id', 'userId'))
    item_col = item_col or _pick_column(data, ('item_id', 'movieId'))
    user = data[user_col].values
    item = data[item_col].values
    dataset = TensorDataset(torch.LongTensor(user), torch.LongTensor(item))
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Build one loader per split with a batch size of 32.
# NOTE(review): these paths point at 'ml-latest-small', not at `dataset_name`; confirm this is intended.
train_loader = load_data('/Users/ssdasgupta/research/set-based-collaborative-filtering/data/ml-latest-small/train.csv', 32)
val_loader = load_data('/Users/ssdasgupta/research/set-based-collaborative-filtering/data/ml-latest-small/val.csv', 32)
test_loader = load_data('/Users/ssdasgupta/research/set-based-collaborative-filtering/data/ml-latest-small/test.csv', 32)


In [None]:
# Each batch is a (users, items) pair of tensors; print them to eyeball the loader.
for user, item in train_loader:
    print(user, item)