In [1]:
from __future__ import print_function, division
from builtins import range, input
from collections import Counter

import pandas as pd
import numpy as np
import pickle

%matplotlib inline

In [None]:
# Load the raw ratings table. The csv file was downloaded from
# https://www.kaggle.com/grouplens/movielens-20m-dataset
# and is read via a relative path, so 'rating.csv' must be in the
# current working directory.
df = pd.read_csv('rating.csv')


In [None]:
# Note:
# - user ids are sequential integers from 1..138493, with no gaps
# - movie ids are integers in the range 1..131262, but NOT every id
#   in that range appears: only 26744 distinct movie ids are present


In [None]:
# Shift the user ids down by one so they run from 0...N-1.
# Not idempotent: running this cell twice shifts the ids twice.
df['userId'] = df['userId'] - 1

In [None]:
# Create a mapping from each raw movie id to a compact 0-based index.
# Indices are handed out in the set's iteration order, so they carry no
# meaning beyond being unique and contiguous.
unique_movie_ids = set(df.movieId.values)
movie2idx = {movie_id: idx for idx, movie_id in enumerate(unique_movie_ids)}

# Add the compact index to the data frame. Series.map performs the dict
# lookup for the whole column at once instead of calling a Python lambda
# once per row. Every movieId is a key of movie2idx by construction, so
# no lookup can miss.
df['movie_idx'] = df.movieId.map(movie2idx)

# The timestamp column is dropped before saving.
df = df.drop(columns=['timestamp'])

df.to_csv('edited_rating.csv', index=False)


In [2]:
# Reload the preprocessed ratings from disk and report the row count.
df = pd.read_csv('edited_rating.csv')
print("original dataframe size:", df.shape[0])


original dataframe size: 20000263


In [3]:
# Dataset dimensions, taken as "largest id plus one".
N = df['userId'].max() + 1  # number of users
M = df['movie_idx'].max() + 1  # number of movies

# Number of ratings per user id and per movie index.
user_ids_count = Counter(df['userId'])
movie_ids_count = Counter(df['movie_idx'])

In [17]:
# The full dataset is too large to work with comfortably,
# so only 10000 users and 2000 movies are kept.

In [4]:
# number of users and movies we would like to keep
n = 10000
m = 2000

# Keep the n users and m movies with the most ratings.
# The loop variables are deliberately NOT called `n` / `m`: reusing `m`
# inside the comprehension shadows the limit above, and under Python 2
# (which this notebook's __future__/builtins imports target) the loop
# variable leaks out and would overwrite `m` with a movie id.
user_ids = [user_id for user_id, _count in user_ids_count.most_common(n)]
movie_ids = [movie_idx for movie_idx, _count in movie_ids_count.most_common(m)]


In [5]:
# Keep only the ratings whose user AND movie were both selected.
# .copy() makes df_small an independent frame rather than a view of df.
user_kept = df['userId'].isin(user_ids)
movie_kept = df['movie_idx'].isin(movie_ids)
df_small = df.loc[user_kept & movie_kept].copy()


In [6]:
# The surviving ids are no longer contiguous, so renumber them
# sequentially from 0 in the order they appear in user_ids / movie_ids.
new_user_id_map = {old: new for new, old in enumerate(user_ids)}
i = len(user_ids)
print("i:", i)

new_movie_id_map = {old: new for new, old in enumerate(movie_ids)}
j = len(movie_ids)
print("j:", j)
print("Setting new ids")

i: 10000
j: 2000
Setting new ids


In [None]:
# Relabel the users and movies of the reduced frame with their new
# sequential ids. This changes the ids only; no rows are removed.
# Series.map does the dict lookup for a whole column at once instead of
# calling a Python lambda once per row.
remapped_user_ids = df_small['userId'].map(new_user_id_map)
remapped_movie_idx = df_small['movie_idx'].map(new_movie_id_map)

# Series.map yields NaN for an id that is missing from the dict, where
# plain dict indexing would raise. Fail loudly instead of writing NaNs,
# e.g. if this cell is re-run on ids that have already been remapped.
if remapped_user_ids.isna().any() or remapped_movie_idx.isna().any():
  raise KeyError("some ids have no entry in new_user_id_map / new_movie_id_map")

df_small['userId'] = remapped_user_ids
df_small['movie_idx'] = remapped_movie_idx

print("max user id:", df_small.userId.max())
print("max movie id:", df_small.movie_idx.max())

print("small dataframe size:", len(df_small))


In [None]:
# Write the reduced ratings table to disk; index=False keeps the
# DataFrame index out of the csv.
df_small.to_csv('small_rating.csv', index=False)
