In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

<img src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png" style="width: 90px; float: right;">

# Getting Started MovieLens: Download and Convert

## MovieLens 100K Dataset

The [MovieLens 100K Dataset](https://grouplens.org/datasets/movielens/100k/) is a popular dataset for recommender systems and is widely used in academic publications. The dataset contains 100,000 movie ratings for 1682 movies given by 943 users. Many projects use only the user/item/rating information of MovieLens, but the original dataset also provides metadata for the movies, such as the genres each movie belongs to.

In this set of notebooks we will explore enriching the data with poster information (images).

## Download the dataset

In [2]:
# External dependencies
import os

from merlin.core.utils import download_file

# Get dataframe library - cudf or pandas
from merlin.core.dispatch import get_lib
import pandas as pd
import numpy as np

# Dataframe module handed back by merlin's dispatch helper (cudf or pandas).
# The cells below use it for `df_lib.DataFrame(...)` and `df_lib.read_csv(...)`.
df_lib = get_lib()

We define our base input directory, containing the data.

In [3]:
# Directory that holds the downloaded archive and every file this notebook writes.
# The INPUT_DATA_DIR environment variable, when set, takes precedence over the default.
default_data_dir = os.path.expanduser("~/nvt-examples/movielens_100k/data/")
INPUT_DATA_DIR = os.environ.get("INPUT_DATA_DIR", default_data_dir)

In [4]:
# Fetch the MovieLens 100K archive into INPUT_DATA_DIR
# (the recorded output below shows the archive being unzipped after the download).
archive_url = "https://files.grouplens.org/datasets/movielens/ml-100k.zip"
archive_path = os.path.join(INPUT_DATA_DIR, "ml-100k.zip")
download_file(archive_url, archive_path)

downloading ml-100k.zip: 4.94MB [00:03, 1.65MB/s]                                                                                                                                                                                                                                                                                                                                         
unzipping files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [00:00<00:00, 342.43files/s]


## Convert the dataset

Let's take a look at the movie metadata.

In [5]:
# Reading this latin-1 file directly with cudf leads to an encoding error down the road,
# so it is parsed on the host with pandas first and then wrapped by the active backend.
movies_host = pd.read_csv(
    os.path.join(INPUT_DATA_DIR, "ml-100k/u.item"), sep='|', header=None, encoding='latin-1'
)
movies = df_lib.DataFrame(movies_host)
movies.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


Let's reshape the `movies` dataframe so that the genre information is stored in a single column. This will make it easier to work with down the road.

We will also give meaningful names to our columns and retain only the ones that we are likely to need.

In [6]:
# u.genre holds two '|'-separated fields per line: the genre name and its numeric id.
# Both columns are named explicitly so the genre name always lands in the 'genre'
# column; with a single name, pandas would promote the first field (the name) to the
# index and store the id under 'genre', which breaks the positional lookup used later.
genres = df_lib.read_csv(
    os.path.join(INPUT_DATA_DIR, "ml-100k/u.genre"),
    header=None,
    names=['genre', 'genre_id'],
    sep='|',
)

In [7]:
# Collapse the one-hot genre flags (column positions 5 onward) into one list of genre
# names per movie.
#
# `.to_pandas()` exists only on cudf objects. When `get_lib()` returned pandas the
# frames are already host-side, so the conversion is applied only when available.
movies_host = movies.to_pandas() if hasattr(movies, "to_pandas") else movies

genre_names = genres['genre']
if hasattr(genre_names, "to_pandas"):
    genre_names = genre_names.to_pandas()
# A plain Python list avoids one dataframe lookup per flag inside the row-wise apply.
genre_names = genre_names.tolist()

genres_single_column = movies_host.iloc[:, 5:].apply(
    # np.where(row)[0] gives the positions of the non-zero flags in this row;
    # position i corresponds to row i of u.genre.
    lambda row: [genre_names[i] for i in np.where(row)[0]], axis=1
)

In [8]:
# Replace the one-hot genre columns (labels 5-23) with the single list-valued column.
# `.to_pandas()` is available only on cudf frames; a pandas `movies` (CPU-only
# environment) is already host-side and is concatenated as is.
movies_meta = movies.drop(columns=range(5, 24))
if hasattr(movies_meta, "to_pandas"):
    movies_meta = movies_meta.to_pandas()
movies = pd.concat([movies_meta, genres_single_column], axis=1)

In [9]:
# Keep only the title (position 1) and the genre list (last position), selected by
# position, and give the two remaining columns readable names.
name_and_genre = movies.iloc[:, [1, -1]]
name_and_genre.columns = ['Name', 'genre']
movies = name_and_genre

In [10]:
# Preview the first five reshaped rows.
movies.head(n=5)

Unnamed: 0,Name,genre
0,Toy Story (1995),"[Animation, Children's, Comedy]"
1,GoldenEye (1995),"[Action, Adventure, Thriller]"
2,Four Rooms (1995),[Thriller]
3,Get Shorty (1995),"[Action, Comedy, Drama]"
4,Copycat (1995),"[Crime, Drama, Thriller]"


Let's store the movies information for future use.

In [11]:
# Persist the converted movie metadata (name + genre list) next to the raw data.
movies_parquet_path = os.path.join(INPUT_DATA_DIR, "movies_converted.parquet")
movies.to_parquet(movies_parquet_path)

## Splitting into train and validation datasets

Let us read in the user - movie_id - rating information.

In [12]:
# u.data is tab-separated and has no header row, so the column names are supplied here.
ratings_path = os.path.join(INPUT_DATA_DIR, "ml-100k/u.data")
rating_columns = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = df_lib.read_csv(ratings_path, sep='\t', header=None, names=rating_columns)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


We drop the timestamp column and split the ratings into training and validation datasets. We use a simple random split.

In [13]:
SPLIT_SEED = 42       # fixed seed so that Restart & Run All reproduces the same split
VALID_FRACTION = 0.2  # share of the ratings held out for validation

ratings = ratings.drop("timestamp", axis=1)

# shuffle the dataset (seeded, so the resulting split is reproducible across runs)
ratings = ratings.sample(len(ratings), replace=False, random_state=SPLIT_SEED)

# split the shuffled ratings into training and validation data sets.
# Slicing by the train length (rather than `[:-num_valid]`) stays correct even when
# num_valid is 0, where `[:-0]` would leave the training set empty.
num_valid = int(len(ratings) * VALID_FRACTION)
num_train = len(ratings) - num_valid

train = ratings[:num_train]
valid = ratings[num_train:]

# every rating must end up in exactly one of the two splits
assert len(train) + len(valid) == len(ratings)

We save the dataset to disk.

In [14]:
# Write both splits as parquet files into INPUT_DATA_DIR.
for split_df, split_filename in ((train, "train.parquet"), (valid, "valid.parquet")):
    split_df.to_parquet(os.path.join(INPUT_DATA_DIR, split_filename))

## Next steps

If you wish to download movie posters for the MovieLens 100k, then proceed through notebooks 02-04.

If you wish to use synthetic data, then proceed to notebook 05, synthetic data section.