In [1]:
import pandas as pd
import numpy as np
from zipfile import ZipFile
from pathlib import Path
from tensorflow import keras

In [2]:
# Fetch the MovieLens "latest small" archive. `extract=False` because the
# archive is unpacked explicitly below.
movielens_data_file_url = (
    "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
)
movielens_zipped_file = keras.utils.get_file(
    "ml-latest-small.zip", movielens_data_file_url, extract=False
)
# The archive is unpacked into the directory that holds the downloaded zip.
keras_datasets_path = Path(movielens_zipped_file).parents[0]
movielens_dir = keras_datasets_path / "ml-latest-small"

# Only extract the data the first time the script is run.
if not movielens_dir.exists():
    # NOTE: the handle must not be called `zip`. Notebook cells share one global
    # namespace, so that name would shadow the builtin `zip()` that a later
    # cell calls, making the notebook fail on the first (extracting) run only.
    with ZipFile(movielens_zipped_file, "r") as archive:
        # Extract files
        print("Extracting all the files now...")
        archive.extractall(path=keras_datasets_path)
        print("Done!")

# Path of the ratings table inside the extracted dataset directory.
ratings_file = movielens_dir / "ratings.csv"

In [3]:
# Load the ratings CSV; the first row of the file supplies the column names.
df = pd.read_csv(filepath_or_buffer=ratings_file, header=0)

In [4]:
# Display the ratings frame loaded from the CSV.
df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [5]:
# Distinct movie ids (sorted) and, aligned with them, how many rating rows
# each id has.
unique, count = np.unique(df["movieId"].to_numpy(), return_counts=True)

In [6]:
# Pair every movie id with its rating count as (movie_id, n_ratings) tuples.
movie_counts = [(movie_id, n_ratings) for movie_id, n_ratings in zip(unique, count)]

In [7]:
# Keep only the ids of movies that have strictly more than 5 ratings.
kept_movies = [
    movie_id
    for movie_id, n_ratings in movie_counts
    if n_ratings > 5
]

In [8]:
# Number of movie ids that passed the rating-count filter.
len(kept_movies)

3268

In [9]:
# Restrict the ratings to rows whose movie is in the kept set.
output = df.loc[df["movieId"].isin(kept_movies)]

In [10]:
# Preview the first rows of the filtered ratings.
output.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [11]:
# Count of distinct movie ids before filtering (missing values, if any, are
# counted as one value, matching len(unique())).
df["movieId"].nunique(dropna=False)

9724

In [12]:
# Count of distinct movie ids after filtering (missing values, if any, are
# counted as one value, matching len(unique())).
output["movieId"].nunique(dropna=False)

3268

In [13]:
# Persist the filtered ratings. `to_pickle` does not create missing parent
# directories, so make sure "data/" exists before writing (a no-op when the
# directory is already there).
Path("data").mkdir(parents=True, exist_ok=True)
output.to_pickle("data/ratings.pkl")

In [14]:
# Read the pickle back and display it to check the round trip.
pd.read_pickle("data/ratings.pkl")

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100830,610,166528,4.0,1493879365
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
