In [20]:
# Import key libraries
import pandas as pd
import numpy as np
from zipfile import ZipFile
from pathlib import Path
from tensorflow import keras

In [21]:
# Fetch the MovieLens "latest-small" archive; get_file's return value is used
# below as the local path of the downloaded zip.
movielens_data_file_url = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
movielens_zipped_file = keras.utils.get_file(
    fname="ml-latest-small.zip",
    origin=movielens_data_file_url,
    extract=False,
)
# The archive is expected to unpack into a sibling "ml-latest-small" folder.
keras_datasets_path = Path(movielens_zipped_file).parent
movielens_dir = keras_datasets_path / "ml-latest-small"

# Unpack once: extraction is skipped whenever the target folder already exists.
if not movielens_dir.exists():
    with ZipFile(movielens_zipped_file, mode="r") as archive:
        print("Extracting all the files now...")
        archive.extractall(path=keras_datasets_path)
        print("Done!")

# Path of the ratings table that the following cells load.
ratings_file = movielens_dir / "ratings.csv"

In [22]:
# Load the ratings CSV into a dataframe; the first row of the file supplies
# the column names (header=0).
df = pd.read_csv(filepath_or_buffer=ratings_file, header=0)
df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [24]:
# Count how many ratings each movieId has, then keep only the ids that occur
# more than 5 times. np.unique returns the ids sorted, so kept_movies is sorted too.
unique, count = np.unique(df["movieId"], return_counts=True)
movie_counts = list(zip(unique, count))  # (movieId, number of ratings) pairs
kept_movies = [
    movie_id
    for movie_id, n_ratings in movie_counts
    if n_ratings > 5
]
print("Total Kept Movies:", len(kept_movies))
# Preview the first few retained ids.
kept_movies[:15]

Total Kept Movies: 3268


[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

In [25]:
# Restrict the ratings to rows whose movieId is in kept_movies
# (the movies with more than 5 ratings).
is_kept_movie = df["movieId"].isin(kept_movies)
output = df.loc[is_kept_movie]
output

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100830,610,166528,4.0,1493879365
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047


In [26]:
# Report the number of distinct movieIds before and after filtering.
# dropna=False keeps the count identical to len(Series.unique()).
print("Initial Unique Movie Count:", df["movieId"].nunique(dropna=False))
print(
    "After Processing Unique Movie Count:",
    output["movieId"].nunique(dropna=False),
)

Initial Unique Movie Count: 9724
After Processing Unique Movie Count: 3268


In [27]:
# Store the filtered ratings in pkl format, then read the file back to
# confirm the round trip.
ratings_pickle_path = Path("data/ratings.pkl")
# to_pickle does not create missing directories, so make sure "data/" exists;
# otherwise this cell fails on a fresh checkout / Restart & Run All.
ratings_pickle_path.parent.mkdir(parents=True, exist_ok=True)
output.to_pickle(ratings_pickle_path)
# Only unpickle files this notebook wrote itself; pickle is unsafe on untrusted data.
pd.read_pickle(ratings_pickle_path)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100830,610,166528,4.0,1493879365
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
