In [None]:
import os
import requests
from zipfile import ZipFile

# Prior to importing networkx, set the environment variable to enable nx-cugraph
# Note: this is done here for demonstration purposes but should be done in the
# calling environment to ensure the code is portable to systems without a GPU.
os.environ["NX_CUGRAPH_AUTOCONFIG"] = "True"

import pandas as pd
import networkx as nx

In [None]:
# Paths (inside the zip archive and on disk) of the two CSV files used below.
ratings_csv = "ml-latest/ratings.csv"
movies_csv = "ml-latest/movies.csv"

# Download and unpack the dataset only when either CSV is missing, so that
# re-running the notebook does not fetch the archive again.
if not os.path.exists(ratings_csv) or not os.path.exists(movies_csv):
    zip_file = "ml-latest.zip"
    if not os.path.exists(zip_file):
        url = "https://files.grouplens.org/datasets/movielens/" + zip_file
        # Download to a temporary name and rename only on success: an
        # interrupted or failed download must never leave a file called
        # zip_file behind, because the exists() check above would then
        # accept the broken archive on every later run.
        partial_file = zip_file + ".part"
        with requests.get(url, stream=True, timeout=60) as resp:
            # Fail loudly on HTTP errors instead of saving an error page.
            resp.raise_for_status()
            with open(partial_file, "wb") as f:
                # Stream in 1 MiB chunks rather than holding the whole
                # archive in memory.
                for chunk in resp.iter_content(chunk_size=1 << 20):
                    f.write(chunk)
        os.replace(partial_file, zip_file)
    with ZipFile(zip_file, "r") as z:
        z.extract(ratings_csv)
        z.extract(movies_csv)

In [None]:
# Read the ratings with compact dtypes. The timestamp column is parsed and
# then discarded because nothing below uses it.
ratings_df = pd.read_csv(
    ratings_csv,
    dtype={
        "userId": "int32",
        "movieId": "int32",
        "rating": "float32",
        "timestamp": "int32",
    },
).drop(columns="timestamp")

# User IDs and movie IDs both start at 1, so the two ranges overlap.
# Shift every userId past the largest movieId so that each value refers to
# exactly one thing (either a user or a movie).
max_movie_id = int(ratings_df["movieId"].max())
ratings_df["userId"] = ratings_df["userId"].add(max_movie_id)

all_movie_ids = ratings_df["movieId"].unique()
all_user_ids = ratings_df["userId"].unique()

In [None]:
# Build a {movie id: title} lookup from the movies file, whose columns are
# id, title, genres (in that order).
# A real CSV parser is used instead of splitting on ",": titles that contain
# a comma are quoted in the file, and naive splitting would leave the quote
# characters inside the stored title. pd.read_csv also decodes the file as
# UTF-8 regardless of the platform's default encoding.
movies_df = pd.read_csv(movies_csv)
movie_id_name_map = dict(zip(movies_df.iloc[:, 0].tolist(),
                             movies_df.iloc[:, 1].tolist()))

In [None]:
# Jaccard similarity ignores edge weights, so a bad review and a good review
# would count the same. To base recommendations on movies people liked, keep
# a second DataFrame restricted to "good" reviews (rating >= 3).
good_ratings_df = ratings_df.loc[ratings_df["rating"].ge(3)]
good_movie_ids = good_ratings_df["movieId"].unique()
good_user_ids = good_ratings_df["userId"].unique()

# Summary of the full data set followed by the good-review subset.
print(
    f"total number of users: {len(all_user_ids)}",
    f"total number of reviews: {len(ratings_df)}",
    "average number of total reviews/user: "
    f"{len(ratings_df)/len(all_user_ids):.2f}",
    f"total number of users with good ratings: {len(good_user_ids)}",
    f"total number of good reviews: {len(good_ratings_df)}",
    "average number of good reviews/user: "
    f"{len(good_ratings_df)/len(good_user_ids):.2f}",
    sep="\n",
)

In [None]:
# User-movie graph of the good reviews only: one edge per review, from the
# (offset) userId to the movieId, with the rating stored as an edge attribute.
good_user_movie_G = nx.from_pandas_edgelist(
    good_ratings_df,
    source="userId",
    target="movieId",
    edge_attr="rating",
)

In [None]:
# Take one user from the good-review set and look up their neighbors in the
# graph, i.e. the movies they reviewed well.
user = good_user_ids[321]
user_reviews = good_user_movie_G[user]

# Each neighbor maps to its edge attributes; pick the movie whose "rating"
# attribute is largest (a missing rating counts as 0).
highest_rated_movie = max(
    user_reviews.items(),
    key=lambda review: review[1].get("rating", 0),
)[0]

print(
    f"highest rated movie for {user=} is "
    f"{movie_id_name_map[highest_rated_movie]}, id: {highest_rated_movie}, "
    f"rated: {user_reviews[highest_rated_movie]}"
)

In [None]:
# Pair the user's highest rated movie with every other well-reviewed movie in
# the graph. Every entry of good_movie_ids is considered (the only exclusion
# is the movie itself), so no candidate is dropped from the comparison.
ebunch = [(highest_rated_movie, other_movie) for other_movie in good_movie_ids
          if other_movie != highest_rated_movie]

In [None]:
%%time
# Run Jaccard Similarity 
jacc_coeffs = list(nx.jaccard_coefficient(good_user_movie_G, ebunch))

In [None]:
# Rank the candidates from most to least similar; the coefficient is the 3rd
# item of each tuple.
jacc_coeffs.sort(key=lambda t: t[2], reverse=True)

# Movies the user has already seen = every movie they rated, good or bad.
# This is taken from the full ratings table rather than the good-review graph
# so that a movie the user rated poorly is never recommended back to them.
movies_seen = ratings_df.loc[ratings_df["userId"] == user, "movieId"].tolist()

# Set membership keeps the filter linear in the number of candidates.
seen_lookup = set(movies_seen)

# Recommendations ordered from "best" to "worst" by Jaccard coefficient,
# skipping anything already seen.
recommendations = [mid for (_, mid, _) in jacc_coeffs
                   if mid not in seen_lookup]
if recommendations:
    mid = recommendations[0]
    print(f"User ID {user} might like {movie_id_name_map[mid]} "
          f"(movie ID: {mid})")
else:
    print(f"No unseen movie to recommend for user ID {user}")

In [None]:
def print_similar_movies(movie_id, n=10, backend="networkx"):
    """Print the ``n`` movies most similar to ``movie_id``.

    Similarity is the Jaccard coefficient between ``movie_id`` and every
    other entry of ``good_movie_ids``, computed on ``good_user_movie_G``.

    Parameters
    ----------
    movie_id : int
        ID of the movie to compare against; must be a key of
        ``movie_id_name_map``.
    n : int, optional
        Maximum number of similar movies to print. Fewer are printed when
        fewer candidates exist; nothing is printed for ``n <= 0``.
    backend : str, optional
        Passed through as the ``backend`` argument of
        ``nx.jaccard_coefficient``.

    Raises
    ------
    KeyError
        If ``movie_id`` (or a compared movie) has no entry in
        ``movie_id_name_map``.
    """
    # ebunch is the list of node pairs to generate Jaccard Similarity
    # coefficients for: movie_id paired with every other movie. All of
    # good_movie_ids is scanned; only movie_id itself is excluded.
    ebunch = [(movie_id, other_id) for other_id in good_movie_ids
              if other_id != movie_id]

    jacc_coeffs = list(
        nx.jaccard_coefficient(good_user_movie_G, ebunch, backend=backend))

    # Highest coefficient first; the coefficient is the 3rd tuple item.
    jacc_coeffs.sort(key=lambda t: t[2], reverse=True)
    print(f"movies similar to {movie_id_name_map[movie_id]}:")
    # Slicing (rather than indexing range(n)) tolerates a result list that is
    # shorter than n.
    for (_, movieId, _) in jacc_coeffs[:max(n, 0)]:
        print(f"{movieId=},\t{movie_id_name_map[movieId]}")


In [None]:
%%time
print_similar_movies(highest_rated_movie)

In [None]:
%%time
# 1367: "101 Dalmatians (1996)"
print_similar_movies(1367)

In [None]:
%%time
# 1196: "Star Wars: Episode V - The Empire Strikes Back (1980)"
print_similar_movies(1196)

In [None]:
%%time
# 2105: "Tron (1982)"
print_similar_movies(2105)

In [None]:
%%time
# 4878: "Donnie Darko (2001)"
print_similar_movies(4878)

In [None]:
%%time
# 1301: "Forbidden Planet (1956)"
print_similar_movies(1301)

In [None]:
%%time
# 2139: ""Secret of NIMH, The (1982)""
print_similar_movies(2139)

In [None]:
%%time
# 106072: "Thor: The Dark World (2013)"
print_similar_movies(106072)

In [None]:
%%time
# 318: ""Shawshank Redemption, The (1994)""
print_similar_movies(318)