# Graph based approach

This notebook is a combination of multiple scripts, that have run independtly and have been combined to a single notebook, to make it easier to read. The scripts were coded independently, so they were easy to run on DTU's HPC environment.

## Imports

In [1]:
import networkx as nx
import pandas as pd
import pickle
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import pyarrow as pa
from itertools import combinations
from joblib import Parallel, delayed
import matplotlib.pyplot as plt

## Generating the graph

In [None]:
ratings = pd.read_csv("data/ml-32m/ratings.csv")
movie_descs = pd.read_csv("data/movies_with_description.csv")

# Filter all ratings that not in movie_descs

ratings = ratings[ratings["movieId"].isin(movie_descs["movieId"])]

# Remove all ratings that are less than 5
ratings_filtered = ratings.query("rating > 4")


# Create an edgelist from the dataframe
edges = defaultdict(lambda: 0)

# Group movies by user
movies_per_user = ratings_filtered.groupby("userId").apply(
    lambda x: x["movieId"].values
)

# Create a function that can process the movies for a user independently
def process_movies(movies):
    local_edges = defaultdict(lambda: 0)
    for movie1, movie2 in combinations(movies, 2):
        key = tuple(sorted((movie1, movie2)))
        local_edges[key] += 1
    return local_edges

# Process the movies in parallel
results = Parallel(n_jobs=-1)(
    delayed(process_movies)(movies)
    for movies in tqdm(movies_per_user, desc="Creating edges")
)

# Combine the results
for local_edges in tqdm(results, desc="Combining results"):
    for key, value in local_edges.items():
        edges[key] += value

# Convert the edges to a pyarrow table
print("Save the edges to a parquet file")

edges_table = pa.Table.from_pandas(
    pd.DataFrame(
        [
            {"movie1": key[0], "movie2": key[1], "weight": value}
            for key, value in edges.items()
        ]
    )
)

# Save the table to a file
edges_table.to_pandas().to_parquet("data/edges-movie-all.parquet")

## Converting the edges to a NetworkX graph

In [None]:
edgelist = pd.read_parquet("data/edges-movie-all.parquet")
# Create a graph from the edgelist
G = nx.Graph()

total = len(edgelist)

for _, row in tqdm(edgelist.iterrows(), total=total):
    G.add_edge(row["movie1"], row["movie2"], weight=row["weight"])

with open("data/movie_graph.pickle", "wb") as f:
    pickle.dump(G, f)

## Predicting with the graph

### Function for predicting movies for a user

In [None]:
def predict_movies(
    G: nx.Graph,
    user_watched_movies: list[int],
    weighted: bool = True,
    combination: bool = False,
):
    """
    Predicts movies for a user based on their watched movies and a graph of movie relationships.

    Args:
        G: Graph representing movie relationships. Each edge can have a weight.
        user_watched_movies: List of movies watched by the user.
        weighted: Boolean indicating whether predictions should consider edge weights.

    Returns:
        A sorted list of recommended movie IDs based on their predicted relevance.
    """
    # Use a single dictionary for neighbor counts or weights
    recommendations = defaultdict(float)
    recommendations_weighted = defaultdict(float)

    # Iterate over each watched movie and process its neighbors
    for movie in user_watched_movies:
        for neighbor, edge_attrs in G.adj[movie].items():
            if neighbor not in user_watched_movies:
                weight = edge_attrs.get("weight", 1)
                recommendations[neighbor] += 1
                recommendations_weighted[neighbor] += weight

    # Convert recommendations to a DataFrame and sort by the chosen metric
    predictions = (
        pd.DataFrame.from_dict(recommendations, orient="index")
        .reset_index()
        .rename(columns={"index": "movieId", 0: "count"})
        .sort_values(by="count", ascending=False)
    )

    predictions_weighted = (
        pd.DataFrame.from_dict(recommendations, orient="index")
        .reset_index()
        .rename(columns={"index": "movieId", 0: "weight"})
        .sort_values(by="weight", ascending=False)
    )

    if combination:
        new_predictions = predictions.merge(
            predictions_weighted, on="movieId", how="inner"
        )

        new_predictions["combined"] = (
            np.log(new_predictions["count"]) * new_predictions["weight"]
        )

        return new_predictions.sort_values(by="combined", ascending=False)[
            "movieId"
        ].tolist()

    if weighted:
        return predictions_weighted["movieId"].tolist()

    return predictions["movieId"].tolist()

### Using the function to predict movies for the user subset

In [None]:
print("Loading graph...", flush=True)
with open("data/movie_graph.pickle", "rb") as f:
    G = pickle.load(f)
print("Graph loaded!", flush=True)


print("Loading data...", flush=True)
ratings = pd.read_csv("data/ml-32m/ratings.csv")
movie_descs = pd.read_csv("data/movies_with_description.csv")
ratings = ratings[ratings["movieId"].isin(movie_descs["movieId"])]
ratings = ratings[ratings["rating"] >= 5.0]
print("Data loaded!", flush=True)


print("Analysing predetermined users...", flush=True)
users_to_analyze = [304, 6741, 147001]

preds = {u: [] for u in users_to_analyze}
preds_weighted = {u: [] for u in users_to_analyze}
preds_combined = {u: [] for u in users_to_analyze}

for user in users_to_analyze:
    movies_watched = ratings[ratings["userId"] == user]["movieId"].tolist()

    preds[user] = predict_movies(G, movies_watched, weighted=True)
    preds_weighted[user] = predict_movies(G, movies_watched, weighted=True)
    preds_combined[user] = predict_movies(G, movies_watched, combination=True)

with open("data/predictions.pickle", "wb") as f:
    pickle.dump(preds, f)

with open("data/predictions_weighted.pickle", "wb") as f:
    pickle.dump(preds_weighted, f)

with open("data/predictions_combined.pickle", "wb") as f:
    pickle.dump(preds_combined, f)

print("Predictions saved!", flush=True)

## Evaluating the predictions

In [2]:
# Load the movies and ratings
ratings = pd.read_csv("data/ml-32m/ratings.csv")
movie_descs = pd.read_csv("data/movies_with_description.csv")
ratings = ratings[ratings["movieId"].isin(movie_descs["movieId"])]
ratings = ratings[ratings["rating"] >= 5.0]

# Load the predictions for the predetermined users
with open("data/predictions_weighted.pickle", "rb") as f:
    preds_weighted = pickle.load(f)

with open("data/predictions_combined.pickle", "rb") as f:
    preds_combined = pickle.load(f)

with open("data/predictions.pickle", "rb") as f:
    preds = pickle.load(f)

### Functions for displaying the predictions

In [3]:
def get_movies_watched_for_user(user_id):
    movieIds = ratings[ratings["userId"] == user_id]["movieId"].values

    return movie_descs[movie_descs["movieId"].isin(movieIds)]

def get_movie_by_ids(movie_ids: list[int]):
    return movie_descs[movie_descs["movieId"].isin(movie_ids)]

### Displaying the predictions

In [4]:
for user_id in preds_weighted.keys():
    print("Results for user", user_id)
    predicted_movies_for_user = preds_combined[user_id]

    movies_watched = get_movies_watched_for_user(user_id)

    print("Movies watched by user:")
    display(movies_watched)

    print("Movies recommended to user:")
    display(get_movie_by_ids(predicted_movies_for_user[:20]))

Results for user 304
Movies watched by user:


Unnamed: 0,movieId,title,genres,imdbId,tmdbId,id,description
33,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,114814,629.0,114814,A sole survivor tells of the twisty events lea...
334,527,Schindler's List (1993),Drama|War,108052,424.0,108052,"In German-occupied Poland during World War II,..."
747,1200,Aliens (1986),Action|Adventure|Horror|Sci-Fi,90605,679.0,90605,Fifty-seven years after surviving an apocalypt...
3378,5010,Black Hawk Down (2001),Action|Drama|War,265086,855.0,265086,The story of 160 elite U.S. soldiers who dropp...
5616,8914,Primer (2004),Drama|Sci-Fi,390384,14337.0,390384,"Four friends/fledgling entrepreneurs, knowing ..."
7314,44191,V for Vendetta (2006),Action|Sci-Fi|Thriller|IMAX,434409,752.0,434409,"In a future British dystopian society, a shado..."
7584,48043,"Fountain, The (2006)",Drama|Fantasy|Romance,414993,1381.0,414993,"As a modern-day scientist, Tommy is struggling..."
8368,58559,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX,468569,155.0,468569,When the menace known as the Joker wreaks havo...


Movies recommended to user:


Unnamed: 0,movieId,title,genres,imdbId,tmdbId,id,description
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,114709,A cowboy doll is profoundly threatened and jea...
2,6,Heat (1995),Action|Crime|Thriller,113277,949.0,113277,A group of high-end professional thieves start...
305,480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,107290,329.0,107290,A pragmatic paleontologist touring an almost c...
454,733,"Rock, The (1996)",Action|Adventure|Thriller,117500,9802.0,117500,A mild-mannered chemist and an ex-con must lea...
480,780,Independence Day (a.k.a. ID4) (1996),Action|Adventure|Sci-Fi|Thriller,116629,602.0,116629,The aliens are coming and their goal is to inv...
709,1136,Monty Python and the Holy Grail (1975),Adventure|Comedy|Fantasy,71853,762.0,71853,King Arthur and his Knights of the Round Table...
744,1197,"Princess Bride, The (1987)",Action|Adventure|Comedy|Fantasy|Romance,93779,2493.0,93779,"While home sick in bed, a young boy's grandfat..."
745,1198,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure,82971,85.0,82971,"In 1936, archaeologist and adventurer Indiana ..."
794,1262,"Great Escape, The (1963)",Action|Adventure|Drama|War,57115,5925.0,57115,Allied prisoners of war plan for several hundr...
819,1291,Indiana Jones and the Last Crusade (1989),Action|Adventure,97576,89.0,97576,"In 1938, after his father Professor Henry Jone..."


Results for user 6741
Movies watched by user:


Unnamed: 0,movieId,title,genres,imdbId,tmdbId,id,description
33,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,114814,629.0,114814,A sole survivor tells of the twisty events lea...
709,1136,Monty Python and the Holy Grail (1975),Adventure|Comedy|Fantasy,71853,762.0,71853,King Arthur and his Knights of the Round Table...
745,1198,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure,82971,85.0,82971,"In 1936, archaeologist and adventurer Indiana ..."
760,1215,Army of Darkness (1993),Action|Adventure|Comedy|Fantasy|Horror,106308,766.0,106308,A sardonic hardware store clerk is accidentall...
802,1270,Back to the Future (1985),Adventure|Comedy|Sci-Fi,88763,105.0,88763,"Marty McFly, a 17-year-old high school student..."
819,1291,Indiana Jones and the Last Crusade (1989),Action|Adventure,97576,89.0,97576,"In 1938, after his father Professor Henry Jone..."
1011,1610,"Hunt for Red October, The (1990)",Action|Adventure|Thriller,99810,1669.0,99810,"In November 1984, the Soviet Union's best subm..."
1262,2028,Saving Private Ryan (1998),Action|Drama|War,120815,857.0,120815,"Following the Normandy Landings, a group of U...."
2037,3147,"Green Mile, The (1999)",Crime|Drama,120689,497.0,120689,The lives of guards on Death Row are affected ...
2670,4011,Snatch (2000),Comedy|Crime|Thriller,208092,107.0,208092,"Unscrupulous boxing promoters, violent bookmak..."


Movies recommended to user:


Unnamed: 0,movieId,title,genres,imdbId,tmdbId,id,description
20,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller,114746,63.0,114746,"In a future world devastated by disease, a con..."
31,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,114369,807.0,114369,"Two detectives, a rookie and a veteran, hunt a..."
181,293,Léon: The Professional (a.k.a. The Professiona...,Action|Crime|Drama|Thriller,110413,101.0,110413,12-year-old Mathilda is reluctantly taken in b...
215,353,"Crow, The (1994)",Action|Crime|Fantasy|Thriller,109506,9495.0,109506,A man brutally murdered comes back to life as ...
673,1079,"Fish Called Wanda, A (1988)",Comedy|Crime,95159,623.0,95159,"In London, four very different people team up ..."
681,1089,Reservoir Dogs (1992),Crime|Mystery|Thriller,105236,500.0,105236,When a simple jewelry heist goes horribly wron...
682,1090,Platoon (1986),Drama|War,91763,792.0,91763,"Chris Taylor, a neophyte recruit in Vietnam, f..."
747,1200,Aliens (1986),Action|Adventure|Horror|Sci-Fi,90605,679.0,90605,Fifty-seven years after surviving an apocalypt...
1836,2843,"Black Cat, White Cat (Crna macka, beli macor) ...",Comedy|Romance,118843,1075.0,118843,Matko and his son Zare live on the banks of th...
1946,2997,Being John Malkovich (1999),Comedy|Drama|Fantasy,120601,492.0,120601,A puppeteer discovers a portal that leads lite...


Results for user 147001
Movies watched by user:


Unnamed: 0,movieId,title,genres,imdbId,tmdbId,id,description
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0,113497,When two kids find and play a magical board ga...
32,48,Pocahontas (1995),Animation|Children|Drama|Musical|Romance,114148,10530.0,114148,An English soldier and the daughter of an Algo...
71,110,Braveheart (1995),Action|Drama|War,112573,197.0,112573,Scottish warrior William Wallace leads his cou...
106,168,First Knight (1995),Action|Drama|Romance,113071,6520.0,113071,"Lancelot falls in love with Guinevere, who is ..."
127,208,Waterworld (1995),Action|Adventure|Sci-Fi,114898,9804.0,114898,In a future where the polar ice-caps have melt...
156,253,Interview with the Vampire: The Vampire Chroni...,Drama|Horror,110148,628.0,110148,"A vampire tells his epic life story: love, bet..."
222,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,110357,8587.0,110357,Lion prince Simba and his father are targeted ...
403,653,Dragonheart (1996),Action|Adventure|Fantasy,116136,8840.0,116136,The last dragon and a disillusioned dragonslay...
639,1027,Robin Hood: Prince of Thieves (1991),Adventure|Drama,102798,8367.0,102798,Robin Hood decides to fight back as an outlaw ...
1377,2174,Beetlejuice (1988),Comedy|Fantasy,94721,4011.0,94721,The spirits of a deceased couple are harassed ...


Movies recommended to user:


Unnamed: 0,movieId,title,genres,imdbId,tmdbId,id,description
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,114709,A cowboy doll is profoundly threatened and jea...
15,23,Assassins (1995),Action|Crime|Thriller,112401,9691.0,112401,Professional hit-man Robert Rath wants to fulf...
16,24,Powder (1995),Drama|Sci-Fi,114168,12665.0,114168,An off the charts genius who is home schooled ...
92,151,Rob Roy (1995),Action|Drama|Romance|War,114287,11780.0,114287,"In 1713 Scotland, Rob Roy MacGregor is wronged..."
126,204,Under Siege 2: Dark Territory (1995),Action,114781,3512.0,114781,Casey Ryback hops on a Colorado to LA train to...
164,262,"Little Princess, A (1995)",Children|Drama,113670,19101.0,113670,A young girl is relegated to servitude at a bo...
166,266,Legends of the Fall (1994),Drama|Romance|War|Western,110322,4476.0,110322,"In the early 1900s, three brothers and their f..."
181,293,Léon: The Professional (a.k.a. The Professiona...,Action|Crime|Drama|Thriller,110413,101.0,110413,12-year-old Mathilda is reluctantly taken in b...
201,329,Star Trek: Generations (1994),Adventure|Drama|Sci-Fi,111280,193.0,111280,With the help of long presumed dead Captain Ki...
213,349,Clear and Present Danger (1994),Action|Crime|Drama|Thriller,109444,9331.0,109444,CIA Analyst Jack Ryan is drawn into an illegal...
