In [38]:
import networkx as nx
import pandas as pd
import pickle
import numpy as np
from collections import defaultdict, Counter
from collections import Counter

In [39]:
# Loading the real movie graph from disk is currently disabled:
# with open('data/movie_graph.pickle', 'rb') as f:
#     G = pickle.load(f)

# Small hand-built stand-in graph, given as (node_a, node_b, weight) triples.
edges = [
    (1, 4, 10), (1, 2, 6), (1, 3, 5), (1, 5, 11),
    (2, 3, 15), (2, 4, 4),
    (3, 4, 8), (3, 5, 9),
    (4, 5, 7),
]

# Undirected graph; each edge stores its weight under the "weight" attribute.
G = nx.Graph()
G.add_edges_from((a, b, {"weight": w}) for a, b, w in edges)

In [44]:
def predict_movies(G: "nx.Graph", user_watched_movies: list[int], weighted: bool = True) -> list[int]:
    """
    Recommend unseen movies by scoring the graph neighbors of the watched movies.

    Every neighbor of a watched movie that the user has not watched accumulates
    a score: the sum of the connecting edge weights when ``weighted`` is True,
    otherwise the number of connecting edges.

    Args:
        G: Graph representing movie relationships. Edges may carry a "weight"
            attribute; an edge without one counts as weight 1.
        user_watched_movies: List of movie IDs watched by the user. IDs that are
            not nodes of ``G`` are skipped. An ID that appears more than once
            contributes once per occurrence.
        weighted: If True, rank candidates by summed edge weight; if False,
            rank them by the number of watched movies they are adjacent to.

    Returns:
        Recommended movie IDs ordered from highest to lowest score. Candidates
        with equal scores keep the order in which they were first encountered.
        The list is empty when there is nothing to recommend.
    """
    # Set membership keeps the "already watched" check O(1) per neighbor.
    watched = set(user_watched_movies)
    recommendations = defaultdict(float)

    for movie in user_watched_movies:
        # A watched movie may be missing from the graph; it has no neighbors to offer.
        if movie not in G.adj:
            continue
        for neighbor, edge_attrs in G.adj[movie].items():
            if neighbor in watched:
                continue
            recommendations[neighbor] += edge_attrs.get("weight", 1) if weighted else 1

    # sorted() is stable (also with reverse=True), so ties are deterministic,
    # and an empty score table simply yields an empty list.
    return sorted(recommendations, key=recommendations.get, reverse=True)

In [47]:
# Movies for which a description is available.
movie_descs = pd.read_csv("data/movies_with_description.csv")

# Keep only ratings of described movies, and of those only ratings >= 5.0.
ratings = (
    pd.read_csv("data/ml-32m/ratings.csv")
    .loc[lambda frame: frame["movieId"].isin(movie_descs["movieId"])]
    .loc[lambda frame: frame["rating"] >= 5.0]
)

In [None]:
# Users whose recommendations are inspected by hand.
users_to_analyze = [304, 6741, 147001]

# Recommendations per user: neighbor-count ranking vs. edge-weight ranking.
preds = {u: [] for u in users_to_analyze}
preds_weighted = {u: [] for u in users_to_analyze}


for user in users_to_analyze:
    movies_watched = ratings.loc[ratings["userId"] == user, "movieId"].tolist()

    # `preds` is the unweighted baseline; it previously also used weighted=True,
    # which made both dictionaries identical.
    preds[user] = predict_movies(G, movies_watched, weighted=False)
    preds_weighted[user] = predict_movies(G, movies_watched, weighted=True)

8
14
51


In [None]:
from sklearn.model_selection import train_test_split

# Evaluation settings.
N_USERS = 1000      # upper bound on the number of users sampled
MIN_RATINGS = 5     # a user needs at least this many ratings to be evaluated
K = 5               # number of top recommendations checked per user
TEST_SIZE = 0.2     # fraction of each user's movies held out
RANDOM_STATE = 42   # seeds both the user sample and the train/test split

# Sample up to N_USERS users that have at least MIN_RATINGS ratings.
# Capping n avoids a ValueError when fewer users qualify; the seed makes the
# sample (and therefore the reported number) reproducible.
rating_counts = ratings["userId"].value_counts()
eligible_users = rating_counts[rating_counts >= MIN_RATINGS]
users_to_analyze = eligible_users.sample(
    n=min(N_USERS, len(eligible_users)), random_state=RANDOM_STATE
).index

# Collect every sampled user's movies in one pass instead of filtering the
# whole ratings frame once per user. Row order within a user is preserved.
watched_by_user = (
    ratings[ratings["userId"].isin(users_to_analyze)]
    .groupby("userId")["movieId"]
    .agg(list)
)

# One boolean per evaluated user: did any of the top-K predictions land in
# that user's held-out movies?
accuracies = []

for user in users_to_analyze:
    movies_watched = watched_by_user.loc[user]

    if len(movies_watched) < MIN_RATINGS:
        continue

    train_movies, test_movies = train_test_split(
        movies_watched, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

    predicted_movies = predict_movies(G, train_movies, weighted=True)[:K]

    held_out = set(test_movies)
    correct_predictions = any(movie in held_out for movie in predicted_movies)

    accuracies.append(correct_predictions)

# Fraction of evaluated users with at least one hit; NaN when nobody was evaluated.
accuracy = sum(accuracies) / len(accuracies) if accuracies else float("nan")
print(f"Accuracy: {accuracy:.2f}")

Index([159728,  75092, 200188, 181434,  17846,  79939,  13636,  99096,  57110,
        41467,
       ...
       112098, 166589, 171170,  28029,  40618, 171528, 181771,  90671,  93161,
        44792],
      dtype='int64', name='userId', length=1000)