In [None]:
import networkx as nx 
import matplotlib.pyplot as plt 
import scipy as sp
import numpy as np
from matplotlib.lines import Line2D
import collections
import random
from networkx.algorithms.community.centrality import girvan_newman
from networkx.algorithms.community import kernighan_lin_bisection
import community as community_louvain
from networkx.algorithms.community import label_propagation_communities
from itertools import combinations
from collections import defaultdict
import pandas as pd
import ast

In [None]:
#1 Generate a user interaction network

# Tunable parameters for this cell (previously inline literals).
N_SAMPLE_USERS = 50   # the dataset is large, so only the first N distinct users are kept
RATING_SPAN = 5       # divisor for the absolute rating gap; presumably the width of the rating scale - TODO confirm
MIN_EDGE_WEIGHT = 1   # minimum accumulated similarity for two users to be linked

interactions = pd.read_csv("RAW_interactions.csv")
interactions = interactions[['user_id', 'recipe_id', 'rating']]

# First N distinct users in file order.
selected_users = interactions['user_id'].drop_duplicates().head(N_SAMPLE_USERS).tolist()

filtered_interactions = interactions[interactions['user_id'].isin(selected_users)]

# recipe_id -> [(user_id, rating), ...] in file order.
# The three columns are walked in lockstep instead of DataFrame.iterrows(),
# which constructs a Series for every row.
recipe_user_map = defaultdict(list)
for uid, rid, score in zip(
    filtered_interactions['user_id'],
    filtered_interactions['recipe_id'],
    filtered_interactions['rating'],
):
    recipe_user_map[rid].append((uid, score))

G = nx.Graph()
G.add_nodes_from(selected_users)

# Create edges based on shared recipes: every pair of users that rated the same
# recipe contributes 1 - |rating gap| / RATING_SPAN to the weight of their edge.
edge_weights = defaultdict(int)
for users_ratings in recipe_user_map.values():
    for (user1, rating1), (user2, rating2) in combinations(users_ratings, 2):
        if user1 == user2:
            # the same user appears twice for this recipe; no self-loops
            continue
        edge = tuple(sorted((user1, user2)))  # canonical key for the undirected pair
        similarity = 1 - abs(rating1 - rating2) / RATING_SPAN
        edge_weights[edge] += similarity

# Only pairs whose accumulated similarity reaches the threshold become edges.
for (user1, user2), weight in edge_weights.items():
    if weight >= MIN_EDGE_WEIGHT:
        G.add_edge(user1, user2, weight=weight)

# Draw the graph
plt.figure(figsize=(14, 12))
pos = nx.shell_layout(G)
nx.draw_networkx_nodes(G, pos, node_size=300)
nx.draw_networkx_edges(G, pos, edge_color='gray')
nx.draw_networkx_labels(G, pos, font_size=9)
plt.title("User Interaction Network (First 50 Users)", fontsize=14)
plt.axis('off')
plt.show()



In [None]:
#2 Visualize and plot the degree distribution
# Three centrality measures for every node of G (each is a dict keyed by node).
degree_centrality = nx.degree_centrality(G)
closeness_centrality = nx.closeness_centrality(G)
betweenness_centrality = nx.betweenness_centrality(G)

degree_vals = [*degree_centrality.values()]
closeness_vals = [*closeness_centrality.values()]
betweenness_vals = [*betweenness_centrality.values()]

# One histogram per measure, side by side.
fig, ax = plt.subplots(1, 3, figsize=(18, 5))

# (values, bar colour, panel title, x-axis label) for each panel, left to right.
centrality_panels = (
    (degree_vals, 'skyblue', 'Degree Centrality Distribution', 'Degree Centrality'),
    (closeness_vals, 'lightgreen', 'Closeness Centrality Distribution', 'Closeness Centrality'),
    (betweenness_vals, 'salmon', 'Betweenness Centrality Distribution', 'Betweenness Centrality'),
)
for panel, (values, colour, title, xlabel) in zip(ax, centrality_panels):
    panel.hist(values, bins=10, color=colour, edgecolor='black')
    panel.set_title(title)
    panel.set_xlabel(xlabel)
    panel.set_ylabel('Frequency')

plt.tight_layout()
plt.show()


In [None]:
#3 Detect communities within the user interaction network

# Louvain Community Detection
# Louvain is a randomized heuristic; a fixed random_state makes the partition
# (and every statistic derived from it below) reproducible across kernel restarts.
louvain_partition = community_louvain.best_partition(G, random_state=42)
louvain_colors = [louvain_partition[node] for node in G.nodes()]

pos = nx.shell_layout(G)
plt.figure(figsize=(10, 8))
nx.draw(G, pos, node_color=louvain_colors, with_labels=True, node_size=300, cmap=plt.cm.Set3)
plt.title("Louvain Community Detection")
plt.axis("off")
plt.show()

# Girvan-Newman Community Detection
# Only the first item yielded by the generator (the first split) is used.
gn_generator = girvan_newman(G)
top_level_communities = next(gn_generator)
gn_communities = [list(c) for c in top_level_communities]

# node -> index of the Girvan-Newman community it belongs to (used as colour value)
gn_node_color_map = {}
for i, community in enumerate(gn_communities):
    for node in community:
        gn_node_color_map[node] = i
gn_colors = [gn_node_color_map[node] for node in G.nodes()]

plt.figure(figsize=(10, 8))
nx.draw(G, pos, node_color=gn_colors, with_labels=True, node_size=300, cmap=plt.cm.Set2)
plt.title("Girvan-Newman Community Detection")
plt.axis("off")
plt.show()

# Community Statistics

def _community_row(graph, label, nodes):
    """Build one summary record for the sub-network of `graph` induced by `nodes`.

    Args:
        graph: networkx graph the community belongs to.
        label: value stored under the 'Community' key.
        nodes: iterable of node ids forming the community.

    Returns:
        dict with keys 'Community', 'Nodes', 'Edges', 'Diameter',
        'Avg Path Length' and 'Avg Degree'. Diameter and average path length
        are NaN unless the community has more than one node and is connected.
    """
    subgraph = graph.subgraph(nodes)
    num_nodes = subgraph.number_of_nodes()
    num_edges = subgraph.number_of_edges()
    avg_degree = np.mean([d for _, d in subgraph.degree()]) if num_nodes > 0 else 0

    # Size is checked first: nx.is_connected raises on a graph with no nodes,
    # and connectivity only needs to be evaluated once for both path metrics.
    if num_nodes > 1 and nx.is_connected(subgraph):
        diameter = nx.diameter(subgraph)
        avg_path = nx.average_shortest_path_length(subgraph)
    else:
        diameter = float('nan')
        avg_path = float('nan')

    return {
        'Community': label,
        'Nodes': num_nodes,
        'Edges': num_edges,
        'Diameter': diameter,
        'Avg Path Length': avg_path,
        'Avg Degree': avg_degree
    }


def compute_community_stats_lo(graph, partition):
    """Summarise communities given as a node -> community-id mapping (Louvain output).

    Args:
        graph: networkx graph that was partitioned.
        partition: dict mapping each node to its community id.

    Returns:
        pandas.DataFrame with one row per community id; the 'Community'
        column holds the id itself.
    """
    # Invert node -> community into community -> [nodes].
    reverse_partition = defaultdict(list)
    for node, comm_id in partition.items():
        reverse_partition[comm_id].append(node)

    stats = [
        _community_row(graph, comm_id, nodes)
        for comm_id, nodes in reverse_partition.items()
    ]
    return pd.DataFrame(stats)


def compute_community_stats_gn(graph, communities):
    """Summarise communities given as a sequence of node collections (Girvan-Newman output).

    Args:
        graph: networkx graph that was partitioned.
        communities: iterable of node collections, one per community.

    Returns:
        pandas.DataFrame with one row per community; the 'Community' column
        holds 'Community 1', 'Community 2', ... in input order.
    """
    stats = [
        _community_row(graph, f'Community {i+1}', nodes)
        for i, nodes in enumerate(communities)
    ]
    return pd.DataFrame(stats)

# Build both summary tables, then print each under its heading.
community_stats_lo = compute_community_stats_lo(G, louvain_partition)
community_stats_gn = compute_community_stats_gn(G, gn_communities)

for heading, table in (
    ("\nCommunity Summary (Louvain):", community_stats_lo),
    ("\nCommunity Summary (Girvan-Newman):", community_stats_gn),
):
    print(heading)
    print(table)

In [None]:
#4 Analyze the assortativity of the network

# Mean rating given by each user, computed over the whole `interactions` table.
per_user_mean = interactions.groupby('user_id')['rating'].mean()
user_avg_rating = per_user_mean.to_dict()

# Attach the mean as a node attribute and measure how strongly linked users
# resemble each other on it.
nx.set_node_attributes(G, user_avg_rating, "avg_rating")
assortativity = nx.numeric_assortativity_coefficient(G, "avg_rating")
print(f"Assortativity Coefficient (by avg rating): {assortativity:.4f}")

# Colour value per node, in G's node order; nodes without a mean fall back to 0.
rating_vals = []
for n in G.nodes():
    rating_vals.append(user_avg_rating.get(n, 0))
pos = nx.shell_layout(G)

plt.figure(figsize=(12, 10))
nodes = nx.draw_networkx_nodes(
    G,
    pos,
    node_color=rating_vals,
    cmap=plt.cm.plasma,
    node_size=300,
)
nx.draw_networkx_edges(G, pos, alpha=0.5)
nx.draw_networkx_labels(G, pos, font_size=8)
plt.colorbar(nodes, label="User Average Rating")
plt.title("User Interaction Network Colored by Average Rating")
plt.axis("off")
plt.show()


In [None]:
#5

# Core number of every node, also stored on the graph as the "core" attribute.
core_nums = nx.core_number(G)
nx.set_node_attributes(G, core_nums, "core")

# Draw the network with nodes coloured by core number.
plt.figure(figsize=(12, 10))
pos = nx.shell_layout(G)
node_colors = [core_nums[n] for n in G.nodes()]
nodes = nx.draw_networkx_nodes(
    G,
    pos,
    node_color=node_colors,
    cmap=plt.cm.viridis,
    node_size=300,
)
nx.draw_networkx_edges(G, pos, alpha=0.5)
nx.draw_networkx_labels(G, pos, font_size=8)
plt.colorbar(nodes, label="Core Number")
plt.title("k-Core Decomposition of User Interaction Network")
plt.axis("off")
plt.show()

# How many nodes have each core number, ordered by core number.
core_number_values = list(core_nums.values())
core_counts = pd.Series(core_number_values).value_counts().sort_index()
print("\nNumber of nodes per k-core:")
print(core_counts)

core_subgraphs = {}  # NOTE(review): never filled in the visible code
core_stats = []

# One summary row per distinct core number k, computed on nx.k_core(G, k).
for k in sorted(set(core_nums.values())):
    subG = nx.k_core(G, k)
    degrees = [deg for _, deg in subG.degree()]
    clustering_coeffs = list(nx.clustering(subG).values())

    core_stats.append({
        'k-core': k,
        'Nodes': subG.number_of_nodes(),
        'Edges': subG.number_of_edges(),
        'Density': nx.density(subG),
        # empty lists fall back to 0 instead of taking the mean of nothing
        'Avg Degree': np.mean(degrees) if degrees else 0,
        'Avg Clustering': np.mean(clustering_coeffs) if clustering_coeffs else 0
    })

core_stats_df = pd.DataFrame(core_stats)
print(core_stats_df)


In [None]:
#6

# Use the first 50 recipes as an example.
df = pd.read_csv("PP_recipes.csv", nrows=50)

# These columns are read as text and parsed into Python objects with ast.literal_eval.
for list_column in ('ingredient_tokens', 'steps_tokens', 'ingredient_ids'):
    df[list_column] = df[list_column].apply(ast.literal_eval)

# Length of each parsed column, per recipe.
for count_column, list_column in (
    ('n_ingredients', 'ingredient_tokens'),
    ('n_steps', 'steps_tokens'),
    ('n_ingredient_ids', 'ingredient_ids'),
):
    df[count_column] = df[list_column].apply(len)

fig, ax = plt.subplots(1, 3, figsize=(16, 5))

# (count column, bar colour, panel title) for each panel, left to right.
recipe_panels = (
    ('n_ingredients', 'skyblue', 'Number of Ingredient Tokens'),
    ('n_steps', 'lightgreen', 'Number of Step Tokens'),
    ('n_ingredient_ids', 'salmon', 'Number of Ingredient IDs'),
)
for panel, (count_column, colour, title) in zip(ax, recipe_panels):
    panel.hist(df[count_column], bins=10, color=colour, edgecolor='black')
    panel.set_title(title)
    panel.set_xlabel('Count')
    panel.set_ylabel('Recipes')

plt.tight_layout()
plt.show()

In [None]:
#7

recipes_df = pd.read_csv("PP_recipes.csv", usecols=["id", "ingredient_ids"])
interactions_df = pd.read_csv("RAW_interactions.csv", usecols=["user_id", "recipe_id", "rating"])

# Parse the ingredient lists, then join interactions to recipes (recipe_id == id).
recipes_df['ingredient_ids'] = recipes_df['ingredient_ids'].apply(ast.literal_eval)
merged_df = interactions_df.merge(recipes_df, left_on="recipe_id", right_on="id")

# One row per user: the recipes they rated, the ratings they gave, and the
# concatenation of the ingredient lists of those recipes.
user_data = (
    merged_df
    .groupby("user_id")
    .agg({
        'recipe_id': list,
        'rating': list,
        'ingredient_ids': lambda lists: [ing for one_list in lists for ing in one_list]
    })
    .reset_index()
    .rename(columns={
        'recipe_id': 'rated_recipes',
        'rating': 'rating_list',
        'ingredient_ids': 'ingredients'
    })
)

# Export before the derived count columns below are added.
user_data.to_csv("User_Data.csv", index=False)

user_data['num_rated'] = user_data['rated_recipes'].apply(len)
user_data['num_ingredients'] = user_data['ingredients'].apply(len)
user_data['avg_rating'] = user_data['rating_list'].apply(lambda x: sum(x) / len(x) if x else 0)

sampled_users = user_data.head(50)

# Plot distributions for first 50 users only
fig, ax = plt.subplots(1, 3, figsize=(16, 5))

# (column, bar colour, panel title, x-axis label) for each panel, left to right.
user_panels = (
    ('num_rated', 'skyblue', 'Rated Recipes (First 50 Users)', 'Recipes Rated'),
    ('num_ingredients', 'lightgreen', 'Total Ingredients (First 50 Users)', 'Total Ingredients'),
    ('avg_rating', 'salmon', 'Average Rating (First 50 Users)', 'Avg Rating'),
)
for panel, (column, colour, title, xlabel) in zip(ax, user_panels):
    panel.hist(sampled_users[column], bins=10, color=colour, edgecolor='black')
    panel.set_title(title)
    panel.set_xlabel(xlabel)
    panel.set_ylabel('Users')

plt.tight_layout()
plt.show()

In [None]:
#8

from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import MultiLabelBinarizer

recipes_df = pd.read_csv("PP_recipes.csv", usecols=["id", "ingredient_ids"])
recipes_df['ingredient_ids'] = recipes_df['ingredient_ids'].apply(ast.literal_eval)

# Limit to a sample (100 as an example)
recipes_sample = recipes_df.head(100)

# One boolean column per distinct ingredient id, one row per sampled recipe.
mlb = MultiLabelBinarizer()
ingredient_matrix = mlb.fit_transform(recipes_sample['ingredient_ids'])

ingredient_matrix = ingredient_matrix.astype(bool)
# pairwise_distances returns Jaccard distance; similarity is its complement.
jaccard_similarity = 1 - pairwise_distances(ingredient_matrix, metric="jaccard")
similarity_df = pd.DataFrame(jaccard_similarity, index=recipes_sample['id'], columns=recipes_sample['id'])

# Use the first recipe as an example.
# The query recipe is removed by label rather than by assuming it sorts first:
# another recipe with an identical ingredient set also has similarity 1.0, and
# the tie could otherwise leave the query itself in its own "top similar" list.
query_recipe_id = similarity_df.index[0]
top_similar = (
    similarity_df.iloc[0]
    .drop(labels=query_recipe_id)
    .sort_values(ascending=False)
    .head(5)
)
print("Top similar recipes to Recipe ID", query_recipe_id)
print(top_similar)

In [None]:
#9
# NOTE: this rebinds G and louvain_partition, which earlier cells used for the
# user network; re-running those cells after this one will see the recipe graph.
SIMILARITY_THRESHOLD = 0.2  # minimum Jaccard similarity for two recipes to be linked

G = nx.Graph()
recipe_ids = similarity_df.index.tolist()

G.add_nodes_from(recipe_ids)

# Add edges for similarity >= SIMILARITY_THRESHOLD.
# The matrix is pulled out once as a NumPy array so the double loop does plain
# array indexing instead of a pandas .iloc lookup per cell.
sim_values = similarity_df.to_numpy()
for i in range(len(recipe_ids)):
    for j in range(i + 1, len(recipe_ids)):  # upper triangle only: each pair once
        sim = sim_values[i, j]
        if sim >= SIMILARITY_THRESHOLD:
            G.add_edge(recipe_ids[i], recipe_ids[j], weight=sim)

# Louvain Community Detection
# Louvain is randomized; a fixed random_state keeps cluster ids stable across runs.
louvain_partition = community_louvain.best_partition(G, random_state=42)
louvain_communities = pd.Series(louvain_partition).value_counts()
print("Louvain Clustering")
print(f"Number of communities: {louvain_communities.count()}")
print("Sizes:", louvain_communities.to_dict())

# Girvan-Newman (only the first split yielded by the generator is used)
gn_communities_gen = girvan_newman(G)
gn_top_level = next(gn_communities_gen)
gn_communities_list = [list(c) for c in gn_top_level]
print("\nGirvan-Newman Clustering")
print(f"Number of communities: {len(gn_communities_list)}")
print("Sizes:", [len(c) for c in gn_communities_list])

In [None]:
#10
from collections import Counter
recipes_df = pd.read_csv("PP_recipes.csv", usecols=["id", "ingredient_ids", "calorie_level"])
recipes_df['ingredient_ids'] = recipes_df['ingredient_ids'].apply(ast.literal_eval)

# Mean rating per recipe, left-joined onto the recipe table.
ratings_df = pd.read_csv("RAW_interactions.csv", usecols=["user_id", "recipe_id", "rating"])
recipe_ratings = ratings_df.groupby('recipe_id')['rating'].mean().reset_index()
recipes_df = recipes_df.merge(recipe_ratings, left_on='id', right_on='recipe_id', how='left')
# NOTE(review): recipes with no rating are given 0 here, which enters the cluster averages below.
recipes_df['rating'] = recipes_df['rating'].fillna(0)

# Analyse the Louvain clusters: recipes absent from louvain_partition map to NaN.
recipes_df['cluster'] = recipes_df['id'].map(louvain_partition)
cluster_summary = []
for cluster_id, group in recipes_df.groupby("cluster"):
    # Count every ingredient occurrence across the cluster's recipes.
    ingredient_counts = Counter(
        ingredient
        for ingredient_list in group['ingredient_ids']
        for ingredient in ingredient_list
    )
    top_ingredients = ingredient_counts.most_common(5)

    calorie_levels = group['calorie_level']
    if calorie_levels.isna().all():
        top_cal_level = 'N/A'
    else:
        top_cal_level = calorie_levels.mode().iloc[0]

    cluster_summary.append({
        "Cluster": cluster_id,
        "Num Recipes": len(group),
        "Top Ingredients": [ingredient for ingredient, _ in top_ingredients],
        "Most Common Calorie Level": top_cal_level,
        "Average Rating": group['rating'].mean()
    })

cluster_df = pd.DataFrame(cluster_summary)
print(cluster_df)

# Bar chart of the mean rating per cluster.
plt.figure(figsize=(10, 5))
cluster_labels = cluster_df['Cluster'].astype(str)
plt.bar(cluster_labels, cluster_df['Average Rating'], color='lightblue')
plt.title('Average Rating per Cluster')
plt.xlabel('Cluster ID')
plt.ylabel('Average Rating')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Stacked bars: number of recipes per calorie level within each cluster.
calorie_dist = (
    recipes_df
    .groupby(['cluster', 'calorie_level'])
    .size()
    .unstack(fill_value=0)
)
calorie_dist.plot(kind='bar', stacked=True, figsize=(10, 6), colormap='viridis')
plt.title('Calorie Level Distribution by Cluster')
plt.xlabel('Cluster ID')
plt.ylabel('Number of Recipes')
plt.tight_layout()
plt.show()

In [None]:
#11
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import pairwise_distances, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from tqdm import tqdm

recipes_df = pd.read_csv("PP_recipes.csv", usecols=["id", "ingredient_ids"])
interactions_df = pd.read_csv("RAW_interactions.csv", usecols=["user_id", "recipe_id", "rating"])

recipes_df['ingredient_ids'] = recipes_df['ingredient_ids'].apply(ast.literal_eval)
# Keep only interactions whose recipe exists in the recipe table.
has_known_recipe = interactions_df['recipe_id'].isin(recipes_df['id'])
interactions_df = interactions_df[has_known_recipe]

# Jaccard similarity between the first 100 recipes, from their ingredient sets.
recipes_sample = recipes_df.head(100).copy()
mlb = MultiLabelBinarizer()
ingredient_matrix = mlb.fit_transform(recipes_sample['ingredient_ids']).astype(bool)

jaccard_similarity = 1 - pairwise_distances(ingredient_matrix, metric="jaccard")
similarity_df = pd.DataFrame(
    jaccard_similarity,
    index=recipes_sample['id'],
    columns=recipes_sample['id'],
)

# Interactions restricted to the sampled recipes, grouped per user.
merged_df = interactions_df.merge(recipes_sample, left_on="recipe_id", right_on="id")
user_groups = merged_df.groupby("user_id")

# Per-user 80/20 split; users with fewer than two interactions are left out.
train_rows, test_rows = [], []

for user, group in user_groups:
    if len(group) >= 2:
        train, test = train_test_split(group, test_size=0.2, random_state=42)
        train_rows.append(train)
        test_rows.append(test)

train_df = pd.concat(train_rows)
test_df = pd.concat(test_rows)

# evaluate 
def predict_rating(user_id, target_recipe_id):
    """Predict a rating as the similarity-weighted mean of the user's training ratings.

    Reads the module-level `train_df` and `similarity_df`.

    Args:
        user_id: user whose rows in `train_df` form the rating history.
        target_recipe_id: recipe to score; must be a row label of `similarity_df`.

    Returns:
        The weighted mean rating, or NaN when the user has no history, the
        target is not in `similarity_df`, none of the rated recipes is a column
        of `similarity_df`, or the similarities sum to zero.
    """
    history = train_df[train_df['user_id'] == user_id]
    if history.empty or target_recipe_id not in similarity_df.index:
        return np.nan

    # (similarity to the target, rating) for each rated recipe known to similarity_df
    weighted = [
        (similarity_df.at[target_recipe_id, rated_id], score)
        for rated_id, score in zip(history['recipe_id'], history['rating'])
        if rated_id in similarity_df.columns
    ]
    if not weighted:
        return np.nan

    # Most similar first; this only fixes the order in which the sums accumulate.
    weighted.sort(key=lambda pair: pair[0], reverse=True)
    numerator = sum(weight * score for weight, score in weighted)
    denominator = sum(weight for weight, _ in weighted)
    if denominator == 0:
        return np.nan
    return numerator / denominator

# Score every held-out interaction; rows with no usable prediction (NaN) are skipped.
true_ratings, predicted_ratings = [], []
for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    pred = predict_rating(row['user_id'], row['recipe_id'])
    if np.isnan(pred):
        continue
    true_ratings.append(row['rating'])
    predicted_ratings.append(pred)

mae = mean_absolute_error(true_ratings, predicted_ratings)
rmse = np.sqrt(mean_squared_error(true_ratings, predicted_ratings))

print("\nEvaluation Results:")
print(f"MAE  = {mae:.4f}")
print(f"RMSE = {rmse:.4f}")

# user_id -> up to five recommended recipe ids
recommendations = {}

for user_id, group in train_df.groupby("user_id"):
    # Recipes the user rated 4 or higher seed the recommendation.
    liked = group.loc[group["rating"] >= 4, "recipe_id"]
    if liked.empty:
        continue

    rated = set(group["recipe_id"])
    # Mean similarity of every sampled recipe to the liked ones...
    sim_scores = similarity_df.loc[liked].mean(axis=0)
    # ...excluding anything the user already rated in the training split.
    already_rated = sim_scores.index.isin(rated)
    sim_scores = sim_scores[~already_rated]
    ranked = sim_scores.sort_values(ascending=False)
    top_recs = ranked.head(5).index.tolist()
    recommendations[user_id] = top_recs

# Show recommendations for the first 10 users
print("Recommendations for the frist 10 users")
first_ten = list(recommendations.items())[:10]
for user, recs in first_ten:
    print(f"User {user} ->Recommended: {recs}")

In [None]:
#12
df = pd.read_csv("RAW_interactions.csv", usecols=["user_id", "recipe_id", "rating"])

# Filter for active users and recipes (use the first 100 of each as an example)
active_users = df['user_id'].value_counts()[lambda x: x >= 5].index[:100] # Keeps only users who rated at least 5 recipes
active_recipes = df['recipe_id'].value_counts()[lambda x: x >= 3].index[:100]# Keeps only recipes that have been rated at least 3 times

# .copy() gives an independent frame: the index columns added below would
# otherwise be assigned onto a filtered slice (SettingWithCopyWarning).
df = df[df['user_id'].isin(active_users) & df['recipe_id'].isin(active_recipes)].copy()

# Dense 0..n-1 encodings for users and recipes, plus their inverses.
user_map = {uid: i for i, uid in enumerate(df["user_id"].unique())}
recipe_map = {rid: i for i, rid in enumerate(df["recipe_id"].unique())}
inv_user_map = {v: k for k, v in user_map.items()}
inv_recipe_map = {v: k for k, v in recipe_map.items()}

df["user_index"] = df["user_id"].map(user_map)
df["recipe_index"] = df["recipe_id"].map(recipe_map)

# Rows are user indices, columns are recipe indices, NaN where no rating exists.
rating_matrix = df.pivot(index="user_index", columns="recipe_index", values="rating")

# Per-user 80/20 split of the observed ratings; users with fewer than two are skipped.
train_rows, test_rows = [], []
for user in rating_matrix.index:
    user_ratings = rating_matrix.loc[user].dropna()
    if len(user_ratings) < 2:
        continue
    train_idx, test_idx = train_test_split(user_ratings.index, test_size=0.2, random_state=42)
    train_rows.append(pd.Series(user_ratings[train_idx], name=user))
    test_rows.append(pd.Series(user_ratings[test_idx], name=user))

# Transposed so that rows are recipe indices and columns are user indices.
train_matrix = pd.DataFrame(train_rows).T
test_matrix = pd.DataFrame(test_rows).T

# Pearson similarity function
def pearson_similarity(u, v):
    """Pearson correlation between two users over the items both rated in `train_matrix`.

    Args:
        u, v: column labels of the module-level `train_matrix`.

    Returns:
        The correlation over the co-rated items, or 0 when there are no
        co-rated items or either user's ratings on them have zero variance.
    """
    ratings_u = train_matrix[u].dropna()
    ratings_v = train_matrix[v].dropna()
    common = ratings_u.index.intersection(ratings_v.index)
    if len(common) < 1:
        return 0

    ur = ratings_u.loc[common]
    vr = ratings_v.loc[common]
    # Deviations from each user's mean over the co-rated items only.
    u_dev = ur - ur.mean()
    v_dev = vr - vr.mean()

    covariance = (u_dev * v_dev).sum()
    scale = np.sqrt((u_dev ** 2).sum()) * np.sqrt((v_dev ** 2).sum())
    if scale != 0:
        return covariance / scale
    return 0

# Predict rating
def predict_rating(user, item):
    """Predict `user`'s rating of `item` from the other users' training ratings.

    Each other user who rated `item` in the module-level `train_matrix`
    contributes their rating weighted by `pearson_similarity(user, other)`;
    the sum is normalised by the sum of absolute similarities.

    Args:
        user: column label of `train_matrix`.
        item: row label of `train_matrix`.

    Returns:
        The weighted rating, or NaN when `item` has no row in `train_matrix`
        or the absolute similarities sum to zero.
    """
    if item not in train_matrix.index:
        return np.nan

    # Fetch the item's row once and keep only the users who actually rated it.
    # This replaces a per-user .at lookup and a membership re-check inside the
    # loop that the guard above already settles.
    item_ratings = train_matrix.loc[item].dropna()

    numer, denom = 0, 0
    for other, rating in item_ratings.items():
        if other == user:
            continue
        sim = pearson_similarity(user, other)
        numer += sim * rating
        denom += abs(sim)
    return numer / denom if denom > 0 else np.nan


# Score every held-out (user, item) pair; pairs without a prediction (NaN) are skipped.
true_ratings, predicted_ratings = [], []

for user in test_matrix.columns:
    held_out_items = test_matrix[user].dropna().index
    for item in held_out_items:
        pred = predict_rating(user, item)
        if np.isnan(pred):
            continue
        actual = test_matrix.at[item, user]
        true_ratings.append(actual)
        predicted_ratings.append(pred)

mae = mean_absolute_error(true_ratings, predicted_ratings)
rmse = np.sqrt(mean_squared_error(true_ratings, predicted_ratings))

# The metrics are computed but their printing is currently disabled:
#print("\nCollaborative Filtering Evaluation:")
#print(f"MAE  = {mae:.4f}")
#print(f"RMSE = {rmse:.4f}")

# Generate top-10 recommendations
# user index -> list of up to ten recommended recipe ids
recommendations = {}

for user in train_matrix.columns:
    rated = train_matrix[user].dropna().index
    unrated = train_matrix.index.difference(rated)

    # pearson_similarity(user, other) does not depend on the item, so each
    # pair is computed at most once per user instead of once per candidate item.
    sim_to_user = {}

    rec_scores = {}
    for item in unrated:
        total, sim_sum = 0, 0
        for other in train_matrix.columns:
            if other == user or pd.isna(train_matrix.loc[item, other]):
                continue
            if other not in sim_to_user:
                sim_to_user[other] = pearson_similarity(user, other)
            sim = sim_to_user[other]
            total += sim * train_matrix.loc[item, other]
            sim_sum += abs(sim)
        if sim_sum > 0:
            rec_scores[item] = total / sim_sum

    if rec_scores:
        top_items = sorted(rec_scores.items(), key=lambda x: x[1], reverse=True)[:10]
        # translate recipe indices back to the original recipe ids
        recommendations[user] = [inv_recipe_map[i] for i, _ in top_items]

print("Recommendations for the frist 10 users")
for user_idx, recs in list(recommendations.items())[:10]:
    # inv_user_map is the inverse of user_map built earlier in this cell;
    # a direct lookup replaces scanning list(user_map.values()) for every user.
    real_user_id = inv_user_map[user_idx]
    print(f"User {real_user_id} -> Recommended Recipes: {recs}")
    

In [None]:
#13  Neural Collaborative Filtering (NCF)
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load and preprocess data (use the first 5000 interactions to increase speed)
df = pd.read_csv("RAW_interactions.csv", usecols=["user_id", "recipe_id", "rating"]).head(5000)
# .copy() gives an independent frame: the 'user' / 'item' columns added below
# would otherwise be assigned onto a filtered slice (SettingWithCopyWarning).
df = df[df['rating'].notna()].copy()

# Dense 0..n-1 encodings, used as embedding indices by the model.
user_map = {uid: i for i, uid in enumerate(df['user_id'].unique())}
item_map = {iid: i for i, iid in enumerate(df['recipe_id'].unique())}
df['user'] = df['user_id'].map(user_map)
df['item'] = df['recipe_id'].map(item_map)

# Random 80/20 split over interactions.
train, test = train_test_split(df, test_size=0.2, random_state=42)

# PyTorch Dataset
class RatingsDataset(Dataset):
    """Tensor-backed dataset of (user index, item index, rating) triples.

    Built from a DataFrame with 'user', 'item' and 'rating' columns; the
    first two are stored as int64 tensors and the rating as float32.
    """

    def __init__(self, df):
        user_column = df['user'].to_numpy()
        item_column = df['item'].to_numpy()
        rating_column = df['rating'].to_numpy()

        self.users = torch.tensor(user_column, dtype=torch.long)
        self.items = torch.tensor(item_column, dtype=torch.long)
        self.ratings = torch.tensor(rating_column, dtype=torch.float32)

    def __len__(self):
        # one sample per rating
        return self.ratings.shape[0]

    def __getitem__(self, idx):
        sample = (self.users[idx], self.items[idx], self.ratings[idx])
        return sample

# Wrap both splits; only the training loader shuffles.
BATCH_SIZE = 256
train_ds, test_ds = RatingsDataset(train), RatingsDataset(test)

train_loader = DataLoader(train_ds, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE)

# NCF model
class NCF(nn.Module):
    """Embedding + MLP rating predictor.

    A user embedding and an item embedding are concatenated, passed through
    one hidden layer of 64 units with ReLU, and projected to a single score.

    Args:
        num_users: number of rows in the user embedding table.
        num_items: number of rows in the item embedding table.
        emb_size: width of each embedding vector.
    """

    def __init__(self, num_users, num_items, emb_size=32):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.fc1 = nn.Linear(emb_size * 2, 64)
        self.fc2 = nn.Linear(64, 1)

    def forward(self, users, items):
        """Score each (user, item) pair.

        Args:
            users: 1-D long tensor of user indices.
            items: 1-D long tensor of item indices, same length as `users`.

        Returns:
            1-D float tensor with one score per pair.
        """
        u = self.user_emb(users)
        i = self.item_emb(items)
        x = torch.cat([u, i], dim=1)
        x = torch.relu(self.fc1(x))
        # Drop only the trailing size-1 feature axis. A bare .squeeze() would
        # also drop the batch axis when the batch holds a single pair, leaving
        # a 0-d tensor that cannot be iterated or matched against the targets.
        return self.fc2(x).squeeze(-1)

# Train model: Adam on mean-squared error, on GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = NCF(len(user_map), len(item_map)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.MSELoss()

for epoch in range(5):
    model.train()
    for batch in train_loader:
        users, items, ratings = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        loss = loss_fn(model(users, items), ratings)
        loss.backward()
        optimizer.step()
    # `loss` here is the last mini-batch of the epoch, not an epoch average.
    print(f"Epoch {epoch+1}: Train loss = {loss.item():.4f}")

# Evaluate on test set
model.eval()
all_preds, all_targets = [], []
with torch.no_grad():
    for users, items, ratings in test_loader:
        users, items = users.to(device), items.to(device)
        # reshape(-1) keeps the predictions 1-D even if a batch holds one sample,
        # so extend() below always receives an iterable.
        preds = model(users, items).reshape(-1).cpu().numpy()
        all_preds.extend(preds)
        all_targets.extend(ratings.numpy())

# RMSE as the square root of the MSE, matching the other evaluation cells.
# The `squared=False` keyword of mean_squared_error is not accepted by newer
# scikit-learn releases.
rmse = np.sqrt(mean_squared_error(all_targets, all_preds))
print(f"\nNCF RMSE: {rmse:.4f}")


model.eval()
top_n = defaultdict(list)

# Inverse lookups for THIS cell's encodings (user_map / item_map defined with
# the NCF data). The model's embedding tables are sized from these maps, so
# candidate items must come from item_map, not from the recipe encoding that
# the collaborative-filtering cell built over a different subset of the data.
ncf_idx_to_recipe = {idx: rid for rid, idx in item_map.items()}
ncf_idx_to_user = {idx: uid for uid, idx in user_map.items()}

item_indices = list(ncf_idx_to_recipe.keys())
# The candidate list is the same for every user, so build its tensor once.
item_tensor = torch.tensor(item_indices, dtype=torch.long).to(device)

for user in df['user'].unique():
    user_tensor = torch.tensor([user] * len(item_indices), dtype=torch.long).to(device)

    with torch.no_grad():
        # reshape(-1) keeps the scores 1-D even with a single candidate item
        scores = model(user_tensor, item_tensor).reshape(-1).cpu().numpy()

    # Items the user already interacted with (anywhere in df) are not recommended.
    rated_items = set(df[df['user'] == user]['item'])
    unrated_scores = [
        (item, score)
        for item, score in zip(item_indices, scores)
        if item not in rated_items
    ]

    top_items = sorted(unrated_scores, key=lambda x: x[1], reverse=True)[:5]
    top_n[user] = [ncf_idx_to_recipe[item] for item, _ in top_items]

# Show sample recommendations (the first 10 users)
print("Recommendations for the frist 10 users")
for user_id, recs in list(top_n.items())[:10]:
    real_user = ncf_idx_to_user[user_id]
    print(f"User {real_user} → Recommended Recipes: {recs}")

# Final Analysis

## Deep Learning-Based Recommenders 

One of the recent state-of-the-art models is **Neural Collaborative Filtering (NCF)**


###  Evaluation of NCF Results

In our implementation of Neural Collaborative Filtering on the first 5000 interactions:
- **Training loss** steadily decreased from **16.30 to 5.25** over 5 epochs.
- The model achieved a **test RMSE of 2.3437**, indicating a reasonable prediction error for the given data volume.

_**Note**: due to random weight initialization and shuffling during training, the values above may vary slightly across different runs_

This aligns with the original findings by He et al. (2017), which show that NCF effectively learns non-linear patterns in user-item interaction data. The declining loss demonstrates the model's convergence, and the RMSE suggests moderately accurate predictions under a limited data scenario.

Additionally, as noted in related literature, neural models benefit from:
- Longer training with more epochs
- Larger and denser datasets
- Enhanced user/item context through side features

These enhancements can further reduce error and improve recommendation quality.

**Reference**:  
He, X. et al. (2017). *Neural Collaborative Filtering*. WWW Conference.



---

##  Limitations


**Content-Based Filtering**: Limited by the available features; struggles to introduce diverse or unexpected content

**Collaborative Filtering**: Needs high user overlap, suffers from sparsity

**Pearson Similarity**: Assumes linearity, sensitive to outliers

**MAE / RMSE Evaluation**: Doesn't reflect ranking or user satisfaction

**Small Dataset Issues**: Low overlap hurts performance and recommendations




