In [1]:
import pickle
import sys
import torch
from torch_geometric.data import Data
from torch_geometric.nn import LightGCN
from torch_geometric.utils import from_networkx

sys.path.append("../complexity_hunters/")  # to make utils importable
sys.path.append(".")  # to make utils importable
sys.path.append("..")  # to make utils importable

import utils.data_worker
import utils.consts

from graph.graph import build_graph
import igraph

from complexity_hunters.extra_metrics import sets_iou

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
graph = pickle.load(open("../data/graph.pkl", "rb"))

print("Converting graph to PyTorch Geometric format")
data = from_networkx(graph)
data.edge_index = data.edge_index.long()

Converting graph to PyTorch Geometric format


In [3]:
# Define LightGCN model
class RecommendationModel(torch.nn.Module):
    def __init__(self, num_users, num_questions, embedding_dim=64, num_layers=3):
        super().__init__()
        self.model = LightGCN(num_nodes=num_users + num_questions, num_layers=num_layers, embedding_dim=embedding_dim)
        self.user_embeddings = torch.nn.Embedding(num_users, embedding_dim)
        self.question_embeddings = torch.nn.Embedding(num_questions, embedding_dim)

    def forward(self, edge_index):
        return self.model(edge_index=edge_index)

In [4]:
user_nodes = [node for node in graph.nodes if graph.nodes[node]["type"] == "user"]
question_nodes = [node for node in graph.nodes if graph.nodes[node]["type"] == "question"]

user_mapping = {node: idx for idx, node in enumerate(user_nodes)}
question_mapping = {node: idx for idx, node in enumerate(question_nodes)}

In [5]:
edge_index = data.edge_index.clone()
for idx in range(edge_index.shape[1]):
    src, dst = edge_index[:, idx]
    if src in user_mapping and dst in question_mapping:
        edge_index[0, idx] = user_mapping[src]
        edge_index[1, idx] = question_mapping[dst]

num_users = len(user_nodes)
num_questions = len(question_nodes)
embedding_dim = 64

In [6]:
model = RecommendationModel(num_users, num_questions, embedding_dim)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.BCEWithLogitsLoss()

In [7]:
labels = torch.ones(edge_index.shape[1])

print("Training LightGCN")
for epoch in range(10):
    model.train()
    optimizer.zero_grad()
    outputs = model(edge_index)
    loss = criterion(outputs.squeeze(), labels)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

Training LightGCN
Epoch 1, Loss: 0.6929556727409363
Epoch 2, Loss: 0.690151035785675
Epoch 3, Loss: 0.684220016002655
Epoch 4, Loss: 0.6743250489234924
Epoch 5, Loss: 0.660207211971283
Epoch 6, Loss: 0.6417495012283325
Epoch 7, Loss: 0.6189597845077515
Epoch 8, Loss: 0.5919746160507202
Epoch 9, Loss: 0.5610700249671936
Epoch 10, Loss: 0.5266695618629456


In [8]:
posts = utils.data_worker.load_dataset(utils.consts.POSTS_DATA_PATH)

INFO: Loading dataset ../data/Posts.xml...


In [9]:

print("Making recommendations")
brand_new_question = posts[posts.PostTypeId == 1].sample()
brand_new_question_id = brand_new_question["Id"].values[0]
brand_new_question_idx = question_mapping["q" + str(brand_new_question_id)]

with torch.no_grad():
    model.eval()
    question_embedding = model.question_embeddings(torch.tensor([brand_new_question_idx]))
    user_embeddings = model.user_embeddings.weight
    scores = torch.matmul(user_embeddings, question_embedding.T).squeeze()

top_k = 5
recommended_users = scores.topk(top_k).indices
recommended_user_ids = [user_nodes[idx] for idx in recommended_users]
print(f"Recommended users for question {brand_new_question_id}: {recommended_user_ids}")


Making recommendations
Recommended users for question 14497: ['u6720', 'u6109', 'u1922', 'u6339', 'u7117']


In [10]:
badges = utils.data_worker.load_dataset(utils.consts.BADGES_DATA_PATH, debug_slice=False)
badges = badges[badges.UserId.isin(posts.OwnerUserId.unique())]

INFO: Loading dataset ../data/Badges.xml...


In [12]:
print(
    sum([sets_iou(
        set(badges[badges.UserId == int(user[1:])].Name),
        set(badges[badges.UserId == int(brand_new_question.OwnerUserId)].Name)
    ) for user in recommended_user_ids]) / len(recommended_user_ids)
)

0.62948051948051946


  set(badges[badges.UserId == int(brand_new_question.OwnerUserId)].Name)
