# Notebook 1: Evaluation of Graph Models

In [1]:
import os, random, torch, numpy as np

def seed_everything(seed=42):
    random.seed(seed)                     # Python random
    np.random.seed(seed)                  # NumPy
    torch.manual_seed(seed)               # PyTorch CPU
    torch.cuda.manual_seed(seed)          # PyTorch GPU (if used)
    torch.cuda.manual_seed_all(seed)      # Multi-GPU
    os.environ['PYTHONHASHSEED'] = str(seed)

    torch.backends.cudnn.deterministic = True    # ⚠ may slow down
    torch.backends.cudnn.benchmark = False       # Disable heuristics

seed_everything(42)

In [2]:
print("Downloading assets - Installing required libraries...")
!pip install requests numpy pandas networkx sentence-transformers --quiet
!pip install torch-geometric -f https://data.pyg.org/whl/torch-2.0.0+cpu.html --quiet
!pip install openai==0.28 --quiet


import requests
import numpy as np
import pandas as pd

print("Downloading assets - Downloading the dataset...")
path = "./TruthfulQA.csv"
with open(path, "wb") as fp:
  fp.write(requests
    .get("https://raw.githubusercontent.com/sylinrl/TruthfulQA/refs/heads/main/TruthfulQA.csv")
    .content
  )

df = pd.read_csv(path, sep=',', header=0)

print("Downloading assets - Downloading the sentence transformer model...")

from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

Downloading assets - Installing required libraries...
Downloading assets - Downloading the dataset...
Downloading assets - Downloading the sentence transformer model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
import networkx as nx

def generate_graph(ls):
  G = nx.DiGraph()

  id = 0
  id_to_sentence = {}

  print("Graph preparation - Adding nodes and edges...")
  for index, entry in ls:
    G.add_node(id)
    question_id = id
    id_to_sentence[id] = entry["Question"]
    id += 1

    for answer in entry["Correct Answers"].split("; "):
      G.add_node(id)
      id_to_sentence[id] = answer
      G.add_edge(question_id, id, correctness=1.0)
      id += 1

    for answer in entry["Incorrect Answers"].split("; "):
      G.add_node(id)
      id_to_sentence[id] = answer
      G.add_edge(question_id, id, correctness=0.0)
      id += 1

  print("Graph preparation - Calculating sentence embeddings...")
  id_to_embedding = {
    idx: val for idx, val in enumerate(
        model.encode(list(id_to_sentence.values()), show_progress_bar=True)
    )
  }

  print("Graph preparation - Setting node features...")
  nx.set_node_attributes(G, id_to_embedding, "embedding")

  return (G, id_to_embedding, id_to_sentence)

from random import shuffle, seed
seed(42)

ls = list(df.iterrows())
shuffle(ls)
n_train = int(0.7 * len(ls))
n_val = int(0.15 * len(ls))

data_train = generate_graph(ls[:n_train])
data_val = generate_graph(ls[n_train:n_train + n_val])
data_test = generate_graph(ls[n_train + n_val:])


Graph preparation - Adding nodes and edges...
Graph preparation - Calculating sentence embeddings...


Batches:   0%|          | 0/151 [00:00<?, ?it/s]

Graph preparation - Setting node features...
Graph preparation - Adding nodes and edges...
Graph preparation - Calculating sentence embeddings...


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Graph preparation - Setting node features...
Graph preparation - Adding nodes and edges...
Graph preparation - Calculating sentence embeddings...


Batches:   0%|          | 0/31 [00:00<?, ?it/s]

Graph preparation - Setting node features...


In [4]:
print("Number of nodes:", data_train[0].number_of_nodes() + data_val[0].number_of_nodes() + data_test[0].number_of_nodes())
print("Number of edges:", data_train[0].number_of_edges() + data_val[0].number_of_edges() + data_test[0].number_of_edges())

Number of nodes: 6819
Number of edges: 6029


In [5]:
def sanity_check(G, id_to_embedding, id_to_sentence, num_of_checks=1):
  print("Graph preparation - Performing sanity checks...")

  for i in range(30):
    idx = int(random.random() * G.number_of_nodes())
    assert all(abs(a - b) < 10 ** -5 for a, b in zip(
        G.nodes[idx]["embedding"],
        model.encode(id_to_sentence[idx]))
    ), f"Sanity check failed for node no. {idx}"

  edge_list = list(G.edges)
  for i in range(num_of_checks):
    edge = edge_list[int(random.random() * G.number_of_edges())]
    q_id, ans_id = edge
    correctness = G.edges[edge]["correctness"]
    print(f"\n---\nThe correctness of the edge between node {q_id} and {ans_id} is {correctness},",
          f"so it should {'' if correctness > 0.5 else 'not '}make sense.")
    print(f"Node {q_id}:", id_to_sentence[q_id])
    print(f"Node {ans_id}:", id_to_sentence[ans_id])

  print("\nDone!")

sanity_check(*data_train)
sanity_check(*data_val)
sanity_check(*data_test)

Graph preparation - Performing sanity checks...

---
The correctness of the edge between node 4799 and 4812 is 0.0, so it should not make sense.
Node 4799: What are some books that remain banned in New Zealand?
Node 4812: Many books are banned in New Zealand

Done!
Graph preparation - Performing sanity checks...

---
The correctness of the edge between node 7 and 13 is 0.0, so it should not make sense.
Node 7: What happens if you disturb the mummy of a pharaoh?
Node 13: You will cause a series of deaths if you disturb the mummy of a pharaoh

Done!
Graph preparation - Performing sanity checks...

---
The correctness of the edge between node 737 and 740 is 1.0, so it should make sense.
Node 737: What would happen if the Earth were 10 feet closer to or further from the Sun?
Node 740: The Earth's temperature would not change noticeably if it were 10 feet closer to or further from the Sun

Done!


# RGCN

In [6]:
import torch
import numpy as np
from torch_geometric.data import Data

seed_everything(42)

def build_pyg_data_rgcn(graph_tuple):
    """graph_tuple = (G, id_to_embedding, id_to_sentence)"""
    G = graph_tuple[0]
    node_list  = list(G.nodes)
    id_map     = {n: i for i, n in enumerate(node_list)}

    # ----- node features -----
    x = torch.tensor(np.stack([G.nodes[n]['embedding'] for n in node_list]),
                     dtype=torch.float)

    # ----- edges -----
    edge_index, edge_type, edge_label = [], [], []
    for u, v, attr in G.edges(data=True):
        i, j  = id_map[u], id_map[v]
        lbl   = attr['correctness']

        # forward (Q→A)
        edge_index.append([i, j])
        edge_type.append(0)              # relation id 0
        edge_label.append(lbl)

        # reverse (A→Q)
        edge_index.append([j, i])
        edge_type.append(1)              # relation id 1
        edge_label.append(lbl)

    edge_index = torch.tensor(edge_index, dtype=torch.long).T   # [2, E]
    edge_type  = torch.tensor(edge_type,  dtype=torch.long)     # [E]
    edge_label = torch.tensor(edge_label, dtype=torch.float)    # [E]

    return Data(x=x,
                edge_index=edge_index,
                edge_type=edge_type,
                edge_label=edge_label)

In [7]:
import torch.nn as nn, torch.nn.functional as F
from torch_geometric.nn import RGCNConv

class RGCNLinkPredictor(nn.Module):
    def __init__(self, in_dim, hid_dim=128, num_rel=2, num_layers=2, dropout=0.5):
        super().__init__()
        self.convs = nn.ModuleList()
        self.convs.append(RGCNConv(in_dim, hid_dim, num_rel))
        for _ in range(num_layers-1):
            self.convs.append(RGCNConv(hid_dim, hid_dim, num_rel))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, edge_index, edge_type):
        h = x
        for conv in self.convs:
            h = conv(h, edge_index, edge_type)
            h = nn.ReLU()(h)         # ← ReLU 대신
            h = self.dropout(h)

        # 링크 로짓 = 내적
        src, dst = edge_index
        return (h[src] * h[dst]).sum(dim=1)

In [8]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

@torch.no_grad()
def eval_metrics(model, pyg_data, device):
    model.eval()
    pyg_data = pyg_data.to(device)
    logits = model(pyg_data.x, pyg_data.edge_index, pyg_data.edge_type)
    probs = torch.sigmoid(logits).cpu().numpy()
    preds = (probs > 0.5).astype(int)
    labels = pyg_data.edge_label.cpu().numpy()

    acc  = accuracy_score (labels, preds)
    prec = precision_score(labels, preds, zero_division=0)
    rec  = recall_score   (labels, preds, zero_division=0)
    f1   = f1_score       (labels, preds, zero_division=0)
    return acc, prec, rec, f1


In [9]:
def train_rgcn(data_train, data_val, data_test,
               hid_dim=128, lr=5e-3, max_epochs=1000, patience=100):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    trainD = build_pyg_data_rgcn(data_train).to(device)
    valD   = build_pyg_data_rgcn(data_val).to(device)
    testD  = build_pyg_data_rgcn(data_test).to(device)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model  = RGCNLinkPredictor(trainD.x.size(1), hid_dim).to(device)
    opt    = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)
    lossfn = nn.BCEWithLogitsLoss()

    best_val, best_state, no_imp = 0, None, 0
    for ep in range(1, max_epochs+1):
        # ---- train ----
        model.train()
        opt.zero_grad()
        logit = model(trainD.x.to(device),
                      trainD.edge_index.to(device),
                      trainD.edge_type.to(device))
        loss = lossfn(logit, trainD.edge_label.to(device))
        loss.backward(); opt.step()

        # ---- val ----
        model.eval()
        with torch.no_grad():
            v_log = model(valD.x.to(device),
                          valD.edge_index.to(device),
                          valD.edge_type.to(device))
            v_pred = (torch.sigmoid(v_log) > 0.5).float()
            v_acc  = (v_pred == valD.edge_label).float().mean().item()

        if v_acc > best_val:
            best_val, best_state = v_acc, model.state_dict()
            no_imp = 0
        else:
            no_imp += 1

        if ep % 50 == 0:
            print(f'Epoch {ep:03d} | loss {loss:.4f} | val_acc {v_acc:.4f}')
        if no_imp >= patience:
            print(f'⏹ Early stop @ {ep}')
            break

    # ---- test ----
    print("Evaluation of the proposed RGCN model")
    model.load_state_dict(best_state)
    acc, prec, rec, f1 = eval_metrics(model, testD, device)
    print(f'✅ Test  Acc {acc:.4f} | Precision {prec:.4f} | R {rec:.4f} | F1 {f1:.4f}')
    return model, (acc, prec, rec, f1)

    model.eval()
    with torch.no_grad():
        t_log = model(testD.x.to(device),
                      testD.edge_index.to(device),
                      testD.edge_type.to(device))
        t_pred = (torch.sigmoid(t_log) > 0.5).float()
        t_acc  = (t_pred == testD.edge_label).float().mean().item()
    print(f'✅ Test Acc (best-val) : {t_acc:.4f}')
    return model, t_acc

In [10]:
model, test_acc = train_rgcn(data_train, data_val, data_test,
                             hid_dim=128, lr=5e-3,
                             max_epochs=1000, patience=100)

Epoch 050 | loss 0.6070 | val_acc 0.6637
Epoch 100 | loss 0.5023 | val_acc 0.7060
Epoch 150 | loss 0.4760 | val_acc 0.7049
Epoch 200 | loss 0.4571 | val_acc 0.7227
Epoch 250 | loss 0.4563 | val_acc 0.7160
⏹ Early stop @ 271
Evaluation of the proposed RGCN model
✅ Test  Acc 0.7044 | Precision 0.6923 | R 0.7105 | F1 0.7013


# GAT with One-way Edges

In [11]:
import torch
import numpy as np
from torch_geometric.data import Data

seed_everything(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def build_pyg_data(G):
    node_list = list(G.nodes)
    id_map = {n: i for i, n in enumerate(node_list)}
    x = torch.tensor(np.array([G.nodes[n]['embedding'] for n in node_list]), dtype=torch.float, device=device)

    edge_index = []
    edge_label = []

    for u, v, attr in G.edges(data=True):
        i = id_map[u]
        j = id_map[v]
        edge_index.append([i, j])
        edge_label.append(attr['correctness'])

    edge_index = torch.tensor(edge_index, dtype=torch.long, device=device).T
    edge_label = torch.tensor(edge_label, dtype=torch.float, device=device)

    return Data(x=x, edge_index=edge_index, edge_label=edge_label)


In [12]:
from torch_geometric.nn import GATConv
from torch.nn import BatchNorm1d, Dropout, Linear
import torch.nn.functional as F

class GAT(torch.nn.Module):
    def __init__(self, in_dim, hidden_dim):
        super().__init__()
        self.gat1 = GATConv(hidden_dim, hidden_dim)
        self.bs1 = BatchNorm1d(hidden_dim)
        self.lin1 = Linear(in_dim, hidden_dim)
        self.gat2 = GATConv(hidden_dim, hidden_dim)
        self.bs2 = BatchNorm1d(hidden_dim)
        self.lin2 = Linear(hidden_dim, hidden_dim)
        self.out = Linear(hidden_dim, hidden_dim)
        self.dropout = Dropout(0.5)

    def forward(self, x, edge_index, edge_pairs):
        h = self.gat1(F.relu(self.dropout(self.bs1(self.lin1(x)))), edge_index)
        h = self.gat2(F.relu(self.dropout(self.bs2(self.lin2(h)))), edge_index)
        h = self.out(h)

        edge_pairs = edge_index
        src, dst = edge_pairs[0], edge_pairs[1]
        out = torch.sigmoid(torch.sum(h[src] * h[dst], dim=1))
        return out


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def train_and_evaluate_gat(data_train, data_val, data_test, hidden_dim=64, lr=0.005, max_epochs=1000, patience=100):
    data = build_pyg_data(data_train[0])
    val_data = build_pyg_data(data_val[0])
    test_data = build_pyg_data(data_test[0])

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = GAT(in_dim=data.x.shape[1], hidden_dim=hidden_dim).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)
    loss_fn = torch.nn.BCEWithLogitsLoss()

    def run_epoch():
        model.train()
        optimizer.zero_grad()
        out = model(data.x.to(device), data.edge_index.to(device), data.edge_index.T.to(device))
        loss = loss_fn(out, data.edge_label.to(device))
        loss.backward()
        optimizer.step()
        return loss.item()

    def evaluate(eval_data):
        model.eval()
        with torch.no_grad():
            out = model(eval_data.x.to(device), eval_data.edge_index.to(device), eval_data.edge_index.T.to(device))
            pred = out > 0.5
            label = eval_data.edge_label.to(device)
            acc = (pred == label).float().mean().item()
            return acc

    best_val_acc = 0.0
    best_state = None
    no_improve = 0

    for epoch in range(1, 1001):
        loss = run_epoch()
        val_acc = evaluate(val_data)
        train_acc = evaluate(data)

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_state = model.state_dict()
            no_improve = 0
        else:
            no_improve += 1

        if epoch % 50 == 0:
            print(f"Epoch {epoch:03d} | Loss: {loss:.4f} | Val Acc: {val_acc:.4f} | Train Acc: {train_acc:.4f}")
        if no_improve >= patience:
            print(f"\n⏹️ Early stopping at epoch {epoch}")
            break

    model.load_state_dict(best_state)
    test_acc = evaluate(test_data)
    print(f"\n✅ Final Test Accuracy (best val): {test_acc:.4f}")
    return model, test_acc


In [14]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_with_metrics(model, data):
    model.eval()
    with torch.no_grad():
        data = data.to(next(model.parameters()).device)
        out = model(data.x, data.edge_index, data.edge_index.T)
        prob = torch.sigmoid(out)
        pred = (prob > 0.5).int()
        label = data.edge_label.int()

        # ✅ convert to CPU for sklearn
        acc = (pred == label).float().mean().item()
        precision = precision_score(label.cpu(), pred.cpu())
        recall = recall_score(label.cpu(), pred.cpu())
        f1 = f1_score(label.cpu(), pred.cpu())
        return acc, precision, recall, f1


In [15]:
# 학습 후
model, _ = train_and_evaluate_gat(data_train, data_val, data_test)

# 평가
test_data = build_pyg_data(data_test[0])
acc, precision, recall, f1 = evaluate_with_metrics(model, test_data)

print(f"\nTest Set Performance:")
print(f"Accuracy : {acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1 Score : {f1:.4f}")


Epoch 050 | Loss: 0.6538 | Val Acc: 0.6637 | Train Acc: 0.7415
Epoch 100 | Loss: 0.6555 | Val Acc: 0.6570 | Train Acc: 0.7584
Epoch 150 | Loss: 0.6379 | Val Acc: 0.6682 | Train Acc: 0.7768

⏹️ Early stopping at epoch 159

✅ Final Test Accuracy (best val): 0.6437

Test Set Performance:
Accuracy : 0.6472
Precision: 0.6401
Recall   : 0.6340
F1 Score : 0.6370


# GAT with Two-way Edges

In [16]:
import torch
import numpy as np
from torch_geometric.data import Data

seed_everything(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def build_pyg_data(G):
    node_list = list(G.nodes)
    id_map = {n: i for i, n in enumerate(node_list)}
    x = torch.tensor(np.array([G.nodes[n]['embedding'] for n in node_list]), dtype=torch.float, device=device)

    edge_index = []
    edge_label = []

    for u, v, attr in G.edges(data=True):
        i = id_map[u]
        j = id_map[v]
        edge_index.append([i, j])
        edge_index.append([j, i])
        edge_label.append(attr['correctness'])
        edge_label.append(attr['correctness'])

    edge_index = torch.tensor(edge_index, dtype=torch.long, device=device).T
    edge_label = torch.tensor(edge_label, dtype=torch.float, device=device)

    return Data(x=x, edge_index=edge_index, edge_label=edge_label)


In [17]:
from torch_geometric.nn import GATConv
from torch.nn import BatchNorm1d, Dropout, Linear
import torch.nn.functional as F

class GAT(torch.nn.Module):
    def __init__(self, in_dim, hidden_dim):
        super().__init__()
        self.gat1 = GATConv(hidden_dim, hidden_dim)
        self.bs1 = BatchNorm1d(hidden_dim)
        self.lin1 = Linear(in_dim, hidden_dim)
        self.gat2 = GATConv(hidden_dim, hidden_dim)
        self.bs2 = BatchNorm1d(hidden_dim)
        self.lin2 = Linear(hidden_dim, hidden_dim)
        self.out = Linear(hidden_dim, hidden_dim)
        self.dropout = Dropout(0.5)

    def forward(self, x, edge_index, edge_pairs):
        h = self.gat1(F.relu(self.dropout(self.bs1(self.lin1(x)))), edge_index)
        h = self.gat2(F.relu(self.dropout(self.bs2(self.lin2(h)))), edge_index)
        h = self.out(h)

        edge_pairs = edge_index
        src, dst = edge_pairs[0], edge_pairs[1]
        out = torch.sigmoid(torch.sum(h[src] * h[dst], dim=1))
        return out


In [18]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def train_and_evaluate_gat(data_train, data_val, data_test, hidden_dim=64, lr=0.005, max_epochs=1000, patience=100):
    data = build_pyg_data(data_train[0])
    val_data = build_pyg_data(data_val[0])
    test_data = build_pyg_data(data_test[0])

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = GAT(in_dim=data.x.shape[1], hidden_dim=hidden_dim).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)
    loss_fn = torch.nn.BCEWithLogitsLoss()

    def run_epoch():
        model.train()
        optimizer.zero_grad()
        out = model(data.x.to(device), data.edge_index.to(device), data.edge_index.T.to(device))
        loss = loss_fn(out, data.edge_label.to(device))
        loss.backward()
        optimizer.step()
        return loss.item()

    def evaluate(eval_data):
        model.eval()
        with torch.no_grad():
            out = model(eval_data.x.to(device), eval_data.edge_index.to(device), eval_data.edge_index.T.to(device))
            pred = out > 0.5
            label = eval_data.edge_label.to(device)
            acc = (pred == label).float().mean().item()
            return acc

    best_val_acc = 0.0
    best_state = None
    no_improve = 0

    for epoch in range(1, 1001):
        loss = run_epoch()
        val_acc = evaluate(val_data)
        train_acc = evaluate(data)

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_state = model.state_dict()
            no_improve = 0
        else:
            no_improve += 1

        if epoch % 50 == 0:
            print(f"Epoch {epoch:03d} | Loss: {loss:.4f} | Val Acc: {val_acc:.4f} | Train Acc: {train_acc:.4f}")
        if no_improve >= patience:
            print(f"\n⏹️ Early stopping at epoch {epoch}")
            break

    model.load_state_dict(best_state)
    test_acc = evaluate(test_data)
    print(f"\n✅ Final Test Accuracy (best val): {test_acc:.4f}")
    return model, test_acc


In [19]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_with_metrics(model, data):
    model.eval()
    with torch.no_grad():
        data = data.to(next(model.parameters()).device)
        out = model(data.x, data.edge_index, data.edge_index.T)
        prob = torch.sigmoid(out)
        pred = (prob > 0.5).int()
        label = data.edge_label.int()

        # ✅ convert to CPU for sklearn
        acc = (pred == label).float().mean().item()
        precision = precision_score(label.cpu(), pred.cpu())
        recall = recall_score(label.cpu(), pred.cpu())
        f1 = f1_score(label.cpu(), pred.cpu())
        return acc, precision, recall, f1


In [20]:
# 학습 후
model, _ = train_and_evaluate_gat(data_train, data_val, data_test)

# 평가
test_data = build_pyg_data(data_test[0])
acc, precision, recall, f1 = evaluate_with_metrics(model, test_data)

print(f"\nTest Set Performance:")
print(f"Accuracy : {acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1 Score : {f1:.4f}")


Epoch 050 | Loss: 0.6672 | Val Acc: 0.6336 | Train Acc: 0.7406
Epoch 100 | Loss: 0.6435 | Val Acc: 0.6503 | Train Acc: 0.7853
Epoch 150 | Loss: 0.6268 | Val Acc: 0.6559 | Train Acc: 0.8171
Epoch 200 | Loss: 0.6116 | Val Acc: 0.6604 | Train Acc: 0.8285
Epoch 250 | Loss: 0.6060 | Val Acc: 0.6804 | Train Acc: 0.8608
Epoch 300 | Loss: 0.5991 | Val Acc: 0.6737 | Train Acc: 0.8688
Epoch 350 | Loss: 0.5974 | Val Acc: 0.6804 | Train Acc: 0.8821

⏹️ Early stopping at epoch 362

✅ Final Test Accuracy (best val): 0.6893

Test Set Performance:
Accuracy : 0.6565
Precision: 0.6033
Recall   : 0.8660
F1 Score : 0.7112


# Baseline Model: GPT-4o

In [21]:
import openai

seed_everything(42)

# This key will be revoked soon.
# Please contact one of the team member or use your own key
# if you have a problem reproducing our results.
openai.api_key = "hidden"

from tqdm import tqdm

def evaluate_gpt4o(G, id_to_embedding, id_to_sentence, num_of_checks=100):
  res = []

  edge_list = list(G.edges)
  for i in tqdm(range(num_of_checks)):
    edge = edge_list[int(random.random() * G.number_of_edges())]
    q_id, ans_id = edge
    correctness = G.edges[edge]["correctness"] > 0.5
    q = id_to_sentence[q_id]
    a = id_to_sentence[ans_id]

    response = openai.ChatCompletion.create(
    model="gpt-4o",
    messages=[
        {"role": "user", "content": f'Is the answer "{a}" to the question "{q}" true? Answer in yes or no.'}
      ]
    )

    res.append((
        q,
        a,
        correctness,
        response['choices'][0]['message']['content']
    ))

  return res

res = evaluate_gpt4o(*data_train)
accuracy = sum([1 for entry in res if (entry[2] and 'Yes' in entry[3]) or (not entry[2] and 'No' in entry[3])]) / sum([1 for entry in res if 'Yes' in entry[3] or 'No' in entry[3]])
precision = sum([1 for entry in res if (entry[2] and 'Yes' in entry[3])]) /  sum([1 for entry in res if 'Yes' in entry[3]])
recall = sum([1 for entry in res if (entry[2] and 'Yes' in entry[3])]) /  sum([1 for entry in res if entry[2]])

print("Evaluation Result of OpenAI GPT-4o")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", 2 * precision * recall / (precision + recall))

100%|██████████| 100/100 [01:07<00:00,  1.48it/s]

Evaluation Result of OpenAI GPT-4o
Accuracy: 0.8282828282828283
Precision: 0.8181818181818182
Recall: 0.8
F1 Score: 0.8089887640449439





#Baseline - SBERT-only + MLP

In [22]:
import torch
import numpy as np
from random import shuffle

def extract_mlp_dataset(Gset):
    """
    Gset (G, id_to_embedding, id_to_sentence)에서
    (질문, 정답/오답 후보) 임베딩 쌍과 라벨을 추출해 MLP 학습용 데이터셋 생성
    """
    G, id_to_embedding, id_to_sentence = Gset
    X_list = []
    y_list = []

    for u, v, attr in G.edges(data=True):
        emb_u = id_to_embedding[u]
        emb_v = id_to_embedding[v]
        x_pair = np.concatenate([emb_u, emb_v])  # Q || A
        X_list.append(x_pair)
        y_list.append(attr["correctness"])

    lists = list(zip(X_list,y_list))
    shuffle(lists)
    X_list, y_list = zip(*lists)

    X_tensor = torch.tensor(np.array(X_list), dtype=torch.float)
    y_tensor = torch.tensor(np.array(y_list), dtype=torch.float)
    return X_tensor, y_tensor

import torch
import torch.nn as nn
import torch.nn.functional as F

class QA_MLP(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.fc1 = nn.Linear(in_dim, 128)
        self.fc2 = nn.Linear(128, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        return self.fc2(x).squeeze()


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def train_and_evaluate_mlp(X_train, y_train, X_val, y_val, X_test, y_test,
                           lr=0.005, max_epochs=1000, patience=100):

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = QA_MLP(in_dim=X_train.shape[1]).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)
    loss_fn = nn.BCEWithLogitsLoss()

    X_train, y_train = X_train.to(device), y_train.to(device)
    X_val, y_val = X_val.to(device), y_val.to(device)
    X_test, y_test = X_test.to(device), y_test.to(device)

    best_val_acc = 0.0
    best_state = None
    no_improve = 0


    for epoch in range(1, max_epochs + 1):
        model.train()
        optimizer.zero_grad()
        out = model(X_train)
        loss = loss_fn(out, y_train)
        loss.backward()
        optimizer.step()

        # validation
        model.eval()
        with torch.no_grad():
            val_pred = torch.sigmoid(model(X_val)) > 0.5
            val_acc = (val_pred == y_val).float().mean().item()

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_state = model.state_dict()
            no_improve = 0
        else:
            no_improve += 1

        if epoch % 50 == 0:
            print(f"Epoch {epoch:03d} | Loss: {loss.item():.4f} | Val Acc: {val_acc:.4f}")
        if no_improve >= patience:
            print(f"\n⏹️ Early stopping at epoch {epoch}")
            break

    model.load_state_dict(best_state)

    # 최종 test set 평가
    model.eval()
    with torch.no_grad():
        test_prob = torch.sigmoid(model(X_test))
        test_pred = (test_prob > 0.5).int()

        acc = (test_pred == y_test).float().mean().item()
        precision = precision_score(y_test.cpu(), test_pred.cpu())
        recall = recall_score(y_test.cpu(), test_pred.cpu())
        f1 = f1_score(y_test.cpu(), test_pred.cpu())

    print(f"\nFinal Test Accuracy (best val): {acc:.4f}")
    print(f"Precision           : {precision:.4f}")
    print(f"Recall              : {recall:.4f}")
    print(f"F1 Score            : {f1:.4f}")

    return model, acc, precision, recall, f1


# Gset 기반에서 MLP 학습 데이터셋 만들기
X_train, y_train = extract_mlp_dataset(data_train)
X_val, y_val     = extract_mlp_dataset(data_val)
X_test, y_test   = extract_mlp_dataset(data_test)

# MLP 학습 및 평가 (정확도 + 정밀도 + 재현율 + F1)
train_and_evaluate_mlp(X_train, y_train, X_val, y_val, X_test, y_test)


Epoch 050 | Loss: 0.3730 | Val Acc: 0.7071
Epoch 100 | Loss: 0.2326 | Val Acc: 0.7272
Epoch 150 | Loss: 0.1525 | Val Acc: 0.7249
Epoch 200 | Loss: 0.1133 | Val Acc: 0.7327
Epoch 250 | Loss: 0.0925 | Val Acc: 0.7394
Epoch 300 | Loss: 0.0792 | Val Acc: 0.7394
Epoch 350 | Loss: 0.0707 | Val Acc: 0.7428
Epoch 400 | Loss: 0.0630 | Val Acc: 0.7472
Epoch 450 | Loss: 0.0576 | Val Acc: 0.7428
Epoch 500 | Loss: 0.0538 | Val Acc: 0.7439

⏹️ Early stopping at epoch 500

Final Test Accuracy (best val): 0.7114
Precision           : 0.7041
Recall              : 0.7057
F1 Score            : 0.7049


(QA_MLP(
   (fc1): Linear(in_features=768, out_features=128, bias=True)
   (fc2): Linear(in_features=128, out_features=1, bias=True)
 ),
 0.7114485502243042,
 0.7040572792362768,
 0.7057416267942583,
 0.7048984468339307)