In [12]:
import pandas as pd
import json
from glob import glob
import numpy as np
import re

from tgb.linkproppred.evaluate import Evaluator
from tgb.linkproppred.dataset import LinkPropPredDataset

In [13]:
# Experiment selection. An alternative configuration that can be swapped in:
#   dataset = "tgbl-wiki"
#   model = "gpt-4o-mini-2024-07-18"
#   prompt_version = "base_v1_icl"
prompt_version = "base_v1_icl"
model = "gpt-4.1-mini-2025-04-14"
dataset = "tgbl-lastfm"

# Directory that is globbed for the model's *.jsonl response files.
result_folder = f"./result/{dataset}/{model}/{prompt_version}/output"

# Directory holding the answer key for this run, and the path of the parquet file itself.
folder = f"./output/{dataset}/{model}/{prompt_version}"
answer_key = f"{folder}/answer_key.parquet"

In [14]:
# Load the TGB link-property-prediction dataset selected in the config cell and read the
# name of the metric it is evaluated with.
link_prop_dataset = LinkPropPredDataset(name=dataset, root="datasets", preprocess=True)
metric = link_prop_dataset.eval_metric

# One evaluator for the chosen dataset is enough; it is reused for every scored row.
evaluator = Evaluator(name=dataset)

raw file found, skipping download
Dataset directory is  /Users/zacharyyang/anaconda3/lib/python3.10/site-packages/tgb/datasets/tgbl_lastfm
loading processed file


In [15]:
# `answer_key` starts out as the parquet path (a str) and is rebound here to the loaded
# DataFrame. Resolve the path defensively so that re-running this cell re-reads the file
# instead of handing the already-loaded DataFrame to read_parquet.
answer_key_path = answer_key if isinstance(answer_key, str) else f"{folder}/answer_key.parquet"
answer_key = pd.read_parquet(answer_key_path)

In [16]:
def extract_destination_node(text: str) -> int:
    """Extract the destination node from a free-text model answer.

    The node is taken to be the number that ends the text (trailing whitespace is
    ignored). A purely zero fractional part is tolerated, so "42.0" yields 42.

    Args:
        text: The model answer to search.

    Returns:
        The trailing integer, or -1 when the text does not end in an integral number
        (no digits at the end, or a non-integral value such as "3.7").
    """
    # (?<![\d.]) keeps the match from starting in the middle of a number or inside the
    # fractional part of a decimal, so "3.7" is rejected rather than read as 7.
    # Anchoring the whole pattern at the end means numbers earlier in the text are
    # ignored, and only the integer part is captured so int() can never fail.
    match = re.search(r'(?<![\d.])(\d+)(?:\.0+)?\s*$', text)
    if match is None:
        return -1
    return int(match.group(1))

In [17]:
# Parse every response file into one record per task:
#   {"task_id": <custom_id>, "destination_node": <predicted node, or -1>}
# -1 marks an answer that was empty or from which no node could be extracted.
result = []
error_count = 0   # answers that were not JSON with a "destination_node" key (regex fallback used)
empty_count = 0   # answers that were missing or whitespace-only
failed_count = 0  # lines that could not be processed at all; no record is produced for them

for file in glob(f"{result_folder}/*.jsonl"):
    # Explicit encoding so decoding does not depend on the platform's default locale.
    with open(file, "r", encoding="utf-8") as f:
        for line in f:
            try:
                data = json.loads(line)
                # Named task_id rather than id so the builtin id() is not shadowed.
                task_id = data.get("custom_id")
                model_answer = data["response"]["body"]["choices"][0]["message"]["content"]

                # Strip line breaks and tabs before parsing; a missing answer becomes "".
                if model_answer:
                    model_answer = str(model_answer).replace("\n", "").replace("\r", "").replace("\t", "")
                else:
                    model_answer = ""

                if model_answer.strip() == "":
                    empty_count += 1
                    result.append({"task_id": task_id, "destination_node": -1})
                    continue

                try:
                    destination_node = json.loads(model_answer)["destination_node"]
                except Exception:
                    # Not the expected JSON object: fall back to regex extraction on the raw text.
                    destination_node = extract_destination_node(model_answer)
                    error_count += 1
                    print(f"Error parsing JSON: {model_answer}")

                result.append({"task_id": task_id, "destination_node": destination_node})

            except Exception:
                # Malformed line or unexpected response shape. The task gets no row in
                # `result`, so count it to make the omission visible in the summary below.
                failed_count += 1
                print(f"Error processing line: {line}")

result = pd.DataFrame(result)
print(f"Error count: {error_count}")
print(f"Empty count: {empty_count}")
print(f"Failed line count: {failed_count}")



Error count: 0
Empty count: 3


In [18]:
# Sanity check: show the row counts of the parsed predictions and of the answer key side by side.
(len(result), len(answer_key))

(193966, 193966)

In [19]:
# Attach each task's parsed prediction to its answer-key row (inner join on task_id).
# A "destination_node" column left over from a previous run of this cell is dropped first,
# so re-running the cell does not yield destination_node_x / destination_node_y columns.
answer_key = answer_key.drop(columns=["destination_node"], errors="ignore").merge(result, on="task_id")

In [20]:
def predict_link(query_dst: np.ndarray, llm_dst: int) -> np.ndarray:
    r"""
    Turn a single LLM prediction into a per-candidate score vector for MRR evaluation.

    Each position of the returned float array is 1.0 when the candidate at that position
    in ``query_dst`` equals ``llm_dst`` and 0.0 otherwise.
    """
    indicator = [1.0 if candidate == llm_dst else 0.0 for candidate in query_dst]
    return np.array(indicator, dtype=float)

def get_score(query_dst: np.ndarray, 
              llm_dst: int,
              evaluator,
              metric: str) -> float:
    r"""
    Score one LLM prediction with the given evaluator.

    The prediction is expanded into a 0/1 vector over ``query_dst``. The first entry is
    passed to the evaluator as the positive score and all remaining entries as the
    negative scores (presumably the true destination is listed first in ``query_dst``;
    confirm against how the answer key is built).

    Returns the value stored under ``metric`` in the evaluator's result.
    """
    hit_vector = predict_link(query_dst, llm_dst)
    positive_score = np.array([hit_vector[0]])
    negative_scores = np.array(hit_vector[1:])

    eval_result = evaluator.eval(
        {
            "y_pred_pos": positive_score,
            "y_pred_neg": negative_scores,
            "eval_metric": [metric],
        }
    )
    return eval_result[metric]
   

In [21]:
def _score_row(row: pd.Series) -> float:
    """Score one merged row: the first element of its query_dst against the parsed prediction."""
    candidates = row["query_dst"][0]
    return get_score(candidates, row["destination_node"], evaluator, metric)


# Row-wise scoring; the per-task result is stored in a new "score" column.
answer_key["score"] = answer_key.apply(_score_row, axis=1)

In [22]:
# Report the mean of the per-task scores, formatted to three decimals.
mean_score = np.mean(answer_key['score'])
print(f"Average Score for {dataset} {prompt_version} is: {mean_score:.3f}")

Average Score for tgbl-lastfm base_v1_icl is: 0.065
