In [4]:
import json

# Input files. Later cells label items from file1 as "gpt4" and items from
# file2 as "hyper".
file1 = "final_hyper_gpt4.json"
file2 = "final_hyper.json"

# JSON is read as UTF-8 explicitly: without `encoding`, open() falls back to
# the platform's locale encoding, which can mis-decode or reject non-ASCII
# text (e.g. in titles/abstracts) on systems whose default is not UTF-8.
with open(file1, 'r', encoding='utf-8') as f1, open(file2, 'r', encoding='utf-8') as f2:
    data1 = json.load(f1)
    data2 = json.load(f2)

# Safe stringify function for entries
def stringify_entry_safe(entry):
    """Render one chain entry as a single-line, pipe-delimited string.

    The fields are concatenated in a fixed order so that the result can be
    used as a set member or dictionary key when comparing entries.

    Args:
        entry: Mapping for one entry. The keys ``paperId``, ``pmid``,
            ``year``, ``citation_count``, ``title`` and ``abstract`` are
            read; each defaults to an empty string when absent.

    Returns:
        A string of the form
        ``paperId=... | pmid=... | year=... | citation_count=... | title=... | abstract=...``.
        Newlines inside the abstract are replaced by spaces, and leading and
        trailing whitespace is stripped from the title and the abstract. A
        falsy abstract (for example ``None``) is formatted as-is.
    """
    abstract = entry.get('abstract', '')
    if abstract:
        abstract = abstract.replace('\n', ' ').strip()

    # A key that is present but null yields None from .get(), and None has no
    # .strip(); fall back to an empty title instead of raising AttributeError.
    title = (entry.get('title') or '').strip()

    return (
        f"paperId={entry.get('paperId', '')} | "
        f"pmid={entry.get('pmid', '')} | "
        f"year={entry.get('year', '')} | "
        f"citation_count={entry.get('citation_count', '')} | "
        f"title={title} | "
        f"abstract={abstract}"
    )

def get_all_stringified_chains_safe(data):
    """Return the set of distinct stringified entries across all items.

    Each item's ``"original_chain"`` list is walked (an item without that key
    contributes nothing) and every entry is passed through
    ``stringify_entry_safe``.
    """
    return {
        stringify_entry_safe(paper)
        for record in data
        for paper in record.get("original_chain", [])
    }

def get_stringified_entry_map(data):
    """Map each entry's stringified form to the entry itself.

    Every entry in every item's ``"original_chain"`` list is keyed by the
    result of ``stringify_entry_safe``. When two entries produce the same
    string, the one encountered later replaces the earlier one.
    """
    return {
        stringify_entry_safe(paper): paper
        for record in data
        for paper in record.get("original_chain", [])
    }

# One stringified-entry lookup per input file.
map1, map2 = map(get_stringified_entry_map, (data1, data2))

# Stringified entries that occur in both files.
matching_keys = set(map1).intersection(map2)
len(matching_keys)

556

In [9]:
# Index the data1 items by the file name recorded in their metadata.
file_name_lookup_data1 = {
    record["metadata"]["file_name"]: record
    for record in data1
}

# Pair every data2 item with the data1 item that has the same file name.
# Note the naming: item1 iterates over data2, item2 is the data1 counterpart.
matched_items = []
for item1 in data2:
    file_name = item1["metadata"].get("file_name")
    item2 = file_name_lookup_data1.get(file_name)
    if not item2:
        continue
    matched_items.append(
        (
            file_name,
            item1["generated_research_idea"],
            item2["generated_research_idea"],
        )
    )


In [20]:
# Display how many entries matched_items currently holds.
len(matched_items)

131

In [None]:
# Pair each data2 item with the data1 item that shares its metadata file name,
# keeping the generated idea and the scores from both sides.
# Naming: item1 iterates over data2 (stored under the "hyper_*" keys) and
# item2 is the data1 counterpart (stored under the "gpt4_*" keys).
matched_items = []

for item1 in data2:
    file_name = item1["metadata"].get("file_name")
    item2 = file_name_lookup_data1.get(file_name)
    if item2:
        matched_items.append({
            "file_name": file_name,
            "gpt4_generated_idea": item2.get("generated_research_idea"),
            "gpt4_scores": item2.get("scores"),
            "hyper_generated_idea": item1.get("generated_research_idea"),
            "hyper_scores": item1.get("scores")
        })

# Preview the first match. Indexing [0] on an empty list raises IndexError,
# so the preview is guarded.
if matched_items:
    print(json.dumps(matched_items[0], indent=2))
else:
    print("No matched items found.")

In [27]:
# Rebuild matched_items, this time restricted to data2 items whose metadata
# "chain_label" is "valid", and including each side's novelty result.
# Naming: item1 iterates over data2 (stored under the "hyper_*" keys) and
# item2 is the data1 counterpart (stored under the "gpt4_*" keys).
matched_items = []

for item1 in data2:
    file_name = item1["metadata"].get("file_name")
    chain_label = item1["metadata"].get("chain_label")

    if chain_label != "valid":
        continue

    item2 = file_name_lookup_data1.get(file_name)
    if item2:
        # "novelty_analysis" may be missing or null; `or {}` keeps the chained
        # .get() from raising AttributeError on None, so such items get a
        # novelty value of None instead of aborting the whole loop.
        gpt4_novelty = (item2.get("novelty_analysis") or {}).get("novelty_result")
        hyper_novelty = (item1.get("novelty_analysis") or {}).get("novelty_result")

        matched_items.append({
            "file_name": file_name,
            "gpt4_generated_idea": item2.get("generated_research_idea"),
            "gpt4_scores": item2.get("scores"),
            "gpt4_novelty": gpt4_novelty,
            "hyper_generated_idea": item1.get("generated_research_idea"),
            "hyper_scores": item1.get("scores"),
            "hyper_novelty": hyper_novelty
        })

print(len(matched_items))
# Example output
if matched_items:
    print(json.dumps(matched_items[0], indent=2))
else:
    print("No valid matched items found.")


31
{
  "file_name": "temporal_chain_CD005158_p-1.json",
  "gpt4_generated_idea": {
    "Analysis": "\n        \"related paper 0\": \"This paper builds upon the source paper by evaluating the optimal duration of dual antiplatelet therapy (DAPT) with clopidogrel and aspirin in patients with acute coronary syndrome (ACS). It extends the findings of the source paper by examining the effects of different durations of DAPT on ischemic events and bleeding, providing real-world data from a large cohort.\",\n        \"related paper 1\": \"This paper discusses the necessity of prolonged DAPT in patients with drug-eluting stents, building on the previous findings by questioning the universal application of extended DAPT. It highlights the need for personalized treatment durations based on individual patient risk profiles, addressing the balance between reducing ischemic events and minimizing bleeding risks.\",\n        \"related paper 2\": \"This paper explores the perioperative challenges of man

In [28]:
from collections import defaultdict

import pandas as pd

# Dimensions to evaluate
dimensions = ["clarity", "relevance", "originality", "feasibility", "significance"]

# Per-dimension score sums for each system.
gpt4_totals = defaultdict(float)
hyper_totals = defaultdict(float)
# Per-dimension number of items that actually carry that dimension. These are
# the denominators for the averages, so a dimension missing from some items
# does not pull its mean towards zero.
gpt4_dim_counts = defaultdict(int)
hyper_dim_counts = defaultdict(int)
# Number of matches that carry any scores at all for each system.
gpt4_count = 0
hyper_count = 0

# Aggregate scores
for match in matched_items:
    # A stored None is returned by .get() even when a default is supplied,
    # so normalise with `or {}` before the membership tests below.
    gpt4_scores = match.get("gpt4_scores") or {}
    hyper_scores = match.get("hyper_scores") or {}

    for dim in dimensions:
        if dim in gpt4_scores:
            gpt4_totals[dim] += gpt4_scores[dim]
            gpt4_dim_counts[dim] += 1
        if dim in hyper_scores:
            hyper_totals[dim] += hyper_scores[dim]
            hyper_dim_counts[dim] += 1

    if gpt4_scores:
        gpt4_count += 1
    if hyper_scores:
        hyper_count += 1

# Compute averages (None when no item provided the dimension)
comparison_table = []
for dim in dimensions:
    gpt4_n = gpt4_dim_counts[dim]
    hyper_n = hyper_dim_counts[dim]
    gpt4_avg = round(gpt4_totals[dim] / gpt4_n, 2) if gpt4_n else None
    hyper_avg = round(hyper_totals[dim] / hyper_n, 2) if hyper_n else None
    comparison_table.append({
        "Dimension": dim.capitalize(),
        "GPT-4 Score": gpt4_avg,
        "HypER Score": hyper_avg,
    })

# Print as a table-like format
df = pd.DataFrame(comparison_table)
print(df.to_string(index=False))


   Dimension  GPT-4 Score  HypER Score
     Clarity         4.00         3.35
   Relevance         3.96         3.35
 Originality         3.00         3.01
 Feasibility         3.53         3.15
Significance         3.84         3.37


In [30]:
import json
from collections import Counter

# Parse the novelty label out of a JSON-encoded novelty result
def extract_novelty_score(novelty_str):
    """Return the normalised "Novelty score" label from a JSON string.

    Args:
        novelty_str: JSON text that is expected to decode to an object with a
            string-valued ``"Novelty score"`` field.

    Returns:
        The field value stripped of surrounding whitespace and lower-cased;
        ``""`` when the object has no ``"Novelty score"`` field; ``None`` when
        the input is not a string, is not valid JSON, does not decode to an
        object, or the field is not a string.
    """
    try:
        parsed = json.loads(novelty_str.strip())
        return parsed.get("Novelty score", "").strip().lower()
    except (AttributeError, TypeError, ValueError):
        # Only the expected shape problems are treated as "no score":
        #   AttributeError - input/field has no .strip(), or the decoded JSON
        #                    is not an object (no .get()).
        #   ValueError     - malformed JSON (json.JSONDecodeError subclasses it).
        #   TypeError      - json.loads() was handed an unsupported type.
        # Anything else is unexpected and is allowed to propagate.
        return None

# Accumulators for the pairwise novelty comparison
score_pairs = []
gpt4_counts = Counter()
hyper_counts = Counter()
agreement = 0

# matched_items is expected to exist already (built by an earlier cell)
for item in matched_items:
    gpt4_score = extract_novelty_score(item.get("gpt4_novelty", ""))
    hyper_score = extract_novelty_score(item.get("hyper_novelty", ""))

    # Only items where both labels parsed to a non-empty string are compared
    if not (gpt4_score and hyper_score):
        continue

    score_pairs.append((gpt4_score, hyper_score))
    gpt4_counts[gpt4_score] += 1
    hyper_counts[hyper_score] += 1
    if gpt4_score == hyper_score:
        agreement += 1

total = len(score_pairs)

# Share of compared pairs whose labels are identical, as a percentage
if total:
    agreement_percent = (agreement / total) * 100
else:
    agreement_percent = 0

# Confusion counts, taking GPT-4's "novel" label as the reference class
tp = sum(g == "novel" and h == "novel" for g, h in score_pairs)
fp = sum(g != "novel" and h == "novel" for g, h in score_pairs)
fn = sum(g == "novel" and h != "novel" for g, h in score_pairs)

# Each ratio falls back to 0 when its denominator is not positive
if (tp + fp) > 0:
    precision = tp / (tp + fp)
else:
    precision = 0

if (tp + fn) > 0:
    recall = tp / (tp + fn)
else:
    recall = 0

if (precision + recall) > 0:
    f1 = 2 * (precision * recall) / (precision + recall)
else:
    f1 = 0

# Report
print("\n--- Novelty Score Analysis ---")
print(f"Total matched items: {total}")
print(f"Agreement count: {agreement}")
print(f"Agreement %: {agreement_percent:.2f}%\n")

print("GPT-4 Novelty Distribution:", dict(gpt4_counts))
print("HypER Novelty Distribution:", dict(hyper_counts))

print("\n--- Precision/Recall (GPT-4 'novel' as ground truth) ---")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")



--- Novelty Score Analysis ---
Total matched items: 31
Agreement count: 13
Agreement %: 41.94%

GPT-4 Novelty Distribution: {'novel': 19, 'not novel': 12}
HypER Novelty Distribution: {'novel': 18, 'not novel': 12, 'unknown': 1}

--- Precision/Recall (GPT-4 'novel' as ground truth) ---
Precision: 0.56
Recall: 0.53
F1 Score: 0.54
