In [1]:
import os
import json
import pandas as pd
import random
import google.generativeai as genai
from graphs import get_nx_graph, verbalize_neighbors_triples_from_triples, verbalize_neighbors_triples_from_graph
from models import KnowledgeGraphLLM
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the per-concept text data. A later cell looks concepts up in this
# mapping and reads their 'abstracts' entry to build the prompt background.
# The context manager closes the file handle instead of leaking it, and the
# encoding is pinned because JSON files are UTF-8 regardless of platform default.
with open('output/concept_abstracts.json', 'r', encoding='utf-8') as abstracts_file:
    data = json.load(abstracts_file)

In [3]:
# Load the extracted triples from the JSONL file: one JSON object per line
# with keys 's' (subject), 'p' (predicate) and 'o' (object).
candidate_triples = []
with open('output/step-02-triplets.jsonl', 'r', encoding='utf-8') as triples_file:
    for line in triples_file:
        # A blank line (e.g. a trailing newline at end of file) is not valid
        # JSON and would make json.loads raise, so skip it.
        if not line.strip():
            continue
        t = json.loads(line)
        candidate_triples.append((t['s'], t['p'], t['o']))
print(f'Loaded {len(candidate_triples)} triples.')


Loaded 2357 triples.


In [4]:

# Export only the 'Is-a-Prerequisite-of' edges of the candidate triples as a
# tab-separated file: subject, relation, object - one edge per line.
output_file = "output/prerequisite_of_graph.tsv"

with open(output_file, "w", encoding="utf-8") as tsv_out:
    tsv_out.writelines(
        f"{subj}\tIs-a-Prerequisite-of\t{obj}\n"
        for subj, pred, obj in candidate_triples
        if pred == 'Is-a-Prerequisite-of'
    )

print(f"✅ File saved as {output_file}")


✅ File saved as output/prerequisite_of_graph.tsv


In [12]:
# Load the prerequisite edges from the TSV export and build a graph from them.
# Subjects and objects are lower-cased; the relation label is kept as written.
#
# NOTE(review): `concept_2_id` and `relation_2_id` are defined in cells that
# appear further down in this notebook, so this cell fails on a fresh kernel
# unless those cells are run first.
prerequisite_graph_path = 'output/prerequisite_of_graph.tsv'

prerequisite_of_triples = []
if os.path.exists(prerequisite_graph_path):
    print(f"Loading annotated graph from {prerequisite_graph_path}.")
    # The export cell writes this file as UTF-8, so read it back the same way
    # instead of relying on the platform default encoding.
    with open(prerequisite_graph_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue  # tolerate blank lines
            # Only drop the line terminator before splitting: stripping the
            # whole line first would swallow a leading/trailing tab and shift
            # the columns.
            fields = [field.strip() for field in line.rstrip('\r\n').split('\t')]
            if len(fields) != 3 or not all(fields):
                print(f"Skipping malformed line {line_no} in {prerequisite_graph_path}: {line!r}")
                continue
            s, p, o = fields
            prerequisite_of_triples.append((s.lower(), p, o.lower()))
else:
    print(
        f"No annotated graph found at {prerequisite_graph_path}. Proceeding without it.")

prerequisite_of_graph = get_nx_graph(prerequisite_of_triples, concept_2_id, relation_2_id)

Loading annotated graph from output/prerequisite_of_graph.tsv.


In [9]:
# Build the id <-> concept lookup tables.
#
# The original guard compared a string literal against None, which is always
# true, so the fallback branch could never run and a missing file surfaced as
# a pandas error. Check for the file on disk instead.
refined_concepts_path = 'output/refined_concepts.tsv'

if os.path.exists(refined_concepts_path):
    print(
        f"Refined concepts specified. Loading concepts from {refined_concepts_path}.")
    # The file is '|'-separated with no header row: <id>|<concept>.
    refined_concepts_df = pd.read_csv(refined_concepts_path, sep='|', header=None,
                                      names=['id', 'concept'], index_col=0)
    id_2_concept = {i: str(c['concept']) for i, c in refined_concepts_df.iterrows()}

    print(
        f"Loaded {len(id_2_concept)} refined concepts, e.g. {', '.join(list(id_2_concept.values())[:3])}")
else:
    # Randomly sample up to 100 distinct concepts extracted in step 2.
    # Subjects and objects are merged into a set so every concept gets exactly
    # one id; sorting before shuffling makes the sample reproducible when the
    # `random` module is seeded.
    concepts = sorted({c[0] for c in candidate_triples} | {c[2] for c in candidate_triples})
    random.shuffle(concepts)
    concepts = concepts[:100]
    print(
        f'No refined concepts specified. Randomly selected concepts: {", ".join(concepts)}')
    id_2_concept = {i: c for i, c in enumerate(concepts)}

# Reverse lookup: concept text -> id.
concept_2_id = {v: k for k, v in id_2_concept.items()}

Refined concepts specified. Loading concepts from output/refined_concepts.tsv.
Loaded 592 refined concepts, e.g. min heap, current, garbage collection algorithm


In [13]:
import csv

# Rewrite the refined-concepts file with normalised names: every concept is
# lower-cased and stripped, duplicates are dropped (first occurrence wins) and
# the survivors are renumbered consecutively from 1.
# NOTE(review): this overwrites the same '|'-separated file that the
# concept-loading cell reads, so re-running that cell afterwards yields
# different ids.
with open("output/refined_concepts.tsv", "w", newline="", encoding="utf-8") as tsv_file:
    # dict.fromkeys keeps insertion order, which makes it an ordered de-duplicator.
    normalized_concepts = dict.fromkeys(name.lower().strip() for name in concept_2_id)
    csv.writer(tsv_file, delimiter="|").writerows(
        enumerate(normalized_concepts, start=1)
    )


In [14]:
# load relation types
relation_def = json.load(open('relation_types.json'))
relation_types = list(relation_def.keys())
relation_2_id = {v: k for k, v in enumerate(relation_types)}
id_2_relation = {k: v for k, v in enumerate(relation_types)}
relation_2_id

{'Compare': 0,
 'Part-of': 1,
 'Conjunction': 2,
 'Evaluate-for': 3,
 'Is-a-Prerequisite-of': 4,
 'Used-for': 5,
 'Hyponym-Of': 6}

In [93]:
# Debug helper: show every prerequisite triple that has 'doubly linked list'
# as its subject or as its object.
for triple in prerequisite_of_triples:
    if 'doubly linked list' in (triple[0], triple[2]):
        print(triple)

('doubly linked list', 'Is-a-Prerequisite-of', 'efficient searching')
('doubly linked list', 'Is-a-Prerequisite-of', 'reverse list traversal')
('doubly linked list', 'Is-a-Prerequisite-of', 'reverse list traversal')
('doubly linked list', 'Is-a-Prerequisite-of', 'reverse list traversal')
('doubly linked list', 'Is-a-Prerequisite-of', 'reverse list traversal')


In [15]:
# Read the fusion prompt template as one string; the fusion cells fill in its
# placeholders with str.format once per concept.
with open('prompts/prompt_fusion.txt', "r", encoding="utf-8") as template_file:
    prompt_template_txt = template_file.read()

In [16]:
# Configure the Gemini client.
# The API key is read from the environment so that no credential is stored in
# the notebook (notebooks get shared and committed together with their source).
# SECURITY: the key that used to be hard-coded here has been exposed and should
# be revoked and replaced.
gemini_api_key = os.environ.get("GOOGLE_API_KEY")
if not gemini_api_key:
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable before running this cell.")

genai.configure(api_key=gemini_api_key)
model = genai.GenerativeModel("gemini-1.5-flash")

In [17]:
import re

# Matches "(subject, predicate, object)".
# Subject and predicate may not contain commas or parentheses. The object runs
# up to the first ")" and may therefore contain commas. Nested parentheses are
# NOT supported.
_TUPLE_PATTERN = re.compile(r'\(([^,()]+),\s*([^,()]+),\s*([^)]+)\)')
_QUOTE_CHARS = ('"', "'", '`')


def _clean_field(raw):
    """Trim whitespace and one pair of matching surrounding quotes from a field."""
    text = raw.strip()
    if len(text) >= 2 and text[0] == text[-1] and text[0] in _QUOTE_CHARS:
        text = text[1:-1].strip()
    return text


def parse_tuples(response_text):
    """Extract ``(subject, predicate, object)`` tuples from free text.

    Args:
        response_text: Text that may contain any number of parenthesised,
            comma-separated triples. ``None`` or an empty string is accepted.

    Returns:
        A list of dicts with keys ``'s'``, ``'p'`` and ``'o'``, in order of
        appearance. Each field is stripped of surrounding whitespace and of one
        pair of matching quotes, so ``("a", "Used-for", "b")`` yields the same
        triple as ``(a, Used-for, b)``. Without this, a quoted predicate never
        equals a bare relation name. Triples with an empty field are dropped.
    """
    if not response_text:
        return []

    triples = []
    for raw_fields in _TUPLE_PATTERN.findall(response_text):
        subject, predicate, object_val = (_clean_field(part) for part in raw_fields)
        if not (subject and predicate and object_val):
            continue  # an empty component cannot form a usable triple
        triples.append({
            's': subject,
            'p': predicate,
            'o': object_val
        })

    return triples


In [18]:
import time

# Retry policy for rate-limited Gemini calls.
max_retries = 5        # total number of attempts per prompt
initial_backoff = 10   # seconds to wait after the first rate-limit error


def call_gemini_with_backoff(prompt):
    """Call the Gemini model, retrying with exponential backoff on rate limits.

    Args:
        prompt: Prompt passed to ``model.generate_content``.

    Returns:
        The stripped response text, or ``None`` when a non-rate-limit error
        occurs or when every one of the ``max_retries`` attempts was
        rate-limited.

    A rate limit is recognised by the substring "429" in the exception message.
    The wait doubles after each rate-limited attempt. No wait is performed
    after the final attempt: nothing would be retried afterwards, so sleeping
    there only delays the caller (160s with the default settings).
    """
    backoff = initial_backoff
    for attempt in range(max_retries):
        try:
            response = model.generate_content(prompt)
            return response.text.strip()
        except Exception as e:
            if "429" not in str(e):
                print(f"Gemini API error: {e}")
                return None
            if attempt == max_retries - 1:
                print(f"Rate limit hit. Giving up after {max_retries} attempts.")
                break
            print(f"Rate limit hit. Backing off for {backoff}s...")
            time.sleep(backoff)
            backoff *= 2  # exponential backoff
    return None

In [None]:
import json
from tqdm import tqdm

# Step 3 (fusion): for every concept, prompt Gemini with the concept's
# neighbourhood in both graphs plus background text, then append the valid
# triples from the response to a JSONL file.
output_file = 'output/step-03-fusion(aaaaaaaaaa).jsonl'
extracted_relations = []  # predicate of every triple that was kept

# The relation definitions do not depend on the concept, so format them once
# instead of on every iteration.
relation_definitions_txt = "\n".join(
    f"{rel_type}: {rel_data['description']}"
    for rel_type, rel_data in relation_def.items()
)

with open(output_file, "w", encoding="utf-8") as output_stream:
    # `concept_id` rather than `id`, so the builtin is not shadowed for the
    # rest of the kernel session.
    for concept_id, candidate_concept in tqdm(id_2_concept.items(), total=len(id_2_concept)):
        # Local subgraph (only from candidate triples for this concept)
        candidate_subgraph = verbalize_neighbors_triples_from_triples(
            candidate_triples, candidate_concept)

        # Neighbourhood of the same concept in the prerequisite graph.
        prerequisite_of_graph_subgraph = verbalize_neighbors_triples_from_graph(
            prerequisite_of_graph, candidate_concept, concept_2_id, id_2_concept,
            mode='outgoing')

        # Background text: all abstracts of the concept joined by spaces, or an
        # empty string when the concept has no entry in `data`.
        abstracts = ' '.join(
            data[candidate_concept]['abstracts']) if candidate_concept in data else ''

        # Build the final prompt (background is capped at 10000 characters).
        prompt = prompt_template_txt.format(
            concept=candidate_concept,
            graph1=candidate_subgraph,
            graph2=prerequisite_of_graph_subgraph,
            background=abstracts[:10000],
            relation_definitions=relation_definitions_txt,
        )

        # Call Gemini with backoff
        response_text = call_gemini_with_backoff(prompt)

        if not response_text or response_text == "None":
            continue

        try:
            response_triples = parse_tuples(response_text)

            if not response_triples:
                # The original message referenced an undefined `concept_name`,
                # which raised NameError here.
                print(f"No valid tuples found for concept {candidate_concept}")
                continue

            for triple in response_triples:
                # keep only valid relations
                if triple.get("p") not in relation_def:
                    continue
                extracted_relations.append(triple["p"])
                output_stream.write(json.dumps(triple) + "\n")

        except Exception as e:
            # Same undefined name as above: the handler itself used to raise.
            print(f"Error parsing tuples for concept {candidate_concept}: {e}")
            print(f"Problematic response: {response_text[:200]}...")
            continue

print("Step 3: Fusion completed")


In [None]:
import json
from tqdm import tqdm
from itertools import islice

# Step 3 (fusion) on a slice of the concepts: same pipeline as the full run,
# restricted to a fixed window and written to a separate file.
output_file = 'output/step-03-fusion37.jsonl'
extracted_relations = []  # predicate of every triple that was kept

# Process only the concepts at positions 370-379 (0-based, in insertion order)
# of id_2_concept. The previous comment said "first 10 items", which did not
# match the slice.
id_2_concept_small = dict(islice(id_2_concept.items(), 370, 380))

# The relation definitions do not depend on the concept, so format them once
# instead of on every iteration.
relation_definitions_txt = "\n".join(
    f"{rel_type}: {rel_data['description']}"
    for rel_type, rel_data in relation_def.items()
)

with open(output_file, "w", encoding="utf-8") as output_stream:
    # `concept_id` rather than `id`, so the builtin is not shadowed for the
    # rest of the kernel session.
    for concept_id, candidate_concept in tqdm(id_2_concept_small.items(),
                                              total=len(id_2_concept_small)):
        # Local subgraph (only from candidate triples for this concept)
        candidate_subgraph = verbalize_neighbors_triples_from_triples(
            candidate_triples, candidate_concept)

        # Neighbourhood of the same concept in the prerequisite graph. The full
        # id_2_concept mapping is passed here, not the slice.
        prerequisite_of_graph_subgraph = verbalize_neighbors_triples_from_graph(
            prerequisite_of_graph, candidate_concept, concept_2_id, id_2_concept,
            mode='outgoing')

        # Background text: all abstracts of the concept joined by spaces, or an
        # empty string when the concept has no entry in `data`.
        abstracts = ' '.join(
            data[candidate_concept]['abstracts']) if candidate_concept in data else ''

        # Build the final prompt (background is capped at 10000 characters).
        prompt = prompt_template_txt.format(
            concept=candidate_concept,
            graph1=candidate_subgraph,
            graph2=prerequisite_of_graph_subgraph,
            background=abstracts[:10000],
            relation_definitions=relation_definitions_txt,
        )

        # Call Gemini with backoff
        response_text = call_gemini_with_backoff(prompt)

        if not response_text or response_text == "None":
            continue

        try:
            response_triples = parse_tuples(response_text)

            if not response_triples:
                print(f"No valid tuples found for concept {candidate_concept}")
                continue

            for triple in response_triples:
                # keep only valid relations
                if triple.get("p") not in relation_def:
                    continue
                extracted_relations.append(triple["p"])
                output_stream.write(json.dumps(triple) + "\n")

        except Exception as e:
            print(f"Error parsing tuples for concept {candidate_concept}: {e}")
            print(f"Problematic response: {response_text[:200]}...")
            continue

print("Step 3: Fusion completed")


In [None]:
# API keys must not be stored in the notebook. Supply the key through an
# environment variable or a secrets manager instead.
# NOTE: the keys previously listed in this cell were exposed in plain text and
# have been redacted here; they should be revoked and rotated.


In [26]:
import json

# De-duplicate the fused triples: keep the first occurrence of every
# (s, p, o) combination, preserve the original order, and print each duplicate
# that gets dropped.
# NOTE(review): this input path differs from the paths written by the fusion
# cells visible in this notebook - confirm which file is meant to be read.
input_file = "output/step-03-fusion.jsonl"       # your original file
output_file = "output/step-03-fusion-unique.jsonl"  # new file with unique triplets

# Keys already encountered, plus the surviving triplets in input order.
unique = set()
unique_triplets = []

with open(input_file, "r", encoding="utf-8") as infile:
    for line in infile:
        line = line.strip()
        # A blank line (e.g. at end of file) is not valid JSON and would make
        # json.loads raise, so skip it.
        if not line:
            continue
        triplet = json.loads(line)
        # Tuple used to check uniqueness
        triplet_key = (triplet["s"], triplet["p"], triplet["o"])

        if triplet_key in unique:
            print(triplet_key)
            continue
        unique.add(triplet_key)
        unique_triplets.append(triplet)

print(len(unique_triplets))
# Write unique triplets back to a new .jsonl file
with open(output_file, "w", encoding="utf-8") as outfile:
    for triplet in unique_triplets:
        outfile.write(json.dumps(triplet, ensure_ascii=False) + "\n")

print(f"✅ Done! Saved {len(unique_triplets)} unique triplets to {output_file}")


('pseudocode', 'Part-of', 'algorithm')
('algorithm', 'Used-for', 'problem solving')
('algorithm', 'Used-for', 'doubly linked list')
('algorithm', 'Part-of', 'computer science')
('algorithm', 'Used-for', 'list')
('algorithm', 'Used-for', 'data structures')
('data structures', 'Part-of', 'algorithm')
('algorithm', 'Conjunction', 'data structures')
('hash table', 'Part-of', 'algorithm')
('hash table', 'Used-for', 'algorithm')
('algorithm', 'Used-for', 'hash table')
('algorithm', 'Used-for', 'queue')
('queue', 'Used-for', 'algorithm')
('algorithm', 'Used-for', 'heap')
('heap', 'Used-for', 'priority queue')
('node', 'Part-of', 'binary search tree')
('ordered set', 'Part-of', 'binary search tree')
('ordered set', 'Compare', 'unordered set')
('hash table', 'Used-for', 'unordered set')
('unordered set', 'Compare', 'ordered set')
('ordered', 'Is-a-Prerequisite-of', 'ordered set')
('unordered', 'Is-a-Prerequisite-of', 'unordered set')
('ordered', 'Conjunction', 'unordered')
('ordered', 'Compare'