In [1]:
import os
import json
import time
import logging
from tqdm import tqdm
import google.generativeai as genai
import re

# LangChain imports for Claude
from langchain_anthropic import ChatAnthropic
from langchain_core.prompts import ChatPromptTemplate
import google.generativeai as genai

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
# Destination for the extracted triples (one JSON object per line).
output_file = 'output/step-02-triplets.jsonl'

# Load the inputs through context managers so every file handle is closed
# as soon as its content has been read.
# `data` is iterated with .items() further down, and each value is queried
# for an "abstracts" entry.
with open('output/concept_abstracts.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# `relation_def` maps a relation-type name to a dict carrying a 'description'.
with open('test/relation_types.json', 'r', encoding='utf-8') as f:
    relation_def = json.load(f)

# Prompt template with {concepts}, {abstracts} and {relation_definitions}
# placeholders (filled in by build_prompt).
with open('prompts/prompt_tpextraction.txt', "r", encoding="utf-8") as f:
    prompt_template = f.read()

# Debug aid: uncomment to restrict the run to the single entry at index 50.
#data = dict(list(data.items())[50:51])



In [4]:
# Sanity check: number of top-level entries loaded into `data`.
len(data)

450

In [7]:
def build_prompt(concept_name, abstracts):
    """Fill the prompt template for a single concept.

    Args:
        concept_name: Name of the query concept, substituted for ``{concepts}``.
        abstracts: Iterable of abstract strings; they are joined with single
            spaces and the joined text is cut to its first 10000 characters.

    Returns:
        The module-level ``prompt_template`` formatted with the concept name,
        the truncated abstract text and one ``"<relation>: <description>"``
        line per entry of the module-level ``relation_def``.
    """
    joined_abstracts = " ".join(abstracts)

    relation_lines = []
    for relation_name, relation_info in relation_def.items():
        relation_lines.append(f"{relation_name}: {relation_info['description']}")

    return prompt_template.format(
        concepts=concept_name,
        abstracts=joined_abstracts[:10000],
        relation_definitions="\n".join(relation_lines),
    )

In [54]:
# Never embed credentials in the notebook: read the Gemini API key from the
# environment. Any key that was previously committed in this cell must be
# treated as leaked and revoked/rotated.
_gemini_api_key = os.environ.get("GOOGLE_API_KEY")
if not _gemini_api_key:
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable before running this cell."
    )

genai.configure(api_key=_gemini_api_key)
model = genai.GenerativeModel("gemini-1.5-flash")

In [9]:
def parse_triples(response_text):
    """Line-oriented fallback parser for ``(s, p, o)`` style output.

    Every line is trimmed of surrounding whitespace and of leading/trailing
    parenthesis characters, then split on commas. Only lines that yield
    exactly three comma-separated fields are kept.

    Args:
        response_text: Raw text containing one candidate triple per line.

    Returns:
        A list of dicts with keys ``"s"``, ``"p"`` and ``"o"``, in input order.
    """
    parsed = []
    for raw_line in response_text.strip().split("\n"):
        body = raw_line.strip().strip("()")
        if body:
            fields = [field.strip() for field in body.split(",")]
            # Anything that is not exactly subject/predicate/object is ignored.
            if len(fields) == 3:
                parsed.append(dict(zip(("s", "p", "o"), fields)))
    return parsed

In [10]:
# Maximum number of generate_content attempts per prompt.
max_retries = 5
# Seconds to wait after the first rate-limit error; doubled after each one.
initial_backoff = 10

def call_gemini_with_backoff(prompt):
    """Call the Gemini model, retrying with exponential backoff on rate limits.

    An exception whose message contains "429" is treated as a rate-limit
    error: the call is retried after a wait that starts at ``initial_backoff``
    seconds and doubles each time. Any other exception is logged and aborts
    immediately.

    Args:
        prompt: Prompt passed to ``model.generate_content``.

    Returns:
        The stripped response text, or ``None`` if a non-rate-limit error
        occurred or all ``max_retries`` attempts were rate limited.
    """
    backoff = initial_backoff
    for attempt in range(1, max_retries + 1):
        try:
            response = model.generate_content(prompt)
            return response.text.strip()
        except Exception as e:
            if "429" not in str(e):
                logging.error(f"Gemini API error: {e}")
                return None
            if attempt == max_retries:
                # No further attempt will follow, so sleeping here would
                # only delay the caller for nothing.
                break
            logging.warning(f"Rate limit hit. Backing off for {backoff}s...")
            time.sleep(backoff)
            backoff *= 2  # exponential backoff
    logging.error(f"Rate limit persisted after {max_retries} attempts; giving up.")
    return None

In [11]:
def parse_tuples(response_text):
    """Extract ``(subject, predicate, object)`` tuples from free-form text.

    Tuples may be separated by anything (or nothing), so back-to-back output
    such as ``(a,R,b)(c,R,d)`` is supported. Each component may contain one
    level of balanced parentheses, so ``(O(1),Compare,O(n))`` is parsed as
    subject ``O(1)`` and object ``O(n)`` instead of being dropped. Subject and
    predicate may not contain commas; the object may.

    Args:
        response_text: Raw model output.

    Returns:
        A list of dicts with keys ``'s'``, ``'p'`` and ``'o'`` (whitespace
        trimmed), in order of appearance. Empty if nothing matches.
    """
    # One character at a time, or a whole balanced "(...)" group. The two
    # alternatives start with different characters, which keeps matching linear.
    component = r'(?:[^,()]|\([^()]*\))+'
    # The object is the last field, so commas are allowed inside it.
    object_component = r'(?:[^()]|\([^()]*\))+'
    tuple_pattern = rf'\(({component}),\s*({component}),\s*({object_component})\)'

    triples = []
    for subject, predicate, object_val in re.findall(tuple_pattern, response_text):
        # Same dictionary layout as the rest of the pipeline expects.
        triples.append({
            's': subject.strip(),
            'p': predicate.strip(),
            'o': object_val.strip(),
        })

    return triples


In [56]:
# Relation type of every triple that passed validation, in write order.
extracted_relations = []

# The `with` block is the only place the output file is opened, so the handle
# is closed exactly once, even if the loop raises.
with open(output_file, "w", encoding="utf-8") as output_stream:
    for concept_id, (concept_name, concept_data) in tqdm(enumerate(data.items()), total=len(data)):
        abstracts = concept_data.get("abstracts", [])
        if not abstracts:
            continue

        prompt = build_prompt(concept_name, abstracts)
        print(prompt)
        response_text = call_gemini_with_backoff(prompt)
        print(response_text)

        # Skip failed calls (None), empty replies, and a literal "None" answer.
        if not response_text or response_text == "None":
            continue

        # Parse the tuple response instead of JSON
        try:
            response_triples = parse_tuples(response_text)

            if not response_triples:
                print(f"No valid tuples found for concept {concept_name}")
                continue

            for triple in response_triples:
                # Keep only triples whose relation type is defined in relation_def
                if triple.get('p') not in relation_def:
                    continue
                extracted_relations.append(triple['p'])

                # Add metadata to the triple
                triple['id'] = concept_id
                triple['concept'] = concept_name

                # Write each triple as a JSON line
                output_stream.write(json.dumps(triple) + '\n')

        except Exception as e:
            print(f"Error parsing tuples for concept {concept_name}: {e}")
            # Show the start of the offending response to help debugging
            print(f"Problematic response: {response_text[:200]}...")
            continue

  0%|          | 0/1 [00:00<?, ?it/s]

You are a domain expert in computer science and Data Structure and algorithms, currently building a knowledge graph. Given a block of content and a query concept, perform the following tasks:

1. Identify the **query concept** and extract other **fine-grained in-domain concepts** mentioned in the content. These should be concepts that could be the focus of a lecture, section, or have a dedicated Wikipedia page.

2. Identify **semantic relationships** between the query concept and the other extracted concepts in the form of **triplets**:  
   Format: (head concept, relation type, tail concept)  
   The query concept can be either the head or the tail in a triplet.

3. Use only the following 7 **relation types** and be mindful of directionality:  
Compare: Represents a relationship between two or more entities where a comparison is being made. For example, "A is larger than B" or "X is more efficient than Y."
Part-of: Denotes a relationship where one entity is a constituent or component 

100%|██████████| 1/1 [00:02<00:00,  2.48s/it]

(singly linked list,Part-of,Linked List)(Linked List,Is-a-Prerequisite-of,Data Structure and Algorithms)(singly linked list,Compare,doubly linked list)(singly linked list,Used-for,Data Structure and Algorithms)(singly linked list,Feature-of,head)(singly linked list,Feature-of,tail)(singly linked list,Feature-of,node)(node,Part-of,singly linked list)(doubly linked list,Compare,singly linked list)(doubly linked list,Part-of,Linked List)(traversal,Method-for,searching singly linked list)(searching,Method-for,traversal)(insertion,Method-for,adding a node)(adding a node,Method-for,insertion)(removal,Method-for,deleting a node)(deleting a node,Method-for,removal)(head,Part-of,singly linked list)(tail,Part-of,singly linked list)(O(1),Compare,O(n))(singly linked list,Evaluate-for,insertion)(doubly linked list,Evaluate-for,non-trivial operations)(doubly linked list,Evaluate-for,bi-directional traversal)(Binary Search Tree,Compare,Linked List)



