In [1]:
from pathlib import Path
import sys

from dotenv import load_dotenv
import os

load_dotenv()  # This loads the variables from .env
api_key = os.getenv('OPENAI_API_KEY')  # This gets a specific variable

# Add the src directory to Python path
src_path = "src"
sys.path.append(str(src_path))

from matsci_llm_causality.extraction.pdf import PDFProcessor
from matsci_llm_causality.models import create_model

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Initialize components
pdf_processor = PDFProcessor()
entity_recognizer = create_model("gpt-5-entity")
relation_extractor = create_model("gpt-5-relation")

# Path to your PDF
pdf_path = Path("D:/Research/LLM4Causal/tests/data/sciadv.abo6043.pdf")  # Replace with your PDF path
text_path = Path("tests/data/text.txt")

In [3]:
with open(text_path, 'r', encoding='utf-8') as f:
    content = f.read()
paragraphs = [p.strip() for p in content.split("\n\n") if p.strip()]

print(paragraphs[:3])  # show first 3 paragraphs

['Modern genomics combined with advanced bioinformatics methodologies allow us to understand much more about complex living systems than was ever previously possible. In the realm of human biology, for instance, recent developments have given us the ability to pinpoint the genes influencing diseases such as cancers. One area where these novel technologies can be anticipated to exert a huge impact but have thus far remained underused is the study of structural biomaterials. Spider silk is a prime example of an extended phenotype, whose extraordinary mechanical properties are governed by the underlying composition and structure of protein building blocks called spidroins.', 'All spiders use silk for various critical purposes, including foraging, locomotion, nesting, mating, egg protection, and communication  (1) . Different types of threads are used for diverse purposes, each produced in specific glands in the abdomen  (2) . For example, orb-weaving spiders use up to seven different type

In [4]:
from concurrent.futures import ThreadPoolExecutor, as_completed

MAX_WORKERS = 50  # tune to your rate limits

def extract_entities_sync(paragraph):
    # Sync call
    return entity_recognizer.extract_entities(paragraph)

def extract_relations_sync(paragraph, all_entities):
    # Sync call
    return relation_extractor.extract_relations(paragraph, all_entities)


total = len(paragraphs)

# ---------- Phase 1: Entities ----------
print(f"[Phase 1] Extracting entities from {total} paragraphs...")
all_entities = []
seen = set()
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
    futures = {ex.submit(extract_entities_sync, p): i for i, p in enumerate(paragraphs)}
    for idx, fut in enumerate(as_completed(futures), 1):
        try:
            ents = fut.result()
        except Exception as e:
            print(f"⚠️ Error in entity extraction: {e}")
            continue

        for ent in ents:
            key = ent.text.lower()
            if key not in seen:
                seen.add(key)
                all_entities.append(ent)

        if idx % 10 == 0 or idx == total:  # print every 10 completions
            print(f"  - Processed {idx}/{total} paragraphs for entities")

print(f"[Phase 1 Done] Total unique entities: {len(all_entities)}")


[Phase 1] Extracting entities from 43 paragraphs...
  - Processed 10/43 paragraphs for entities
  - Processed 10/43 paragraphs for entities
  - Processed 20/43 paragraphs for entities
  - Processed 20/43 paragraphs for entities
  - Processed 30/43 paragraphs for entities
  - Processed 30/43 paragraphs for entities
  - Processed 40/43 paragraphs for entities
  - Processed 40/43 paragraphs for entities
  - Processed 43/43 paragraphs for entities
[Phase 1 Done] Total unique entities: 287
  - Processed 43/43 paragraphs for entities
[Phase 1 Done] Total unique entities: 287


In [8]:
total = len(paragraphs)
# ---------- Phase 2: Relations ----------
print(f"[Phase 2] Extracting relations from {total} paragraphs...")
rel_set = set()
all_relationships = []
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
    futures = {ex.submit(extract_relations_sync, p, all_entities): i for i, p in enumerate(paragraphs)}
    for idx, fut in enumerate(as_completed(futures), 1):
        try:
            rels = fut.result()
        except Exception as e:
            print(f"⚠️ Error in relation extraction: {e}")
            continue

        for rel in rels:
            key = (rel.subject.text, rel.object.text, rel.relation_type)
            if key not in rel_set:
                rel_set.add(key)
                all_relationships.append(rel)

        if idx % 10 == 0 or idx == total:
            print(f"  - Processed {idx}/{total} paragraphs for relations")

print(f"[Phase 2 Done] Total unique relationships: {len(all_relationships)}")


[Phase 2] Extracting relations from 43 paragraphs...
  - Processed 10/43 paragraphs for relations
  - Processed 10/43 paragraphs for relations
  - Processed 20/43 paragraphs for relations
  - Processed 20/43 paragraphs for relations
  - Processed 30/43 paragraphs for relations
  - Processed 30/43 paragraphs for relations
⚠️ Error in relation extraction: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-pcx0SC2kFfkxTUuztFPkUdSR on tokens per min (TPM): Limit 30000, Used 30000, Requested 4292. Please try again in 8.584s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
⚠️ Error in relation extraction: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-pcx0SC2kFfkxTUuztFPkUdSR on tokens per min (TPM): Limit 30000, Used 30000, Requested 4261. Please try again in 8.522s. Visit https://platform.openai.com/account/rate-limits t

In [6]:
# # Initialize empty lists to store unique entities and relationships
# all_entities = []
# all_relationships = []
# entity_texts = set()  # To track unique entity texts

# print("Processing paragraphs for entities and relationships...")
# for i, paragraph in enumerate(paragraphs, 1):
#     print(f"\nProcessing paragraph {i}/{len(paragraphs)}...")
    
#     # Extract entities from current paragraph
#     new_entities = entity_recognizer.extract_entities(paragraph)
    
#     # Add only new entities
#     entities_added = False
#     for entity in new_entities:
#         if entity.text.lower() not in entity_texts:
#             entity_texts.add(entity.text.lower())
#             all_entities.append(entity)
#             entities_added = True
    
#     # Extract relationships using all known entities
#     new_relationships = relation_extractor.extract_relations(paragraph, all_entities)
    
#     # Add new relationships (checking for duplicates based on subject, object, and type)
#     relationships_added = False
#     for rel in new_relationships:
#         # Create a unique key for the relationship
#         rel_key = (rel.subject.text, rel.object.text, rel.relation_type)
#         if not any(
#             (r.subject.text, r.object.text, r.relation_type) == rel_key 
#             for r in all_relationships
#         ):
#             all_relationships.append(rel)
#             relationships_added = True
    
#     # If there were updates, print the current state
#     if entities_added or relationships_added:
#         print("\nCurrent Entities:")
#         for entity in all_entities:
#             print(f"- {entity.text} ({entity.type.value})")
        
#         print("\nCurrent Relationships:")
#         if all_relationships:
#             for rel in all_relationships:
#                 print(f"- {rel}")
#         else:
#             print("No relationships found yet")

In [9]:
import json

print(f"\nFinal Results:")
print(f"Total unique entities: {len(all_entities)}")
print(f"Total unique relationships: {len(all_relationships)}")

# Create complete dictionaries for entities and relationships
entities_dict = [
    {
        "id": entity.id,
        "text": entity.text,
        "type": entity.type.value,
        "aliases": entity.aliases,
        "metadata": entity.metadata
    } 
    for entity in all_entities
]

relationships_dict = [
    {
        "subject": {
            "id": rel.subject.id,
            "text": rel.subject.text,
            "type": rel.subject.type.value
        },
        "object": {
            "id": rel.object.id,
            "text": rel.object.text,
            "type": rel.object.type.value
        },
        "relation_type": rel.relation_type.value,
        "polarity": rel.polarity,
        "confidence": rel.confidence,
        "evidence": rel.evidence,
        "metadata": rel.metadata
    }
    for rel in all_relationships
] if all_relationships else []

# Save entities to JSON file with complete information
with open('entities.json', 'w', encoding='utf-8') as f:
    json.dump(entities_dict, f, indent=4, ensure_ascii=False)

# Save relationships to JSON file with complete information
with open('relationships.json', 'w', encoding='utf-8') as f:
    json.dump(relationships_dict, f, indent=4, ensure_ascii=False)

print("\nFiles saved successfully with complete data structures!")


Final Results:
Total unique entities: 287
Total unique relationships: 46

Files saved successfully with complete data structures!
