In [1]:
import os
import sys
from pathlib import Path

from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')  # None when the variable is not set

# Make the project's src/ directory importable.
# The path is resolved to an absolute one so it stays valid if the working
# directory changes later, and the membership check keeps re-runs of this
# cell from stacking duplicate entries on sys.path.
src_path = str(Path("src").resolve())
if src_path not in sys.path:
    sys.path.append(src_path)

# Placed after the sys.path tweak so the package can be found under src/
# (presumably it is not installed into the environment -- confirm).
from matsci_llm_causality.extraction.pdf import PDFProcessor
from matsci_llm_causality.models import create_model

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Initialize components
pdf_processor = PDFProcessor()
relation_extractor = create_model("gpt-5-relation")

# Input files, expressed relative to the working directory so the notebook is
# not tied to one machine's drive layout.
# NOTE(review): assumes the notebook is run from the repository root, which is
# what the relative text path below already requires -- confirm.
pdf_path = Path("tests/data/sciadv.abo6043.pdf")  # Replace with your PDF path
text_path = Path("tests/data/text.txt")

In [3]:
# Read the whole text file into memory as UTF-8.
with text_path.open('r', encoding='utf-8') as f:
    content = f.read()

# Chunks separated by a blank line are treated as paragraphs; each chunk is
# trimmed and chunks that end up empty are dropped.
paragraphs = list(filter(None, map(str.strip, content.split("\n\n"))))

print(paragraphs[:3])  # show first 3 paragraphs

['Modern genomics combined with advanced bioinformatics methodologies allow us to understand much more about complex living systems than was ever previously possible. In the realm of human biology, for instance, recent developments have given us the ability to pinpoint the genes influencing diseases such as cancers. One area where these novel technologies can be anticipated to exert a huge impact but have thus far remained underused is the study of structural biomaterials. Spider silk is a prime example of an extended phenotype, whose extraordinary mechanical properties are governed by the underlying composition and structure of protein building blocks called spidroins.', 'All spiders use silk for various critical purposes, including foraging, locomotion, nesting, mating, egg protection, and communication  (1) . Different types of threads are used for diverse purposes, each produced in specific glands in the abdomen  (2) . For example, orb-weaving spiders use up to seven different type

In [4]:
import re
import json
# One relation per line: <subject>[<type>] <relation> <object>[<type>]
# Compiled once at definition time instead of on every line.
_RELATION_LINE = re.compile(
    r"(.+?)\[(.*?)\]\s+"
    r"(increases|decreases|positively correlate with|negatively correlate with|causes)"
    r"\s+(.+?)\[(.*?)\]$",
    re.IGNORECASE,
)

# Leading list decoration such as "- ", "* ", "\u2022 ", "1. " or "2) ".
_LIST_MARKER = re.compile(r"^(?:[-*\u2022]\s+|\d+[.)]\s+)")


def parse_relationships(text: str):
    """Parse line-oriented relation statements into structured records.

    Each non-empty line is expected to look like::

        SubjectName[SubjectType] relation ObjectName[ObjectType]

    where ``relation`` is one of: increases, decreases,
    positively correlate with, negatively correlate with, causes.

    Compared with a strict match, the parser tolerates three common
    formatting variations so valid relations are not silently lost or
    polluted:

    * a leading list marker ("- ", "* ", "1. ", "2) ") is removed instead of
      becoming part of the subject name;
    * trailing ".", ",", or ";" after the closing bracket is ignored;
    * the relation verb is matched case-insensitively and returned in
      lower case.

    Args:
        text: Raw multi-line text. ``None`` or an empty string yields ``[]``.

    Returns:
        A list of dicts shaped as
        ``{"subject": {"name", "type"}, "relationship", "object": {"name", "type"}}``
        in the order the lines appear. Lines that do not match are skipped.
    """
    if not text:
        return []

    results = []
    for raw_line in text.split("\n"):
        line = raw_line.strip()
        if not line:
            continue

        # Drop list decoration and trailing punctuation before matching.
        line = _LIST_MARKER.sub("", line, count=1).rstrip(" \t.,;")

        match = _RELATION_LINE.match(line)
        if not match:
            continue

        subject_name, subject_type, relation, object_name, object_type = match.groups()
        results.append({
            "subject": {"name": subject_name.strip(), "type": subject_type.strip()},
            # Normalised so "Increases" and "increases" compare equal downstream.
            "relationship": relation.strip().lower(),
            "object": {"name": object_name.strip(), "type": object_type.strip()}
        })
    return results

In [5]:
# Accumulators for the whole document.
all_relationships = []
# Entities are tracked as (name, type) tuples. Tuples are hashable, so the set
# still removes duplicates, and unlike a "name|type" string they cannot be
# corrupted by a "|" character inside an entity name.
entity_keys = set()

# Process each paragraph
print("Processing paragraphs...")
for i, paragraph in enumerate(paragraphs):
    print(f"\nProcessing paragraph {i+1}/{len(paragraphs)}")

    # Run the relation model on this paragraph.
    # NOTE(review): the result is passed straight to parse_relationships, so
    # it is presumably the model's raw text output -- confirm.
    result = relation_extractor.extract_relations(paragraph)

    # Parse the relationships from the model output
    parsed_rels = parse_relationships(result)

    # Add relationships to our list
    all_relationships.extend(parsed_rels)

    # Record both endpoints of every relationship as entities
    for rel in parsed_rels:
        for role in ("subject", "object"):
            entity_keys.add((rel[role]["name"], rel[role]["type"]))

    # Print progress
    if parsed_rels:
        print(f"Found {len(parsed_rels)} relationships")
    else:
        print("No relationships found in this paragraph")

# Convert the unique entities to dictionary format. Sorting gives a stable
# order across runs; plain set iteration order is not reproducible.
all_entities = [
    {"name": name, "type": entity_type}
    for name, entity_type in sorted(entity_keys)
]

print("\nExtraction complete!")
print(f"Total relationships found: {len(all_relationships)}")
print(f"Total unique entities found: {len(all_entities)}")

Processing paragraphs...

Processing paragraph 1/43
No relationships found in this paragraph

Processing paragraph 2/43
No relationships found in this paragraph

Processing paragraph 2/43
Found 14 relationships

Processing paragraph 3/43
Found 14 relationships

Processing paragraph 3/43
Found 4 relationships

Processing paragraph 4/43
Found 4 relationships

Processing paragraph 4/43
Found 6 relationships

Processing paragraph 5/43
Found 6 relationships

Processing paragraph 5/43
No relationships found in this paragraph

Processing paragraph 6/43
No relationships found in this paragraph

Processing paragraph 6/43
No relationships found in this paragraph

Processing paragraph 7/43
No relationships found in this paragraph

Processing paragraph 7/43
Found 1 relationships

Processing paragraph 8/43
Found 1 relationships

Processing paragraph 8/43
No relationships found in this paragraph

Processing paragraph 9/43
No relationships found in this paragraph

Processing paragraph 9/43
Found 1 re

In [6]:

# Persist both result collections as pretty-printed UTF-8 JSON files in the
# current working directory. ensure_ascii=False writes non-ASCII characters
# as-is rather than as \u escapes.
for out_name, payload in (
    ('entities.json', all_entities),
    ('relationships.json', all_relationships),
):
    with open(out_name, 'w', encoding='utf-8') as f:
        json.dump(payload, f, indent=4, ensure_ascii=False)

print("\nFiles saved successfully with complete data structures!")


Files saved successfully with complete data structures!
