In [None]:
from pathlib import Path
import sys

from dotenv import load_dotenv
import os

In [None]:
load_dotenv()  # This loads the variables from .env
api_key = os.getenv('OPENAI_API_KEY')  # None when the variable is not set

# Add the src directory to Python path.
# The entry is relative, so it is resolved against the kernel's current
# working directory at import time.
src_path = "src"
if src_path not in sys.path:  # re-running this cell must not stack duplicate entries
    sys.path.append(src_path)

# Import relative to src/ (the directory added above). This matches the
# import style used by the "Process PDF" section of this notebook and avoids
# loading the same package under both `src.matsci_llm_causality` and
# `matsci_llm_causality`.
from matsci_llm_causality.extraction.pdf import PDFProcessor
from matsci_llm_causality.models import create_model

In [None]:
# Initialize components
pdf_processor = PDFProcessor()
entity_recognizer = create_model("scibert")
relation_extractor = create_model("gpt-5-nano-2025-08-07")

# Path to your PDF.
# Set MATSCI_PDF_PATH (in the environment, or in the .env file that
# load_dotenv() reads) to process a different document. The original
# hard-coded location is kept as the fallback so existing setups still work;
# `or` also covers the variable being set to an empty string.
DEFAULT_PDF_PATH = "D:/Research/LLM4Causal/tests/data/sciadv.abo6043.pdf"
pdf_path = Path(os.getenv("MATSCI_PDF_PATH") or DEFAULT_PDF_PATH)

In [None]:
# 1. Extract text from PDF
# Uses `pdf_processor` and `pdf_path` from the initialization cell; the result
# is stored in `text` for the entity and relationship steps that follow.
print("Extracting text from PDF...")
text = pdf_processor.extract_text(pdf_path)
# len() is reported as a character count, so `text` is presumably a str — TODO confirm.
print(f"Extracted {len(text)} characters\n")

In [None]:
# Show only the beginning of the extracted text. The full document is large
# (a run recorded later in this notebook reports 79998 characters) and
# printing all of it buries the rest of the notebook. Raise PREVIEW_CHARS to
# see more.
# Assumes `text` supports slicing and len() like a str — TODO confirm.
PREVIEW_CHARS = 2000
print(text[:PREVIEW_CHARS])
if len(text) > PREVIEW_CHARS:
    print(f"\n... [{len(text) - PREVIEW_CHARS} more characters not shown]")

In [None]:
# 2. Entity extraction with the recognizer created from the "scibert" model name
print("Extracting entities...")
entities = entity_recognizer.extract_entities(text)

# One bullet per entity: its surface text followed by the entity type's value.
print("\nFound entities:")
for entity in entities:
    bullet = f"- {entity.text} ({entity.type.value})"
    print(bullet)

In [None]:
# 3. Extract relationships from the full document text.
# `relation_extractor` is created from the model name "gpt-5-nano-2025-08-07"
# in the initialization cell, so this step is not FLAN-T5 specific.
# The returned object is read later via `.relationships` and `.metadata`.
print("\nExtracting relationships...")
result = relation_extractor.extract_relations(text)


In [None]:

# 4. Print results
print("\nExtracted relationships:")
if result.relationships:
    for rel in result.relationships:
        print(f"- {rel}")
else:
    # Nothing was parsed, so fall back to the raw model output for debugging.
    # The label is model-agnostic: the extractor in this notebook is created
    # from a GPT model name, not FLAN-T5.
    print("Raw model response:")
    # .get() avoids a KeyError when no raw response was recorded.
    # Assumes `result.metadata` is a dict-like mapping — TODO confirm.
    print(result.metadata.get("raw_response", "<no raw response recorded>"))

# Test Entity Recognition

In [None]:
import pytest
from pathlib import Path
import torch
from src.matsci_llm_causality.models.scibert import SciBERTEntityRecognizer
from src.matsci_llm_causality.models.llm.gpt import GPT5EntityRecognizer
from src.matsci_llm_causality.schema import EntityType

# Labelled sentences for checking recognizer output by eye: each sample pairs
# a sentence with the (surface text, EntityType) annotations expected from it.
TEST_SAMPLES = [
    {
        "text": sentence,
        "entities": [
            {"text": surface, "type": entity_type}
            for surface, entity_type in annotations
        ],
    }
    for sentence, annotations in (
        (
            "Silk fibroin exhibits increased crystallinity at higher temperatures.",
            (
                ("Silk fibroin", EntityType.MATERIAL),
                ("crystallinity", EntityType.PROPERTY),
                ("temperatures", EntityType.CONDITION),
            ),
        ),
        (
            "Beta-sheet content affects the mechanical properties through hydrogen bonding.",
            (
                ("Beta-sheet content", EntityType.STRUCTURE),
                ("mechanical properties", EntityType.PROPERTY),
            ),
        ),
    )
]

In [None]:
# Rebinds `entity_recognizer` (earlier in this notebook it holds the result of
# create_model("scibert")) to a GPT5EntityRecognizer built with its default
# constructor arguments. Cells run after this one see the new object.
entity_recognizer = GPT5EntityRecognizer()

In [None]:
# Run the recognizer on every labelled sentence and show what it returned.
# Note: `entities` is overwritten on each iteration, so only the last
# sample's result remains bound after the loop.
for sample in TEST_SAMPLES:
    sentence = sample["text"]
    entities = entity_recognizer.extract_entities(sentence)
    print(f"Extract Entities: {entities}")

# Process PDF

In [1]:
from pathlib import Path
import sys

from dotenv import load_dotenv
import os

load_dotenv()  # This loads the variables from .env
api_key = os.getenv('OPENAI_API_KEY')  # None when the variable is not set

# Add the src directory to Python path so the package below can be imported
# without the `src.` prefix. The entry is relative, so it is resolved against
# the kernel's current working directory at import time.
src_path = "src"
if src_path not in sys.path:  # re-running this cell must not stack duplicate entries
    sys.path.append(src_path)

from matsci_llm_causality.extraction.pdf import PDFProcessor
from matsci_llm_causality.models import create_model

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Initialize components
pdf_processor = PDFProcessor()
entity_recognizer = create_model("gpt-5-entity")
relation_extractor = create_model("gpt-5-relation")

# Path to your PDF.
# Set MATSCI_PDF_PATH (in the environment, or in the .env file that
# load_dotenv() reads) to process a different document. The original
# hard-coded location is kept as the fallback so existing setups still work;
# `or` also covers the variable being set to an empty string.
DEFAULT_PDF_PATH = "D:/Research/LLM4Causal/tests/data/sciadv.abo6043.pdf"
pdf_path = Path(os.getenv("MATSCI_PDF_PATH") or DEFAULT_PDF_PATH)

In [None]:
# 1. Extract text from PDF
# Reads the file at `pdf_path` through `pdf_processor` (both set in the
# initialization cell) and keeps the result in `text` for steps 2 and 3.
print("Extracting text from PDF...")
text = pdf_processor.extract_text(pdf_path)
# len() is reported as a character count, so `text` is presumably a str — TODO confirm.
print(f"Extracted {len(text)} characters\n")

Extracting text from PDF...
Extracted 79998 characters



In [4]:
# 2. Extract entities with the recognizer created from the "gpt-5-entity"
#    model name in the initialization cell.
print("Extracting entities...")
entities = entity_recognizer.extract_entities(text)

# List each entity as "- <text> (<type value>)".
print("\nFound entities:")
for entity in entities:
    entity_line = f"- {entity.text} ({entity.type.value})"
    print(entity_line)


Extracting entities...
ChatCompletion(id='chatcmpl-CDxqEQpCEniLsEqewLtmqONxzY49R', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='text: Spider silks  \ntype: MATERIAL  \nstart: 0  \nend: 13  \nconfidence: 0.99  \n\ntext: renewable  \ntype: PROPERTY  \nstart: 85  \nend: 94  \nconfidence: 0.85  \n\ntext: biodegradable  \ntype: PROPERTY  \nstart: 96  \nend: 109  \nconfidence: 0.85  \n\ntext: sustainable  \ntype: PROPERTY  \nstart: 111  \nend: 122  \nconfidence: 0.85  \n\ntext: biopolymers  \ntype: MATERIAL  \nstart: 123  \nend: 134  \nconfidence: 0.90', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None, annotations=[]))], created=1757444330, model='o4-mini-2025-04-16', object='chat.completion', service_tier='default', system_fingerprint=None, usage=CompletionUsage(completion_tokens=3602, prompt_tokens=23824, total_tokens=27426, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0,

In [5]:

# 3. Extract relationships.
# `relation_extractor` is created from the model name "gpt-5-relation" in the
# initialization cell (not FLAN-T5). The entities found in step 2 are passed
# along with the full text; the result is iterated as a collection of
# relationships in the cells that follow.
print("\nExtracting relationships...")
relationships = relation_extractor.extract_relations(text, entities)



Extracting relationships...


In [6]:

# 4. Report the extracted relationships, one bullet per relationship.
print("\nExtracted relationships:")
if not relationships:
    # Covers both None and an empty collection.
    print("No Relationships")
else:
    for rel in relationships:
        print(f"- {rel}")


Extracted relationships:
- subject=Entity(id='entity_0', text='Spider silks', type=<EntityType.MATERIAL: 'material'>, aliases=[], metadata={'start_char': 0, 'end_char': 13, 'confidence': 0.99}) object=Entity(id='entity_1', text='renewable', type=<EntityType.PROPERTY: 'property'>, aliases=[], metadata={'start_char': 85, 'end_char': 94, 'confidence': 0.85}) relation_type=<RelationType.CORRELATES: 'correlates_with'> polarity=1 confidence=0.9 evidence='“Spider silks are among the toughest known materials and thus provide models for renewable, biodegradable, and sustainable biopolymers.”' metadata={}
- subject=Entity(id='entity_0', text='Spider silks', type=<EntityType.MATERIAL: 'material'>, aliases=[], metadata={'start_char': 0, 'end_char': 13, 'confidence': 0.99}) object=Entity(id='entity_2', text='biodegradable', type=<EntityType.PROPERTY: 'property'>, aliases=[], metadata={'start_char': 96, 'end_char': 109, 'confidence': 0.85}) relation_type=<RelationType.CORRELATES: 'correlates_with'>

In [7]:
import json


def _core_entity_fields(entity):
    """Return the id, text and type value of an entity as a dict."""
    return {
        "id": entity.id,
        "text": entity.text,
        "type": entity.type.value,
    }


# Entity records: the core fields plus aliases and metadata.
entities_dict = [
    {
        **_core_entity_fields(entity),
        "aliases": entity.aliases,
        "metadata": entity.metadata,
    }
    for entity in entities
]

# Relationship records. Subject and object carry only the core entity fields
# (id, text, type); a falsy `relationships` (None or empty) gives an empty list.
relationships_dict = [
    {
        "subject": _core_entity_fields(rel.subject),
        "object": _core_entity_fields(rel.object),
        "relation_type": rel.relation_type.value,
        "polarity": rel.polarity,
        "confidence": rel.confidence,
        "evidence": rel.evidence,
        "metadata": rel.metadata,
    }
    for rel in (relationships or [])
]

# Write each collection to its own UTF-8 JSON file in the working directory,
# keeping non-ASCII characters readable (ensure_ascii=False).
for out_name, records in (
    ('entities.json', entities_dict),
    ('relationships.json', relationships_dict),
):
    with open(out_name, 'w', encoding='utf-8') as f:
        json.dump(records, f, indent=4, ensure_ascii=False)

print("Files saved successfully with complete data structures!")

Files saved successfully with complete data structures!
