In [None]:
from pathlib import Path
import sys

from dotenv import load_dotenv
import os

In [None]:
load_dotenv()  # Load variables from a local .env file into the process environment
api_key = os.getenv('OPENAI_API_KEY')  # None when the variable is not defined

# Add the src directory to the Python path.
# Resolve it to an absolute path so the entry stays valid if the working directory
# changes later, and skip the append when the entry is already present so re-running
# this cell does not stack duplicate entries on sys.path.
src_path = Path("src").resolve()
if str(src_path) not in sys.path:
    sys.path.append(str(src_path))

from src.matsci_llm_causality.extraction.pdf import PDFProcessor
from src.matsci_llm_causality.models import create_model

In [None]:
# Initialize components
pdf_processor = PDFProcessor()
entity_recognizer = create_model("scibert")
relation_extractor = create_model("gpt-5-nano-2025-08-07")

# Path to your PDF, relative to the repository root (the same working directory that
# the relative "src" path entry relies on) instead of one machine's absolute D:/ path.
pdf_path = Path("tests/data/sciadv.abo6043.pdf")  # Replace with your PDF path
if not pdf_path.is_file():
    # Fail here with an actionable message rather than deep inside the PDF extractor.
    raise FileNotFoundError(
        f"PDF not found at {pdf_path.resolve()}; "
        "run the notebook from the repository root or edit pdf_path."
    )

In [None]:
# 1. Extract text from PDF
# `text` is the input consumed by the entity- and relation-extraction cells that follow.
print("Extracting text from PDF...")
text = pdf_processor.extract_text(pdf_path)
# Report the size of the extracted value as a quick sanity check.
print(f"Extracted {len(text)} characters\n")

In [None]:
# Show only the beginning of the extracted text. Printing the whole document floods
# the cell output and bloats the saved notebook; raise PREVIEW_CHARS to see more.
PREVIEW_CHARS = 2000
print(text[:PREVIEW_CHARS])
if len(text) > PREVIEW_CHARS:
    print(f"\n... [{len(text) - PREVIEW_CHARS} more characters not shown]")

In [None]:
# 2. Extract entities using SciBERT
# (entity_recognizer was created with create_model("scibert") in the setup cell.)
print("Extracting entities...")
entities = entity_recognizer.extract_entities(text)
print("\nFound entities:")
# One bullet per entity: its surface text followed by the value of its type.
for entity in entities:
    print(f"- {entity.text} ({entity.type.value})")

In [None]:
# 3. Extract relationships with the relation extractor
# (created above via create_model("gpt-5-nano-2025-08-07"), not FLAN-T5).
print("\nExtracting relationships...")
result = relation_extractor.extract_relations(text)


In [None]:

# 4. Print results
print("\nExtracted relationships:")
if result.relationships:
    for rel in result.relationships:
        print(f"- {rel}")
else:
    # Nothing structured was extracted: show the raw model output for debugging.
    # The label is model-agnostic because the relation extractor in this notebook is
    # created from "gpt-5-nano-2025-08-07", not FLAN-T5.
    print("Raw model response:")
    print(result.metadata["raw_response"])

# Test Entity Recognition

In [None]:
import pytest
from pathlib import Path
import torch
from src.matsci_llm_causality.models.scibert import SciBERTEntityRecognizer
from src.matsci_llm_causality.schema import EntityType

# Test samples with known entities
# Each sample pairs an input sentence ("text") with the entities expected in it
# ("entities"); every expected entity gives the surface text and its EntityType.
TEST_SAMPLES = [
    {
        "text": "Silk fibroin exhibits increased crystallinity at higher temperatures.",
        "entities": [
            {"text": "Silk fibroin", "type": EntityType.MATERIAL},
            {"text": "crystallinity", "type": EntityType.PROPERTY},
            {"text": "temperatures", "type": EntityType.CONDITION}
        ]
    },
    {
        "text": "Beta-sheet content affects the mechanical properties through hydrogen bonding.",
        "entities": [
            {"text": "Beta-sheet content", "type": EntityType.STRUCTURE},
            {"text": "mechanical properties", "type": EntityType.PROPERTY}
        ]
    }
]

In [None]:
# GPT5EntityRecognizer is never imported in this notebook, so instantiating it by name
# raises NameError on a fresh kernel. Build the recognizer through the create_model
# factory imported in the setup cell instead.
# NOTE(review): "gpt-5-entity" is the key used by the commented-out recognizer line in
# the "Process PDF" section - confirm it is registered with create_model.
entity_recognizer = create_model("gpt-5-entity")

In [None]:
# Run the recognizer on every labelled sample and check the result against the
# expectations stored in TEST_SAMPLES (the loop previously printed the entities but
# never compared them with the expected ones).
for sample in TEST_SAMPLES:
    entities = entity_recognizer.extract_entities(sample["text"])
    print(f"Extract Entities: {entities}")

    # Compare on (lower-cased text, type) so casing differences do not count as misses.
    found_pairs = {(found.text.lower(), found.type) for found in entities}
    expected_pairs = {(item["text"].lower(), item["type"]) for item in sample["entities"]}
    missing_names = sorted(name for name, _ in expected_pairs - found_pairs)
    if missing_names:
        print(f"  Missing expected entities: {missing_names}")
    else:
        print("  All expected entities found")

# Process PDF

In [1]:
from pathlib import Path
import sys

from dotenv import load_dotenv
import os

load_dotenv()  # Load variables from a local .env file into the process environment
api_key = os.getenv('OPENAI_API_KEY')  # None when the variable is not defined

# Add the src directory to the Python path.
# Resolve it to an absolute path so the entry stays valid if the working directory
# changes later, and skip the append when the entry is already present so re-running
# this cell does not stack duplicate entries on sys.path.
src_path = Path("src").resolve()
if str(src_path) not in sys.path:
    sys.path.append(str(src_path))

from matsci_llm_causality.extraction.pdf import PDFProcessor
from matsci_llm_causality.models import create_model

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Initialize components
pdf_processor = PDFProcessor()
# entity_recognizer = create_model("gpt-5-entity")
relation_extractor = create_model("gpt-5-relation")

# Path to your PDF, relative to the repository root (the same working directory that
# the relative "src" path entry relies on) instead of one machine's absolute D:/ path.
pdf_path = Path("tests/data/sciadv.abo6043.pdf")  # Replace with your PDF path
if not pdf_path.is_file():
    # Fail here with an actionable message rather than deep inside the PDF extractor.
    raise FileNotFoundError(
        f"PDF not found at {pdf_path.resolve()}; "
        "run the notebook from the repository root or edit pdf_path."
    )

In [3]:
# 1. Extract text from PDF
# `text` is the input passed to the relation extractor further down.
print("Extracting text from PDF...")
text = pdf_processor.extract_text(pdf_path)
# Report the size of the extracted value as a quick sanity check.
print(f"Extracted {len(text)} characters\n")

Extracting text from PDF...
Extracted 79998 characters



In [4]:
# # 2. Extract entities (disabled for this run; the entity_recognizer line in the setup cell is commented out as well)
# print("Extracting entities...")
# entities = entity_recognizer.extract_entities(text)
# print("\nFound entities:")
# for entity in entities:
#     print(f"- {entity.text} ({entity.type.value})")


In [5]:

# 3. Extract relationships with the relation extractor
# (created above via create_model("gpt-5-relation"), not FLAN-T5).
print("\nExtracting relationships...")
relationships = relation_extractor.extract_relations(text)



Extracting relationships...


In [6]:
# Display the raw value returned by the extractor (shown as its repr).
relationships

'Spider silk[Material] positively correlates with high tensile strength[Property]\n\nSpider silk[Material] positively correlates with extensibility[Property]\n\nSpider silk[Material] positively correlates with toughness[Property]\n\nMajor ampullate spidroin (MaSp)[Material] causes high tensile strength[Property]\n\nMajor ampullate spidroin (MaSp)[Material] causes toughness[Property]\n\nMulticomponent structures with major ampullate spidroin 1 to 3 paralogs[Structure] causes high-performance dragline silks[Property]\n\nNumerous amino acid motifs[Structure] causes silk properties[Property]\n\nSupercontraction[Property] negatively correlates with industrial usefulness[Property]\n\nSequence features of MaSp2[Material] causes elasticity[Property]\n\nSequence features of MaSp2[Material] causes supercontraction[Property]\n\nPresence of MaSp3[Structure] increases toughness[Property]\n\nPolyalanine sequences[Structure] negatively correlates with supercontraction[Property]\n\nAmorphous regions[S

In [7]:
# Print the same value with its newlines rendered, one relationship statement per line.
print(relationships)

Spider silk[Material] positively correlates with high tensile strength[Property]

Spider silk[Material] positively correlates with extensibility[Property]

Spider silk[Material] positively correlates with toughness[Property]

Major ampullate spidroin (MaSp)[Material] causes high tensile strength[Property]

Major ampullate spidroin (MaSp)[Material] causes toughness[Property]

Multicomponent structures with major ampullate spidroin 1 to 3 paralogs[Structure] causes high-performance dragline silks[Property]

Numerous amino acid motifs[Structure] causes silk properties[Property]

Supercontraction[Property] negatively correlates with industrial usefulness[Property]

Sequence features of MaSp2[Material] causes elasticity[Property]

Sequence features of MaSp2[Material] causes supercontraction[Property]

Presence of MaSp3[Structure] increases toughness[Property]

Polyalanine sequences[Structure] negatively correlates with supercontraction[Property]

Amorphous regions[Structure] increases super

In [9]:
import re
import json
def parse_relationships(text: str):
    """Parse relation-extractor output into structured relationship records.

    Every non-empty line is expected to look like
    ``Subject name[Type] <relation> Object name[Type]`` where ``<relation>`` is one of
    ``increases``, ``decreases``, ``causes``, ``positively correlate(s) with`` or
    ``negatively correlate(s) with``. Lines that do not follow this shape are skipped.

    Args:
        text: Newline-separated relationship statements.

    Returns:
        A list of dicts, one per matched line, of the form
        ``{"subject": {"name", "type"}, "relationship": str, "object": {"name", "type"}}``.
        The relationship string is returned exactly as written in the line.
    """
    # "correlates?" accepts both "correlate with" and "correlates with"; without the
    # optional "s" every correlation line written as "correlates with" was dropped.
    line_pattern = re.compile(
        r"(.+?)\[(.*?)\]\s+"
        r"(increases|decreases|positively correlates? with|negatively correlates? with|causes)"
        r"\s+(.+?)\[(.*?)\]$"
    )

    results = []
    for raw_line in text.split("\n"):
        line = raw_line.strip()
        if not line:
            continue  # blank separator between statements

        match = line_pattern.match(line)
        if match is None:
            continue  # not a "Subject[Type] relation Object[Type]" statement

        subject_name, subject_type, relation, object_name, object_type = match.groups()
        results.append({
            "subject": {"name": subject_name.strip(), "type": subject_type.strip()},
            "relationship": relation.strip(),
            "object": {"name": object_name.strip(), "type": object_type.strip()}
        })
    return results

# Structured view of the raw extractor output; lines the parser's pattern does not
# match are left out of `parsed`.
parsed = parse_relationships(relationships)

In [None]:

# 4. Print results
# The extractor output displayed above is a single newline-separated string, so
# iterating over it directly would print one character per bullet. Split it into
# statements first; any other iterable of relationships is printed item by item.
if isinstance(relationships, str):
    relationship_items = [line.strip() for line in relationships.split("\n") if line.strip()]
else:
    relationship_items = list(relationships or [])

print("\nExtracted relationships:")
if relationship_items:
    for rel in relationship_items:
        print(f"- {rel}")
else:
    print("No Relationships")

In [None]:
import json

# Build JSON-serializable records for relationships and entities.
#
# In this section `relationships` is the extractor's raw string, not a list of objects
# with .subject/.object attributes, so the structured records are taken from `parsed`
# (the output of parse_relationships), which is already plain dict/str data.
relationships_dict = list(parsed)

# The entity-extraction cell is commented out in this run, so there is no fresh
# `entities` list to serialize. Collect the unique (name, type) pairs that occur as a
# subject or object instead, keeping first-seen order.
# NOTE(review): ids, aliases, confidence, evidence and metadata are not present in the
# text format handled by parse_relationships, so they are not written here.
unique_entities = {}
for record in parsed:
    for role in ("subject", "object"):
        item = record[role]
        unique_entities.setdefault(
            (item["name"], item["type"]),
            {"text": item["name"], "type": item["type"]},
        )
entities_dict = list(unique_entities.values())

# Save entities to JSON file
with open('entities.json', 'w', encoding='utf-8') as f:
    json.dump(entities_dict, f, indent=4, ensure_ascii=False)

# Save relationships to JSON file
with open('relationships.json', 'w', encoding='utf-8') as f:
    json.dump(relationships_dict, f, indent=4, ensure_ascii=False)

print("Files saved successfully with complete data structures!")
print(f"Saved {len(entities_dict)} entities and {len(relationships_dict)} relationships.")