# In [None]:
import os
import json
import re
import ollama
from PyPDF2 import PdfReader
# Configuration
# Model name/tag handed to ollama.generate() in process_paper().
MODEL_NAME = "llama3.2:latest"
# Folder that the __main__ driver scans for input papers.
PAPERS_DIR = "C:\\Users\\wilda\\Downloads\\LLM Paper folder"   # folder containing PDFs
# Folder that receives the generated "<paper file name>_kg.json" files
# (currently the same folder as the inputs).
OUTPUT_DIR = "C:\\Users\\wilda\\Downloads\\LLM Paper folder"
# Runs at import time: make sure the output folder exists before any write.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Few-shot examples rendered into the prompt by build_fewshot_prompt().
# Each entry pairs a one-sentence "abstract" with the expected "output" graph:
#   - "nodes": entities, each with an "id", a "type" and a human-readable "label"
#   - "relationships": directed edges whose "source"/"target" are node ids
# Node ids are local to each example (both examples start again at "n1").
FEW_SHOT_EXAMPLES = [
    {
        "abstract": "Polyethylene microplastics were found in river water and caused oxidative stress in fish gills.",
        "output": {
            "nodes": [
                {"id": "n1", "type": "Polymer", "label": "Polyethylene Microplastics"},
                {"id": "n2", "type": "Medium", "label": "River Water"},
                {"id": "n3", "type": "Organism", "label": "Fish"},
                {"id": "n4", "type": "Effect", "label": "Oxidative Stress"},
                {"id": "n5", "type": "Organ", "label": "Fish Gills"}
            ],
            "relationships": [
                {"source": "n1", "relation_type": "Detected_In", "target": "n2"},
                {"source": "n1", "relation_type": "Causes", "target": "n4"},
                {"source": "n4", "relation_type": "Occurs_In", "target": "n5"},
                {"source": "n5", "relation_type": "Belongs_To", "target": "n3"}
            ]
        }
    },
    {
        "abstract": "Polystyrene nanoplastics reduced the viability of neural stem cells and inhibited cell differentiation.",
        "output": {
            "nodes": [
                {"id": "n1", "type": "Polymer", "label": "Polystyrene Nanoplastics"},
                {"id": "n2", "type": "Cell", "label": "Neural Stem Cells"},
                {"id": "n3", "type": "Effect", "label": "Reduced Viability"},
                {"id": "n4", "type": "Process", "label": "Cell Differentiation"}
            ],
            "relationships": [
                {"source": "n1", "relation_type": "Causes", "target": "n3"},
                {"source": "n1", "relation_type": "Inhibits", "target": "n4"},
                {"source": "n3", "relation_type": "Occurs_In", "target": "n2"}
            ]
        }
    }
]

# Helper functions
def read_paper(file_path):
    """Return the text of a paper with surrounding whitespace removed.

    A file whose name ends in ``.pdf`` (compared case-insensitively, so
    ``.PDF`` works too) is parsed with PyPDF2 and the text of all pages is
    joined with newlines. Any other file is read as UTF-8 plain text.

    Args:
        file_path: Location of the paper, as a ``str`` or any ``os.PathLike``
            object such as ``pathlib.Path``.

    Returns:
        The extracted text, stripped of leading and trailing whitespace.
    """
    # Normalise to a plain string so Path objects can be used with the
    # string methods below.
    path = os.fspath(file_path)
    if path.lower().endswith(".pdf"):
        reader = PdfReader(path)
        # `or ""` keeps the join safe when a page yields no text.
        text = "\n".join(page.extract_text() or "" for page in reader.pages)
    else:
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()
    return text.strip()

def extract_sections(text):
    """Roughly split paper text into an abstract and a main body.

    The abstract is the text that follows the first "abstract" or "summary"
    keyword (any letter case, optionally followed by ``:`` or ``-``) up to
    the first line break that is followed by an uppercase letter.

    The body starts at the "Introduction" heading. A line that begins with
    an optionally numbered heading ("Introduction", "1. Introduction",
    "I. Introduction") is preferred; if there is none, the first occurrence
    of the word anywhere in the text is used. Without any match the body is
    the whole text.

    This is a heuristic for text extracted from PDFs, not a real parser.

    Args:
        text: Full text of the paper.

    Returns:
        A ``(abstract, body)`` tuple of strings; ``abstract`` is ``""`` when
        no abstract could be located.
    """
    abstract = ""
    body = text

    # Only the keyword is case-insensitive. A global (?i) would also make
    # the [A-Z] lookahead accept lowercase letters, which cut the abstract
    # off at its very first line break.
    abs_match = re.search(
        r"(?i:abstract|summary)\s*[:\-]?\s*(.*?)\n(?=[A-Z])", text, re.S
    )
    if abs_match:
        abstract = abs_match.group(1).strip()

    # Prefer a real heading at the start of a line so that the word
    # "introduction" used inside the abstract is not taken as the boundary.
    heading = re.search(
        r"(?m)^[ \t]*(?:(?:\d+|[IVX]+)[.)]?[ \t]+)?((?i:introduction))\b", text
    )
    if heading:
        # Start at the word itself, dropping any leading section number.
        body = text[heading.start(1):]
    else:
        # Fallback: first occurrence of the word anywhere in the text.
        intro_match = re.search(r"(?i)introduction", text)
        if intro_match:
            body = text[intro_match.start():]
    return abstract, body

def build_fewshot_prompt(few_shots):
    """Render the few-shot examples as one prompt fragment.

    Every example becomes an ``Abstract:`` section followed by an
    ``Output JSON:`` section holding the expected graph pretty-printed with
    a two-space indent; each example ends with a blank line.

    Args:
        few_shots: Iterable of dicts with an ``"abstract"`` string and a
            JSON-serialisable ``"output"`` value.

    Returns:
        The concatenated examples, or ``""`` when there are none.
    """
    rendered = []
    for shot in few_shots:
        graph_json = json.dumps(shot["output"], indent=2)
        rendered.append(
            f"Abstract:\n{shot['abstract']}\nOutput JSON:\n{graph_json}\n\n"
        )
    return "".join(rendered)


# Main processing function
def process_paper(file_path, model_name=MODEL_NAME, max_body_chars=4000):
    """Generate a knowledge graph for one paper and save it to OUTPUT_DIR.

    The paper is read with read_paper(), split with extract_sections(), and
    combined with the few-shot examples into a single prompt. The model's
    response is written unchanged to
    ``OUTPUT_DIR/<paper file name>_kg.json``.

    Args:
        file_path: Path of the paper to process.
        model_name: Ollama model name/tag to query.
        max_body_chars: Maximum number of body characters placed in the
            prompt, to keep it within the model's context. ``None`` sends
            the whole body.

    Returns:
        None. The result is written to disk and progress is printed.
    """
    text = read_paper(file_path)
    abstract, body = extract_sections(text)

    few_shot_prompt = build_fewshot_prompt(FEW_SHOT_EXAMPLES)
    # Truncate here rather than inside the template: anything written inside
    # the f-string, including a "# comment", is sent to the model verbatim.
    body_excerpt = body[:max_body_chars]
    prompt = f"""
You are building a microplastics knowledge graph from scientific data.

Here are some examples:
{few_shot_prompt}

Now, analyze the following research paper.

Abstract:
{abstract}

Full Text:
{body_excerpt}

Instructions:
1. Extract scientific entities (nodes) such as pollutants, materials, organisms, organs, biological processes, and molecular pathways.
2. Create relationships that connect these entities logically (e.g., 'causes', 'affects', 'detected_in', 'regulates').
3. You may create new node or relation types if they naturally emerge from the paper.
4. Use ONLY information stated or implied in the paper — no external knowledge.
5. Output a single JSON object:
{{
  "nodes": [{{"id": "...", "type": "...", "label": "...", "attributes": {{...}}}}],
  "relationships": [{{"source": "...", "relation_type": "...", "target": "..."}}]
}}
    """

    print(f"Processing: {os.path.basename(file_path)} ...")
    # format="json" asks Ollama to constrain the reply to JSON, so the saved
    # *_kg.json file is not wrapped in prose or markdown fences.
    response = ollama.generate(
        model=model_name, prompt=prompt, stream=False, format="json"
    )
    output = response["response"]

    out_file = os.path.join(OUTPUT_DIR, os.path.basename(file_path) + "_kg.json")
    with open(out_file, "w", encoding="utf-8") as f:
        f.write(output)
    print(f"Saved knowledge graph to: {out_file}")

# Script entry point: process the first five ".pdf" files found in PAPERS_DIR
# (in the order os.listdir() returns them).
if __name__ == "__main__":
    pdf_names = [name for name in os.listdir(PAPERS_DIR) if name.endswith(".pdf")]
    for pdf_name in pdf_names[:5]:
        process_paper(os.path.join(PAPERS_DIR, pdf_name))
