# Generating DAGs from Wikipedia articles with LangChain

## Introduction
Yesterday I was bored, and I've already done some work generating protein association networks with Cytoscape and STRING.
I was wondering if this could be extended to other domains by using LLMs to generate JSON graph data.
I tried it with a paragraph from an article about the [Muslim Brotherhood](https://en.wikipedia.org/wiki/Muslim_Brotherhood), since I had to write a final paper on that for my first-year seminar at UCLA, and it yielded somewhat interesting results.
I was able to import the JSON into Cytoscape and do an analysis, but I want to investigate the feasibility of doing this on a larger scale.


## Load Wikipedia article
We'll use LangChain's document loaders to fetch Wikipedia articles.
This way, we can programmatically get Wikipedia articles and chunk them later.

In [25]:
from langchain_community.document_loaders import WikipediaLoader
import dotenv

# Load environment variables from a local .env file, if one is found
# (presumably the LLM provider's API key — confirm against your setup).
dotenv.load_dotenv()

True

In [None]:
import asyncio, time, threading
from concurrent.futures import ThreadPoolExecutor
import ipywidgets as widgets
from IPython.display import display
from ipycytoscape import CytoscapeWidget

# --- Input controls and status read-outs ---
prompt = widgets.Text(
    description='Article:',
    placeholder='Enter article…',
    layout=widgets.Layout(width='60%'),
)
run_btn = widgets.Button(
    description='Generate graph',
    icon='play',
    button_style='primary',
)
cancel_btn = widgets.Button(description='Cancel', disabled=True, icon='stop')
status = widgets.HTML(value='')
elapsed = widgets.Label(value='')
spinner = widgets.HTML(value='')

# --- Graph canvas ---
cyto = CytoscapeWidget()

# Fixed canvas size for the rendered graph.
cyto.layout.width = "1200px"
cyto.layout.height = "800px"

# Stylesheet for the graph canvas. Each entry pairs a Cytoscape selector with
# the style properties applied to the elements it matches.
cyto.set_style([
    # Nodes
    {
        'selector': 'node',
        'style': {
            'background-color': '#4a90e2',
            # Node size scales with data.weight (0..100 -> 30..60 px).
            'width': 'mapData(weight, 0, 100, 30, 60)',  # or a fixed size like '40px'
            'height': 'mapData(weight, 0, 100, 30, 60)',

            # Label
            'label': 'data(label)',          # node data carries a 'label' field
            'color': '#222',
            'font-size': '12px',
            'font-family': 'Inter, Helvetica, Arial, sans-serif',
            'text-valign': 'center',
            'text-halign': 'center',
            'text-wrap': 'wrap',             # wrap long labels
            'text-max-width': '120px',       # limit label width to prevent overflow
            'text-margin-y': '0px',
            'text-outline-width': 2,         # halo around text to improve contrast
            'text-outline-color': '#ffffff',
            'z-index-compare': 'manual',
            'z-index': 10
        }
    },

    # Node hover/selection emphasis
    {
        'selector': 'node:selected',
        'style': {
            'border-width': 3,
            'border-color': '#ff7f0e',
            'font-size': '14px',
            'text-outline-width': 3
        }
    },
    # NOTE(review): the Cytoscape.js selector reference does not appear to list
    # a ':hover' state or a 'cursor' style property, so this rule (and the
    # matching edge rule below) may be ignored by the renderer — confirm, and
    # consider ':active' or a class toggled from mouseover events instead.
    {
        'selector': 'node:hover',
        'style': {
            'border-width': 2,
            'border-color': '#ff7f0e',
            'cursor': 'pointer'
        }
    },

    # Edges
    {
        'selector': 'edge',
        'style': {
            'line-color': '#9aa4b2',
            'width': 2,
            'curve-style': 'bezier',
            'target-arrow-shape': 'triangle',
            'target-arrow-color': '#9aa4b2',

            # Edge labels: edge data in this notebook has no 'label' field,
            # only 'relation_type', so map the label to that field.
            'label': 'data(relation_type)',
            'font-size': '10px',
            'color': '#1f2937',
            'text-rotation': 'autorotate',
            'text-outline-width': 2,
            'text-outline-color': '#ffffff',
            'text-background-opacity': 0.75, # optional chip behind text
            'text-background-color': '#ffffff',
            'text-background-shape': 'roundrectangle',
            'text-background-padding': '2px',
        }
    },

    # Edge hover/selection
    {
        'selector': 'edge:selected',
        'style': {
            'width': 3,
            'line-color': '#ff7f0e',
            'target-arrow-color': '#ff7f0e',
            'font-size': '12px',
            'text-outline-width': 3
        }
    },
    {
        'selector': 'edge:hover',
        'style': {
            'width': 3,
            'line-color': '#6b7280',
            'target-arrow-color': '#6b7280',
            'cursor': 'pointer'
        }
    }
])



# Page layout, top to bottom: input row, status row, spinner line, graph canvas.
ui = widgets.VBox(children=[
    widgets.HBox(children=[prompt, run_btn, cancel_btn]),
    widgets.HBox(children=[widgets.HTML('<b>Status:</b>'), status, elapsed]),
    spinner,
    cyto,
])

# Render the assembled interface as this cell's output.
display(ui)



VBox(children=(HBox(children=(Text(value='', description='Article:', layout=Layout(width='60%'), placeholder='…

In [27]:
def show_spinner(on=True, text='Working…'):
    """Show an hourglass line containing ``text`` in the spinner widget, or blank it when ``on`` is falsy."""
    if not on:
        spinner.value = ''
        return
    spinner.value = f'<div style="color:#3182bd;">⏳ {text}</div>'


In [28]:
def load_cyjs_into_cyto(cyto_widget: "CytoscapeWidget", elements: dict):
    """Replace the graph shown in ``cyto_widget`` with the one in ``elements``.

    Args:
        cyto_widget: Widget whose ``graph`` is cleared and repopulated.
        elements: Mapping with an ``"elements"`` entry holding the Cytoscape
            JSON graph, e.g. ``{"elements": {"nodes": [...], "edges": [...]}}``.

    Raises:
        ValueError: If ``elements`` has no ``"elements"`` entry. The check runs
            before the widget is cleared, so a bad payload leaves the graph
            that is currently on screen untouched.
    """
    graph_json = elements.get("elements")
    if graph_json is None:
        raise ValueError('Graph payload is missing the "elements" key')

    cyto_widget.graph.clear()
    # Causal edges have a direction, so load them as directed.
    # NOTE(review): with directed=False ipycytoscape appears to treat A -> B
    # and B -> A as the same edge and keep only one — confirm against its docs.
    cyto_widget.graph.add_graph_from_json(graph_json, directed=True)
    cyto_widget.set_layout(name='cose')  # force-directed layout


In [29]:
# Minimal schemas for a temporal causal graph (Cytoscape-ready)

from enum import Enum
from typing import List, Optional, Dict, Any, Tuple, Literal
from pydantic import BaseModel, Field, constr, confloat
from datetime import date


# --- Enums ---

# Closed set of values allowed in a node's `type` field. Mixing in `str`
# makes every member a plain string as well as an enum member.
class NodeType(str, Enum):
    Event = "Event"
    Person = "Person"
    Organization = "Organization"
    Place = "Place"
    Concept = "Concept"


class RelationType(str, Enum):
    causes = "causes"
    leads_to = "leads_to"
    enables = "enables"
    triggers = "triggers"
    prevents = "prevents"
    mitigates = "mitigates"


# Sign of a causal edge; only the integers 1 and -1 are accepted.
Polarity = Literal[1, -1]  # +1 promotes/enables; -1 inhibits/prevents


# --- Evidence & Provenance (minimal) ---

# A supporting excerpt attached to a causal edge (see CausalEdge.evidence).
class Evidence(BaseModel):
    # Required; surrounding whitespace is stripped and the result must be non-empty.
    quote: constr(strip_whitespace=True, min_length=1)
    # Each instance gets its own empty list by default.
    citation_ids: List[str] = Field(default_factory=list)
    section: Optional[str] = None
    # Pair of ints; presumably (start, end) character offsets of the quote
    # in the source text — TODO confirm.
    char_spans: Optional[Tuple[int, int]] = None


# Optional identifiers recording where a node came from; every field may be
# omitted and defaults to None.
class Provenance(BaseModel):
    article_id: Optional[str] = None
    revision_id: Optional[str] = None
    run_id: Optional[str] = None


# --- Core graph models ---

# Data payload of one graph node. Despite the name, `type` may be any
# NodeType value, not only Event.
class EventNode(BaseModel):
    # Required, whitespace-stripped, non-empty identifier; edges refer to
    # nodes through their `source` / `target` fields.
    id: constr(strip_whitespace=True, min_length=1)
    # Required, whitespace-stripped, non-empty display text.
    label: constr(strip_whitespace=True, min_length=1)
    type: NodeType = NodeType.Event
    # Calendar dates (no time of day); None when unknown.
    time_start: Optional[date] = None
    time_end: Optional[date] = None
    wikidata_id: Optional[str] = None
    provenance: Optional[Provenance] = None
    # Free-form extra attributes; each instance gets its own empty dict.
    meta: Dict[str, Any] = Field(default_factory=dict)


# Data payload of one directed cause -> effect edge.
class CausalEdge(BaseModel):
    # id, source and target are required, whitespace-stripped, non-empty strings.
    id: constr(strip_whitespace=True, min_length=1)
    source: constr(strip_whitespace=True, min_length=1)  # id of the cause node
    target: constr(strip_whitespace=True, min_length=1)  # id of the effect node
    relation_type: RelationType
    polarity: Polarity = 1
    # Must lie in the closed interval [0.0, 1.0].
    confidence: confloat(ge=0.0, le=1.0) = 0.5
    # Presumably the delay between cause and effect in days — TODO confirm.
    lag_days: Optional[int] = None
    evidence: List[Evidence] = Field(default_factory=list)
    # The system prompt asks for True only when the source starts before the
    # target, or when the dates are unknown.
    temporal_valid: bool = True
    provisional: bool = False
    # Free-form extra attributes; each instance gets its own empty dict.
    meta: Dict[str, Any] = Field(default_factory=dict)


# --- Cytoscape export wrappers (minimal) ---

# Cytoscape element wrapper: nests the node payload under a `data` key.
class CyNode(BaseModel):
    data: EventNode


# Cytoscape element wrapper: nests the edge payload under a `data` key.
class CyEdge(BaseModel):
    data: CausalEdge


# Top-level container the LLM reply is parsed into: lists of wrapped nodes and
# edges, both empty by default.
# NOTE(review): documented with `#` comments rather than a docstring because
# pydantic appears to copy model docstrings into the generated JSON schema,
# which would change the format instructions sent to the LLM — confirm.
class CyElements(BaseModel):
    nodes: List[CyNode] = Field(default_factory=list)
    edges: List[CyEdge] = Field(default_factory=list)


In [30]:
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import init_chat_model
# Parser that turns the model's reply into a CyElements instance.
parser = PydanticOutputParser(pydantic_object=CyElements)
# Text describing the expected output; substituted into the
# {format_instructions} slot of the user message.
format_instructions = parser.get_format_instructions()

# System message: extraction rules for the LLM. The text is used as a prompt
# template, so it must not contain literal curly braces.
# The date rule asks for plain ISO dates because time_start/time_end are
# `date` fields in the schema, not datetimes.
SYSTEM = """You are extracting a causal, temporal graph from the provided document.
Goal: Return a Cytoscape-ready JSON object with nodes and edges.

Rules:
- Only include nodes that are concrete events/entities referenced in the document.
- Each node's type must be one of the allowed set: Event, Person, Organization, Place, Concept.
- Only include causal edges (cause → effect). Use relation_type from the allowed set (causes, leads_to, enables, triggers, prevents or mitigates).
- Set polarity: +1 for promotes/enables/causes; -1 for prevents/mitigates.
- Provide at least one evidence.quote per edge. Include citation_ids/section if visible.
- Use concise labels; include a year if present (e.g., "(1914)").
- Generate unique, stable ids. For nodes: 'evt_<slug>' etc. For edges: 'e_<src>_<dst>_<relation>'.
- If dates are known, populate time_start/time_end as ISO date strings (YYYY-MM-DD). If a date is not known, leave the field as null. If the day is not known, use 01, since 00 is not a valid day.
- temporal_valid should be true only if time_start(source) < time_start(target), or unknown.
- Return ONLY JSON that matches the schema.
"""

# User-message template. The {title}, {context}, {doc} and
# {format_instructions} placeholders are filled in when the prompt is formatted.
USER = """Document title: {title}
Optional context: {context}
Full text:
{doc}

Output format (must follow exactly):
{format_instructions}
"""

# Two-message chat prompt: the system rules followed by the per-article user message.
llm_prompt = ChatPromptTemplate.from_messages([
    ("system", SYSTEM),
    ("user", USER)
])


In [31]:
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeout
import asyncio, time, threading
# Example: OpenAI-compatible; replace with your provider
# from langchain_openai import ChatOpenAI
# llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# Or your initialized model:
llm = init_chat_model("gemini-2.5-flash", model_provider="google_genai", temperature=0)



# Single worker thread on which the blocking fetch + LLM call is run, so the
# notebook's event loop stays free while it is in progress.
executor = ThreadPoolExecutor(max_workers=1)
# True while run_pipeline is executing; read by the button handlers.
is_running = False
# Set by the Cancel button and on timeout to ask the running job to stop.
cancel_event = threading.Event()

def blocking_llm_to_cyjs(user_text: str, cancel_event: threading.Event) -> dict:
    """Fetch a Wikipedia article and have the LLM turn it into Cytoscape.js elements.

    Runs synchronously (article download, then one LLM call), so callers on an
    event loop should run it in a worker thread.

    Args:
        user_text: Wikipedia search query; only the first article returned by
            the loader is used.
        cancel_event: Checked before each slow step. When it is set the
            function aborts instead of starting that step; a step that is
            already in progress is not interrupted.

    Returns:
        ``{"elements": {"nodes": [...], "edges": [...]}}`` where every node and
        edge is the JSON-mode dump of the parsed model.

    Raises:
        RuntimeError: With the message ``"Cancelled by user"`` when
            ``cancel_event`` is set at one of the checkpoints.
        IndexError: If the search returns no article.
        Exception: Anything raised by the loader, the LLM call or the parser
            propagates unchanged.
    """

    def _abort_if_cancelled() -> None:
        if cancel_event.is_set():
            raise RuntimeError("Cancelled by user")

    _abort_if_cancelled()
    articles = WikipediaLoader(query=user_text, load_max_docs=1).load()
    if not articles:
        raise IndexError(f"No Wikipedia article found for {user_text!r}")
    article = articles[0]

    _abort_if_cancelled()
    messages = llm_prompt.format_messages(
        # Prefer the loaded article's own title; fall back to the query when
        # the loader did not record one.
        title=article.metadata.get("title") or user_text,
        context="Historical cause-effect extraction.",
        doc=article.page_content,
        format_instructions=format_instructions,
    )
    raw = llm.invoke(messages)

    # The reply is discarded if cancellation arrived while the LLM was working.
    _abort_if_cancelled()
    cy: CyElements = parser.parse(raw.content)

    return {
        "elements": {
            "nodes": [node.model_dump(mode="json") for node in cy.nodes],
            "edges": [edge.model_dump(mode="json") for edge in cy.edges],
        },
    }



In [32]:
async def run_pipeline(user_text: str, hard_timeout_sec: int = 300):
    """Run one article -> graph job and drive the UI widgets while it runs.

    Locks the inputs, shows the spinner and an elapsed-time counter, runs
    ``blocking_llm_to_cyjs`` on the worker thread, and loads the result into
    the Cytoscape widget. The wait ends as soon as the job finishes, the
    cancel flag is set, or ``hard_timeout_sec`` elapses, whichever is first.
    Inputs are always unlocked again on exit.

    Args:
        user_text: Article query passed through to ``blocking_llm_to_cyjs``.
        hard_timeout_sec: Maximum number of seconds to wait for the job.

    Returns:
        None. Failures are not raised; they are reported in the status widget
        as "Timed out", "Cancelled" or "Error: ...".
    """
    global is_running
    import html  # local import: only needed to escape error text for the status widget

    hb_task = None
    cancel_watch = None
    try:
        is_running = True
        cancel_event.clear()
        run_btn.disabled = True
        cancel_btn.disabled = False
        prompt.disabled = True
        status.value = '<span style="color:#555;">Starting…</span>'
        show_spinner(True, 'Running LLM (duration unknown)…')

        # Monotonic clock: elapsed time is unaffected by system clock changes.
        start = time.monotonic()

        async def heartbeat():
            # Refresh the elapsed-time label once per second.
            while is_running:
                elapsed.value = f'Elapsed: {int(time.monotonic() - start)}s'
                await asyncio.sleep(1)

        async def watch_cancel():
            # Finishes once the Cancel button has set the flag.
            while not cancel_event.is_set():
                await asyncio.sleep(0.1)

        hb_task = asyncio.create_task(heartbeat())
        cancel_watch = asyncio.create_task(watch_cancel())

        loop = asyncio.get_running_loop()
        worker = loop.run_in_executor(executor, blocking_llm_to_cyjs, user_text, cancel_event)

        # Race the job against the cancel flag and the hard timeout.
        done, _ = await asyncio.wait(
            {worker, cancel_watch},
            timeout=hard_timeout_sec,
            return_when=asyncio.FIRST_COMPLETED,
        )

        if worker in done:
            cyjs = worker.result()  # re-raises whatever the job raised
        elif cancel_watch in done:
            # Stop waiting. The worker thread cannot be interrupted mid-step;
            # it keeps the single executor slot until its current step returns.
            worker.cancel()
            raise RuntimeError("Cancelled by user")
        else:
            cancel_event.set()  # ask the job to stop at its next checkpoint
            worker.cancel()
            raise asyncio.TimeoutError()

        load_cyjs_into_cyto(cyto, cyjs)
        status.value = '<b style="color:green;">Done</b>'

    except asyncio.TimeoutError:
        # Handled separately so the generic branch below cannot overwrite it
        # with an empty "Error: " message.
        status.value = '<b style="color:#d9534f;">Timed out</b>'
    except Exception as e:
        if str(e) == "Cancelled by user":
            status.value = '<b style="color:#f0ad4e;">Cancelled</b>'
        else:
            # Escape so '<', '>' and '&' in the message show up as text.
            status.value = f'<b style="color:#d9534f;">Error: {html.escape(str(e))}</b>'
    finally:
        for task in (hb_task, cancel_watch):
            if task is not None:
                task.cancel()
        is_running = False
        show_spinner(False)
        run_btn.disabled = False
        cancel_btn.disabled = True
        prompt.disabled = False


In [33]:
# Strong references to in-flight pipeline tasks. The event loop keeps only
# weak references to tasks, so a task nobody else references may be
# garbage-collected before it finishes.
_pipeline_tasks = set()


def on_click_run(b):
    """Run-button handler: start the pipeline for the text in the prompt box.

    Does nothing while a run is active or already scheduled, and reports a
    message in the status widget instead of starting a run when the prompt is
    empty or only whitespace.

    Args:
        b: The clicked button (unused).
    """
    # `_pipeline_tasks` covers the short window between scheduling the task
    # and the pipeline setting `is_running`, so a double click starts one run.
    if is_running or _pipeline_tasks:
        return

    article = prompt.value.strip()
    if not article:
        status.value = '<b style="color:#d9534f;">Enter an article name first</b>'
        return

    task = asyncio.ensure_future(run_pipeline(article))
    _pipeline_tasks.add(task)
    task.add_done_callback(_pipeline_tasks.discard)

def on_click_cancel(b):
    """Cancel-button handler: set the shared cancel flag, but only while a run is active."""
    if not is_running:
        return
    cancel_event.set()

# Wire the two buttons to their click handlers.
run_btn.on_click(on_click_run)
cancel_btn.on_click(on_click_cancel)
