In [1]:
from argparse import Namespace
from dataclasses import dataclass
from pathlib import Path

import dspy
from bs4 import BeautifulSoup
from IPython.display import HTML

In [2]:
# Notebook configuration, gathered in one Namespace so it reads like parsed
# command-line arguments. Edit these values to point at other data or models.
args = Namespace(
    # Directory holding the HTML pages to process.
    html_dir=Path("..", "data", "fna", "Asteraceae"),
    # Model identifier and endpoint handed to dspy.LM.
    model="ollama_chat/qwq",
    api_base="http://localhost:11434",
    api_key="",
    # Number of leading pages to run through the extractor.
    limit=3,
)

In [3]:
@dataclass
class Traits:
    # Record of the plant traits requested from the language model; it is the
    # declared type of the `traits` output field on ExtractInfo.
    #
    # Every field is a plain string that defaults to "" (empty). The fields
    # are declared in this order, which is also the order used by the
    # dataclass-generated __init__ and repr.

    # Whole plant.
    plant_size: str = ""
    # Leaves.
    leaf_shape: str = ""
    leaf_length: str = ""
    leaf_width: str = ""
    leaf_thickness: str = ""
    # Fruits.
    fruit_type: str = ""
    fruit_length: str = ""
    fruit_width: str = ""
    # Seeds.
    seed_length: str = ""
    seed_width: str = ""
    # Leaf persistence.
    deciduousness: str = ""

In [4]:
class ExtractInfo(dspy.Signature):
    """Analyze species descriptions and extract trait information."""
    # NOTE: in a class-based dspy signature the docstring above is used as the
    # task instructions for the model, so rewording it changes what the model
    # is asked to do. Treat it as runtime text, not as ordinary documentation.

    # Inputs: the description to analyze and the question listing the traits.
    text: str = dspy.InputField()
    prompt: str = dspy.InputField()
    # Output: the extracted values, typed as the Traits dataclass.
    traits: Traits = dspy.OutputField(desc="Extracted traits")

In [5]:
# Every "*.html" file directly inside the configured directory, in sorted path
# order so repeated runs visit the same files in the same sequence.
_html_files = args.html_dir.glob("*.html")
PAGES = sorted(_html_files)

In [6]:
# Build the language-model client from the notebook configuration, then make
# it the default model for every dspy module created afterwards.
LM = dspy.LM(
    args.model,
    api_base=args.api_base,
    api_key=args.api_key,
)
dspy.configure(lm=LM)

In [7]:
# Predictor built from the ExtractInfo signature; calling it with `text=` and
# `prompt=` returns a reply whose `traits` attribute holds the extracted values.
MODULE = dspy.Predict(ExtractInfo)

In [8]:
# Question sent to the model as the `prompt` input field. extract_pages reads
# this module-level name directly, so this cell must run before that function
# is called. The newlines and leading spaces inside the triple-quoted literal
# are part of the string that is sent.
prompt = """
    What is the plant size,
    leaf shape, leaf length, leaf width, leaf thickness,
    fruit type, fruit length, fruit width,
    seed length, seed width,
    deciduousness?
    """

In [10]:
def extract_pages(pages, module, limit):
    """Run trait extraction on the first ``limit`` pages and show the results.

    For each page this prints a banner with the species name (the file stem
    with underscores replaced by spaces), renders the page's
    ``<span class="statement">`` element, sends that element's text content to
    ``module`` together with the module-level ``prompt`` string, and displays
    the ``traits`` attribute of the reply.

    Pages that contain no ``<span class="statement">`` element are reported
    and skipped instead of being sent to the model.

    Args:
        pages: Sequence of ``pathlib.Path`` objects pointing at HTML files.
        module: Callable accepting ``text=`` and ``prompt=`` keyword arguments
            and returning an object with a ``traits`` attribute.
        limit: Number of leading pages to process. It is used as a slice
            bound, so ``None`` processes every page.

    Returns:
        None. Everything is emitted through ``print`` and ``display``.
    """
    for page in pages[:limit]:
        print()
        print("=" * 80)
        print(page.stem.replace("_", " "))

        # Hand BeautifulSoup the raw bytes so it detects the document encoding
        # itself rather than relying on the platform's default text encoding.
        soup = BeautifulSoup(page.read_bytes(), features="lxml")
        treatment = soup.find("span", class_="statement")

        # find() returns None when nothing matches; without this guard the
        # notebook would render the word "None" and query the model with it.
        if treatment is None:
            print()
            print("No statement found; skipping.")
            continue

        print()
        display(HTML(str(treatment)))
        print()

        # The signature declares `text` as str, so pass the element's text
        # content rather than the bs4 Tag object (markup is only for display).
        description = treatment.get_text().strip()
        reply = module(text=description, prompt=prompt)
        display(reply.traits)
        print()


# Run the extractor over the first `args.limit` pages; results are shown as
# this cell's output.
extract_pages(PAGES, MODULE, args.limit)


Acanthospermum australe






Traits(plant_size='10–60(–120+) cm', leaf_shape='deltate to ± rhombic or ovate', leaf_length='13–37 mm', leaf_width='7–32 mm', leaf_thickness='', seed_length='', seed_width='', fruit_type='plumply ellipsoid to fusiform, weakly compressed, 5–7-ribbed, lacking terminal spines, prickles ± uncinate, mostly along ribs', fruit_length='7–9+ mm', fruit_width='', deciduousness='')



Acanthospermum hispidum






Traits(plant_size='10–60+ cm', leaf_shape='rhombic-ovate to obovate', leaf_length='20–150(–150+) mm', leaf_width='', leaf_thickness='', seed_length='', seed_width='', fruit_type='compressed, cuneate to obovate fruits', fruit_length='4–6+ mm', fruit_width='', deciduousness='')



Acanthospermum humile






Traits(plant_size='10–20(–30+) cm', leaf_shape='oval to lyrate', leaf_length='10–30(–45) mm', leaf_width='', leaf_thickness='', seed_length='', seed_width='', fruit_type='strongly compressed, ± cuneate', fruit_length='2–3(–4) mm', fruit_width='', deciduousness='')


