In [1]:
from argparse import Namespace
from dataclasses import dataclass
from pathlib import Path

import dspy
from bs4 import BeautifulSoup
from IPython.display import HTML

In [2]:
# Notebook-wide settings, gathered in one argparse-style namespace so the
# later cells read them as `args.<name>`.
args = Namespace(
    # Directory that is globbed for "*.html" pages.
    html_dir=Path("..") / "data" / "fna" / "Asteraceae",
    # Model identifier, endpoint and key handed to dspy.LM.
    model="ollama_chat/qwq",
    api_base="http://localhost:11434",
    api_key="",
    # Upper bound on how many pages extract_pages() processes.
    limit=10,
)

In [3]:
# Container for the traits extracted from one species description.
# It is the declared type of ExtractInfo.traits, so the field names below are
# what the model is asked to fill in.
# Every field is a free-text string that defaults to "" (presumably meaning
# "not stated in the description" -- confirm against the extraction output).
# NOTE(review): documented with comments rather than a class docstring on
# purpose; a docstring may leak into the schema dspy builds from this class.
@dataclass
class Traits:
    # Whole-plant measurement.
    plant_size: str = ""
    # Leaf traits.
    leaf_shape: str = ""
    leaf_length: str = ""
    leaf_width: str = ""
    leaf_thickness: str = ""
    # Fruit traits.
    fruit_type: str = ""
    fruit_length: str = ""
    fruit_width: str = ""
    # Seed traits.
    seed_length: str = ""
    seed_width: str = ""
    # Whether/how the plant is deciduous.
    deciduousness: str = ""

In [4]:
# dspy signature for the extraction task: two string inputs (the description
# text and the question listing the wanted traits) and one structured output.
# NOTE(review): dspy is understood to use the class docstring as the task
# instruction sent to the model, so editing it changes the prompt -- confirm
# against the dspy Signature documentation before rewording it.
class ExtractInfo(dspy.Signature):
    """Analyze species descriptions and extract trait information."""

    # The species description to analyze.
    text: str = dspy.InputField()
    # The question enumerating which traits to extract.
    prompt: str = dspy.InputField()
    # Structured result; fields are defined by the Traits dataclass.
    traits: Traits = dspy.OutputField(desc="Extracted traits")

In [5]:
# Every "*.html" file directly inside args.html_dir, in sorted path order so
# that the first `limit` pages are the same on every run.
PAGES = list(args.html_dir.glob("*.html"))
PAGES.sort()

In [6]:
# Build the language-model client from the notebook settings and register it
# as the model dspy uses by default.
LM = dspy.LM(
    args.model,
    api_base=args.api_base,
    api_key=args.api_key,
)
dspy.configure(lm=LM)

In [7]:
# Predictor built from the ExtractInfo signature; extract_pages() calls it
# once per page with `text=` and `prompt=` and reads `.traits` off the reply.
MODULE = dspy.Predict(ExtractInfo)

In [8]:
# Question passed as the `prompt` input on every call in extract_pages().
# It lists the same traits, in the same order, as the fields of Traits.
# The literal (including its leading newline and indentation) is sent as-is.
prompt = """
    What is the plant size,
    leaf shape, leaf length, leaf width, leaf thickness,
    fruit type, fruit length, fruit width,
    seed length, seed width,
    deciduousness?
    """

In [9]:
def extract_pages(pages, module, limit):
    """Show each page's description and the traits the model extracts from it.

    For each of the first ``limit`` pages this prints a separator and the
    species name (the file stem with underscores turned into spaces), renders
    the page's ``<span class="statement">`` element, calls ``module`` on that
    element's HTML, and displays the resulting ``traits``.

    Pages without a ``<span class="statement">`` element are reported and
    skipped instead of being sent to the model.

    Args:
        pages: Sequence of ``pathlib.Path`` objects pointing at HTML files.
        module: Callable accepting ``text=`` and ``prompt=`` keyword arguments
            and returning an object with a ``traits`` attribute.
        limit: Maximum number of pages to process (used as a slice bound).

    Returns:
        None. All results are printed or displayed.

    Note:
        The question sent to the model is read from the notebook-level
        ``prompt`` variable.
    """
    for page in pages[:limit]:
        print()
        print("=" * 80)
        print(page.stem.replace("_", " "))

        # Pass raw bytes so BeautifulSoup detects the document's own encoding
        # rather than decoding with the platform's default locale encoding.
        soup = BeautifulSoup(page.read_bytes(), features="lxml")
        treatment = soup.find("span", class_="statement")

        # find() returns None when the element is missing; do not render
        # "None" or spend a model call on it.
        if treatment is None:
            print()
            print('No <span class="statement"> element found; skipping.')
            continue

        # The signature declares `text: str`, so serialize the tag explicitly
        # instead of handing the model wrapper a bs4 object.
        statement_html = str(treatment)

        print()
        display(HTML(statement_html))
        print()

        reply = module(text=statement_html, prompt=prompt)
        display(reply.traits)
        print()


# Run the extraction over the first args.limit pages; output appears below.
extract_pages(PAGES, MODULE, args.limit)


Acanthospermum australe






Traits(plant_size='10–60(–120+) cm', leaf_shape='deltate to ± rhombic or ovate', leaf_length='13–37 mm', leaf_width='7–32 mm', leaf_thickness='', fruit_type='plumply ellipsoid to fusiform', fruit_length='7–9+ mm', fruit_width='', seed_length='', seed_width='', deciduousness='')



Acanthospermum hispidum






Traits(plant_size='10–60+ cm', leaf_shape='rhombic-ovate to obovate', leaf_length='20–150(–150+) mm', leaf_width='', leaf_thickness='', fruit_type='compressed, cuneate to obovate fruits', fruit_length='4–6+ mm', fruit_width='', seed_length='', seed_width='', deciduousness='')



Acanthospermum humile






Traits(plant_size='10–20(–30+) cm', leaf_shape='oval to lyrate', leaf_length='10–30(–45) mm', leaf_width='', leaf_thickness='', fruit_type='strongly compressed, ± cuneate', fruit_length='2–3(–4) mm', fruit_width='', seed_length='', seed_width='', deciduousness='')



Achillea millefolium






Traits(plant_size='6–65+ cm', leaf_shape='oblong or lanceolate, 1–2-pinnately lobed', leaf_length='3.5–35+ cm', leaf_width='5–35 mm', leaf_thickness='', fruit_type='cypselae', fruit_length='1–2 mm', fruit_width='', seed_length='', seed_width='', deciduousness='')



Acmella pusilla






Traits(plant_size='', leaf_shape='lanceolate', leaf_length='12–40 mm', leaf_width='3–10 mm', leaf_thickness='', fruit_type='cypsela', fruit_length='1.2–1.6 mm', fruit_width='', seed_length='', seed_width='', deciduousness='')



Acmella repens






Traits(plant_size='', leaf_shape='ovate to lance-ovate', leaf_length='20–40(–100) mm', leaf_width='10–35 mm', leaf_thickness='', fruit_type='Cypselae', fruit_length='1–2.5 mm', fruit_width='', seed_length='', seed_width='', deciduousness='')



Ageratina altissima






Traits(plant_size='(30–)50–80(–120)', leaf_shape='deltate-ovate to ovate or broadly lanceolate', leaf_length='4–11(–13) cm', leaf_width='2.5–8(–9) cm', leaf_thickness='', fruit_type='cypsela', fruit_length='', fruit_width='', seed_length='', seed_width='', deciduousness='')



Ageratina aromatica






Traits(plant_size='30–80(–100) cm', leaf_shape='narrowly to broadly deltate to nearly ovate or lanceolate', leaf_length='2–7(–9) cm', leaf_width='1.5–4 cm', leaf_thickness='subcoriaceous', fruit_type='cypsela', fruit_length='', fruit_width='', seed_length='', seed_width='', deciduousness='')



Ageratina jucunda






Traits(plant_size='40–80(–100) cm', leaf_shape='narrowly deltate to rhombic', leaf_length='2–6(–7) cm', leaf_width='1.5–4 cm', leaf_thickness='subcoriaceous', fruit_type='cypsela', fruit_length='', fruit_width='', seed_length='', seed_width='', deciduousness='')



Ageratum conyzoides






Traits(plant_size='20–150 cm', leaf_shape='ovate to elliptic-oblong', leaf_length='2–8 × 1–5 cm', leaf_width='', leaf_thickness='', fruit_type='cypselae', fruit_length='', fruit_width='', seed_length='', seed_width='', deciduousness='')


