[My last post](https://mlops.systems/posts/2024-06-25-evaluation-finetuning-manual-dataset.html) outlined the kinds of evaluation I need and want in order to understand how
well my finetuned LLM is performing at the task of structured data extraction
from press releases. Let's start with the core metric I'm interested in, accuracy,
and then dive into some of the other evaluation metrics as well.

## Loading the datasets

In [1]:
from datasets import load_dataset
import pandas as pd
from rich import print

# Load only the "test" split of the press-release dataset.
test_dataset = load_dataset("strickvl/isafpressreleases", split="test")
# Keep a pandas copy as well; its rows are iterated further down to build the
# Pydantic event objects.
test_df = pd.DataFrame(test_dataset)

In [2]:
# Display the dataset summary (column names and row count).
test_dataset

Dataset({
    features: ['name', 'eventrefnumber', 'text', 'StartDate', 'eventtype', 'province', 'citydistrict', 'village', 'targetgroup', 'commander', 'position', 'minkilled', 'mincaptured', 'capturedcharacterisation', 'killedcharacterisation', 'killq', 'captureq', 'killcaptureraid', 'airstrike', 'noshotsfired', 'dataprocessed', 'flagged', 'glossarymeta', 'minleaderskilled', 'minfacilitatorskilled', 'minleaderscaptured', 'minfacilitatorscaptured', 'leaderq'],
    num_rows: 724
})

We'll first add an extra column to our `DataFrame` and then make a prediction for
each and every row in the dataset. We'll store a copy of the prediction to the
column so as to make sure we don't have to do this compute-intensive step
repeatedly.

But first we'll assemble the data as Pydantic objects so as to handle validation
and other quality of life features.

In [3]:
from datetime import date
from enum import Enum
from typing import Annotated, Dict, Optional, Set

from pydantic import (
    AfterValidator,
    BaseModel,
    ConfigDict,
    Field,
    ValidationInfo,
    validator,
)


class EventType(str, Enum):
    """Closed set of event-type labels.

    Members are also ``str`` instances, and each member's value is identical
    to its name.
    """

    # The same six labels are listed in `validate_event_type` and in the
    # system prompt sent to the GPT models; keep them in sync.
    airstrike = "airstrike"
    detention = "detention"
    captureandkill = "captureandkill"
    insurgentskilled = "insurgentskilled"
    exchangeoffire = "exchangeoffire"
    civiliancasualty = "civiliancasualty"


class Province(str, Enum):
    """Closed set of 34 province labels.

    Members are also ``str`` instances, and each member's value is identical
    to its name (lower case, underscores instead of spaces or hyphens).
    """

    # The same labels are listed in `validate_province` and in the system
    # prompt sent to the GPT models; keep them in sync.
    badakhshan = "badakhshan"
    badghis = "badghis"
    baghlan = "baghlan"
    balkh = "balkh"
    bamyan = "bamyan"
    day_kundi = "day_kundi"
    farah = "farah"
    faryab = "faryab"
    ghazni = "ghazni"
    ghor = "ghor"
    helmand = "helmand"
    herat = "herat"
    jowzjan = "jowzjan"
    kabul = "kabul"
    kandahar = "kandahar"
    kapisa = "kapisa"
    khost = "khost"
    kunar = "kunar"
    kunduz = "kunduz"
    laghman = "laghman"
    logar = "logar"
    nangarhar = "nangarhar"
    nimroz = "nimroz"
    nuristan = "nuristan"
    paktya = "paktya"
    paktika = "paktika"
    panjshir = "panjshir"
    parwan = "parwan"
    samangan = "samangan"
    sar_e_pul = "sar_e_pul"
    takhar = "takhar"
    uruzgan = "uruzgan"
    wardak = "wardak"
    zabul = "zabul"


class TargetGroup(str, Enum):
    """Closed set of target-group labels.

    Members are also ``str`` instances, and each member's value is identical
    to its name. Group names are kept in their abbreviated form.
    """

    # The same labels are listed in `validate_target_group` and in the system
    # prompt sent to the GPT models; keep them in sync.
    taliban = "taliban"
    haqqani = "haqqani"
    criminals = "criminals"
    aq = "aq"
    hig = "hig"
    let = "let"
    imu = "imu"
    judq = "judq"
    iju = "iju"
    hik = "hik"
    ttp = "ttp"
    # Explicit catch-all member.
    other = "other"


def validate_event_type(value: str):
    """Normalise an event-type label.

    The label is lower-cased (no whitespace stripping). It is returned when it
    is one of the recognised event types; any other input yields ``"other"``.
    """
    recognised = {
        "airstrike",
        "detention",
        "captureandkill",
        "insurgentskilled",
        "exchangeoffire",
        "civiliancasualty",
    }
    lowered = value.lower()
    return lowered if lowered in recognised else "other"


def validate_province(value: str):
    """Normalise a province label.

    The label is lower-cased (no whitespace stripping). It is returned when it
    is one of the recognised provinces; any other input yields ``"other"``.
    """
    recognised = frozenset(
        (
            "badakhshan",
            "badghis",
            "baghlan",
            "balkh",
            "bamyan",
            "day_kundi",
            "farah",
            "faryab",
            "ghazni",
            "ghor",
            "helmand",
            "herat",
            "jowzjan",
            "kabul",
            "kandahar",
            "kapisa",
            "khost",
            "kunar",
            "kunduz",
            "laghman",
            "logar",
            "nangarhar",
            "nimroz",
            "nuristan",
            "paktya",
            "paktika",
            "panjshir",
            "parwan",
            "samangan",
            "sar_e_pul",
            "takhar",
            "uruzgan",
            "wardak",
            "zabul",
        )
    )
    candidate = value.lower()
    if candidate in recognised:
        return candidate
    return "other"


def validate_target_group(value: str):
    """Normalise a target-group label.

    The label is lower-cased (no whitespace stripping). It is returned when it
    is one of the recognised groups, which include ``"other"`` itself; any
    other input yields ``"other"``.
    """
    recognised = (
        "taliban",
        "haqqani",
        "criminals",
        "aq",
        "hig",
        "let",
        "imu",
        "judq",
        "iju",
        "hik",
        "ttp",
        "other",
    )
    folded = value.lower()
    if folded in recognised:
        return folded
    return "other"


class IsafEvent(BaseModel):
    """Structured record of the event described by one ISAF press release.

    Holds the annotated fields for a press release and, in ``predictions``,
    the raw output string of every model that has been run on it (keyed by
    model name).

    Each element of ``event_type``, ``province`` and ``target_group`` is
    passed through the matching ``validate_*`` function, so labels are
    lower-cased and anything outside the recognised vocabulary (including an
    empty string) is stored as ``"other"``.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    name: str = Field(
        description="A title or name for the event which summarises the event as a headline"
    )
    # Required, but may explicitly be None.
    text: Optional[str] = Field(description="The full text of the press release")
    start_date: date = Field(
        description="The start date of the event in YYYY-MM-DD format"
    )
    # Per-element validators must be attached with `AfterValidator` inside
    # `Annotated`: pydantic v2 does not treat a `validator=` keyword on
    # `Field` as a validation hook, so functions passed that way never run.
    event_type: Set[Annotated[str, AfterValidator(validate_event_type)]] = Field(
        description="The event type. Can be multiple types."
    )
    province: Set[Annotated[str, AfterValidator(validate_province)]] = Field(
        description="The province in which the event occurred. Can be multiple provinces."
    )
    target_group: Set[Annotated[str, AfterValidator(validate_target_group)]] = Field(
        description="The group that was targetted during the event. Can be multiple groups."
    )
    min_killed: int = Field(
        description="The minimum number of people killed during the event"
    )
    min_captured: int = Field(
        description="The minimum number of people captured during the event"
    )
    killq: bool = Field(
        description="Whether someone was killed or not during the event"
    )
    captureq: bool = Field(
        description="Whether someone was captured or not during the event"
    )
    killcaptureraid: bool = Field(
        description="Whether the event was a so-called 'kill-capture raid'."
    )
    airstrike: bool = Field(
        description="Whether an airstrike was used during the event"
    )
    noshotsfired: bool = Field(
        description="Whether no shots were fired during the event"
    )
    min_leaders_killed: int = Field(
        description="The minimum number of leaders killed during the event"
    )
    min_leaders_captured: int = Field(
        description="The minimum number of leaders captured during the event"
    )
    # `default_factory` makes it explicit that every event gets its own dict.
    predictions: Dict[str, str] = Field(
        default_factory=dict,
        description="The predictions from the model. Keys are the model name and the value is the prediction",
    )

Here's what a couple of examples of our test data look like as Pydantic
models when we pass them in:

In [4]:
from typing import List

def _label_set(raw: str) -> Set[str]:
    """Split a comma-separated cell into a set of stripped, lower-cased labels."""
    return {label.strip().lower() for label in raw.split(",")}


# One IsafEvent per row of the test split, in row order.
events: List[IsafEvent] = []

for _, row in test_df.iterrows():
    event = IsafEvent(
        name=row["name"],
        text=row["text"],
        # Reduce the pandas timestamp to a plain `datetime.date`.
        start_date=row["StartDate"].to_pydatetime().date(),
        event_type=_label_set(row["eventtype"]),
        province=_label_set(row["province"]),
        target_group=_label_set(row["targetgroup"]),
        min_killed=int(row["minkilled"]),
        min_captured=int(row["mincaptured"]),
        # The flag columns are compared against the string "true".
        killq=row["killq"] == "true",
        captureq=row["captureq"] == "true",
        killcaptureraid=row["killcaptureraid"] == "true",
        airstrike=row["airstrike"] == "true",
        noshotsfired=row["noshotsfired"] == "true",
        min_leaders_killed=int(row["minleaderskilled"]),
        min_leaders_captured=int(row["minleaderscaptured"]),
    )
    events.append(event)

print(events[:2])

So when we're making the prediction we're hoping to get a JSON string like this
out from the model:

In [5]:
# Serialise the first event to JSON, leaving out the raw press-release text
# and the (still empty) predictions, to show the shape we want from a model.
json_str = events[0].model_dump_json(exclude={"text", "predictions"})
print(json_str)

I'm starting with full evaluations using the GPT models and I'll need a
slightly more elaborate prompt in order to get decent results. I can't pass in
the exact same prompt as the one I used for the finetuned model since the GPT
models haven't been trained or finetuned to respond to those specific prompts.
This is sort of an interesting problem to have: how much effort do we put into
the GPT prompts to try to get the same level of accuracy as the finetuned model?
Or in other words, is there even a way to really compare like to like between
models that must accept different prompts?

Let's try this out for OpenAI GPT-4o and GPT-4 Turbo and see how we get on.
You'll note how long the prompt has to be to give the GPT models a fighting
chance against the finetuned models. Ideally I'd stuff even more examples
into the context, but I also don't want to explode the number of tokens I'm
using.

In [30]:
from openai import OpenAI
from rich import print
import json
import os


def query_openai(article_text: str, model: str) -> str:
    """Ask an OpenAI chat model to extract event data from one press release.

    Builds a user prompt from the press-release text, the extraction request,
    the annotation notes and a single worked example, and sends it together
    with a system prompt listing the allowed label vocabularies. The request
    uses JSON mode (``response_format={"type": "json_object"}``).

    Args:
        article_text: Full text of the press release.
        model: Name of the OpenAI chat model to query, e.g. ``"gpt-4o"``.

    Returns:
        The content of the first choice, expected to be a JSON string. An
        empty string is returned if the API supplies no content, so the
        result is always a ``str``.
    """
    query = (
        f"The following is a press release issued by ISAF (formerly operating in Afghanistan):\n{article_text}\n\n"
        "## Extraction request\n"
        "Please extract the following information from the press release:\n"
        "- The name of the event (summarising the event / text as a headline)\n"
        "- The start date of the event\n"
        "- The event type(s)\n"
        "- The province(s) in which the event occurred\n"
        "- The target group(s) of the event\n"
        "- The minimum number of people killed during the event\n"
        "- The minimum number of people captured during the event\n"
        "- Whether someone was killed or not during the event\n"
        "- Whether someone was captured or not during the event\n"
        "- Whether the event was a so-called 'kill-capture raid'\n"
        "- Whether an airstrike was used during the event\n"
        "- Whether no shots were fired during the event\n"
        "- The minimum number of leaders killed during the event\n"
        "- The minimum number of leaders captured during the event\n\n"
        "## Annotation notes:\n"
        "- A 'facilitator' is not a leader.\n"
        "- If a press release states that 'insurgents' were detained without further "
        "details, assign a minimum number of two detained. Interpret 'a couple' as "
        "two. Interpret 'several' as at least three, even though it may sometimes "
        "refer to seven or eight. Classify the terms 'a few', 'some', 'a group', 'a "
        "small group', and 'multiple' as denoting at least three, even if they "
        "sometimes refer to larger numbers. Choose the smaller number if no other "
        "information is available in the press release to come up with a minimally "
        "acceptable figure. Interpret 'numerous' and 'a handful' as at least four, "
        "and 'a large number' as at least five.\n\n"
        "## Example:\n"
        "Article text: 'ISAF Joint Command Evening Operational Update Feb. 19, 2011\nISAF Joint Command - "
        "Afghanistan\u20282011-02-S-143\u2028For Immediate Release \u2028\u2028KABUL, Afghanistan (Feb. 19)\u2028\u2028ISAF "
        "service members at a compound in Sangin district, Helmand province observed numerous insurgents north and south of "
        "their position talking on radios today. After gaining positive identification of the insurgent positions, the "
        "coalition troops engaged, killing several insurgents. Later, the ISAF troops observed more insurgents positioning "
        "in the area with weapons. After positive identification, coalition forces continued firing on the various insurgent "
        "positions, resulting in several more insurgents being killed.'\n\n"
        'Output: `{"name":"Several insurgents killed in '
        'Helmand","start_date":"2011-02-18","event_type":["insurgentskilled"],"province":["helmand"],"target_group":[""],"mi'
        'n_killed":6,"min_captured":0,"killq":true,"captureq":false,"killcaptureraid":false,"airstrike":false,"noshotsfired"'
        ':false,"min_leaders_killed":0,"min_leaders_captured":0}`'
    )

    # The system prompt carries the label vocabularies the model may use.
    system_prompt = (
        "You are an expert at identifying events in a press release. You are precise "
        "and always make sure you are correct, drawing inference from the text of the "
        "press release.\n\n You always return a JSON string with the following schema: "
        "## JSON Schema details\n"
        "Here is some of the schema for the JSON output string you "
        "should make use of: event_types = ['airstrike', 'detention', "
        "'captureandkill', 'insurgentskilled', 'exchangeoffire', 'civiliancasualty'], "
        "provinces = ['badakhshan', 'badghis', 'baghlan', 'balkh', 'bamyan', "
        "'day_kundi', 'farah', 'faryab', 'ghazni', 'ghor', 'helmand', 'herat', "
        "'jowzjan', 'kabul', 'kandahar', 'kapisa', 'khost', 'kunar', 'kunduz', "
        "'laghman', 'logar', 'nangarhar', 'nimroz', 'nuristan', 'paktya', 'paktika', "
        "'panjshir', 'parwan', 'samangan', 'sar_e_pul', 'takhar', 'uruzgan', "
        "'wardak', 'zabul'], target_groups = ['taliban', 'haqqani', 'criminals', "
        "'aq', 'hig', 'let', 'imu', 'judq', 'iju', 'hik', 'ttp', 'other']\n\n"
    )

    # set up the prediction harness
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    response = client.chat.completions.create(
        model=model,
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": query},
        ],
        temperature=1,
    )

    # `message.content` is optional in the API response; normalise a missing
    # value to "" so callers always receive a string.
    return response.choices[0].message.content or ""

We can make sure this function works with a quick example:

In [7]:
# Smoke test: send the first press release to GPT-4o (a live API call) and
# check that the reply parses as JSON.
json_str = query_openai(events[0].text, "gpt-4o")
print(json.loads(json_str))

Our model is working (as expected) and we're also getting a JSON string back.
Let's assemble something that will iterate through all of our test data, get
predictions, and then store those predictions on our Pydantic object.

For the bulk predictions, we'll make sure to do this async, since there are lots of events and we don't
want to be waiting all day. You'll see I also had to add some retries to the function
to account for rate limiting on the GPT-3.5-turbo model.

In [9]:
# | code-fold: true
# make async work within a notebook
# (the notebook already has an event loop running; nest_asyncio is applied so
# that asyncio code can be driven from inside it)
import nest_asyncio

nest_asyncio.apply()

In [None]:
import aiohttp
import asyncio
from typing import List
from openai import OpenAI


async def async_query_openai(
    session,
    article_text: str,
    model: str,
    max_retries: int = 3,
    retry_delay: float = 1.0,
) -> str:
    """Ask an OpenAI chat model to extract event data, without blocking.

    Posts the same prompt as the synchronous helper straight to the chat
    completions HTTP endpoint through ``session``. Rate-limit errors are
    retried after a pause; every other failure is printed and reported to the
    caller as an empty string.

    Args:
        session: An ``aiohttp.ClientSession``-like object; only
            ``session.post(url, headers=..., json=...)`` used as an async
            context manager is required.
        article_text: Full text of the press release.
        model: Name of the OpenAI chat model to query.
        max_retries: Maximum number of requests to send. With ``0`` no
            request is made and ``""`` is returned.
        retry_delay: Minimum pause in seconds after a rate-limit error. The
            pause never drops below 35 seconds, the value that worked for
            these models, so the default behaves as a fixed 35 second wait.

    Returns:
        The content of the first choice (expected to be a JSON string), or
        ``""`` when the request failed or the retries were exhausted.

    Raises:
        RuntimeError: If the ``OPENAI_API_KEY`` environment variable is unset.
    """
    query = (
        f"The following is a press release issued by ISAF (formerly operating in Afghanistan):\n{article_text}\n\n"
        "## Extraction request\n"
        "Please extract the following information from the press release:\n"
        "- The name of the event (summarising the event / text as a headline)\n"
        "- The start date of the event\n"
        "- The event type(s)\n"
        "- The province(s) in which the event occurred\n"
        "- The target group(s) of the event\n"
        "- The minimum number of people killed during the event\n"
        "- The minimum number of people captured during the event\n"
        "- Whether someone was killed or not during the event\n"
        "- Whether someone was captured or not during the event\n"
        "- Whether the event was a so-called 'kill-capture raid'\n"
        "- Whether an airstrike was used during the event\n"
        "- Whether no shots were fired during the event\n"
        "- The minimum number of leaders killed during the event\n"
        "- The minimum number of leaders captured during the event\n\n"
        "## Annotation notes:\n"
        "- A 'facilitator' is not a leader.\n"
        "- If a press release states that 'insurgents' were detained without further "
        "details, assign a minimum number of two detained. Interpret 'a couple' as "
        "two. Interpret 'several' as at least three, even though it may sometimes "
        "refer to seven or eight. Classify the terms 'a few', 'some', 'a group', 'a "
        "small group', and 'multiple' as denoting at least three, even if they "
        "sometimes refer to larger numbers. Choose the smaller number if no other "
        "information is available in the press release to come up with a minimally "
        "acceptable figure. Interpret 'numerous' and 'a handful' as at least four, "
        "and 'a large number' as at least five.\n\n"
        "## Example:\n"
        "Article text: 'ISAF Joint Command Evening Operational Update Feb. 19, 2011\nISAF Joint Command - "
        "Afghanistan\u20282011-02-S-143\u2028For Immediate Release \u2028\u2028KABUL, Afghanistan (Feb. 19)\u2028\u2028ISAF "
        "service members at a compound in Sangin district, Helmand province observed numerous insurgents north and south of "
        "their position talking on radios today. After gaining positive identification of the insurgent positions, the "
        "coalition troops engaged, killing several insurgents. Later, the ISAF troops observed more insurgents positioning "
        "in the area with weapons. After positive identification, coalition forces continued firing on the various insurgent "
        "positions, resulting in several more insurgents being killed.'\n\n"
        'Output: `{"name":"Several insurgents killed in '
        'Helmand","start_date":"2011-02-18","event_type":["insurgentskilled"],"province":["helmand"],"target_group":[""],"mi'
        'n_killed":6,"min_captured":0,"killq":true,"captureq":false,"killcaptureraid":false,"airstrike":false,"noshotsfired"'
        ':false,"min_leaders_killed":0,"min_leaders_captured":0}`'
    )

    # The system prompt carries the label vocabularies the model may use.
    system_prompt = (
        "You are an expert at identifying events in a press release. You are precise "
        "and always make sure you are correct, drawing inference from the text of the "
        "press release.\n\n You always return a JSON string with the following schema: "
        "## JSON Schema details\n"
        "Here is some of the schema for the JSON output string you "
        "should make use of: event_types = ['airstrike', 'detention', "
        "'captureandkill', 'insurgentskilled', 'exchangeoffire', 'civiliancasualty'], "
        "provinces = ['badakhshan', 'badghis', 'baghlan', 'balkh', 'bamyan', "
        "'day_kundi', 'farah', 'faryab', 'ghazni', 'ghor', 'helmand', 'herat', "
        "'jowzjan', 'kabul', 'kandahar', 'kapisa', 'khost', 'kunar', 'kunduz', "
        "'laghman', 'logar', 'nangarhar', 'nimroz', 'nuristan', 'paktya', 'paktika', "
        "'panjshir', 'parwan', 'samangan', 'sar_e_pul', 'takhar', 'uruzgan', "
        "'wardak', 'zabul'], target_groups = ['taliban', 'haqqani', 'criminals', "
        "'aq', 'hig', 'let', 'imu', 'judq', 'iju', 'hik', 'ttp', 'other']\n\n"
    )

    # Read the key directly: only the bearer token is needed here, so there is
    # no reason to build a synchronous API client for every request.
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY environment variable is not set")

    headers = {"Authorization": f"Bearer {api_key}"}
    payload = {
        "model": model,
        "response_format": {"type": "json_object"},
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": query},
        ],
        "temperature": 1,
    }

    # 35 seconds is the floor for the rate-limit pause; a larger
    # `retry_delay` lengthens it.
    rate_limit_pause = max(float(retry_delay), 35.0)

    # Initialised up front so the final message is valid even when the loop
    # body never runs (max_retries == 0).
    result = None
    for _ in range(max_retries):
        async with session.post(
            "https://api.openai.com/v1/chat/completions",
            headers=headers,
            json=payload,
        ) as response:
            result = await response.json()

        # The response is released before any pause so that a connection is
        # not held open while waiting.
        if "error" in result:
            error_message = result["error"]["message"]
            if "Rate limit reached" in error_message:
                print(
                    f"Rate limit exceeded. Retrying in {rate_limit_pause} seconds..."
                )
                await asyncio.sleep(rate_limit_pause)
                continue
            print(f"Error during prediction.\nFull result object: {result}")
            return ""

        try:
            content = result["choices"][0]["message"]["content"]
        except (KeyError, IndexError, TypeError):
            # Covers a missing key, an empty `choices` list and a null entry.
            print(f"Error during prediction.\nFull result object: {result}")
            return ""
        # A null content is reported as "" to honour the `str` return type.
        return content or ""

    print(f"Max retries exceeded for event.\nFull result object: {result}")
    return ""


async def get_gpt_predictions_async(
    model: str,
    events: List[IsafEvent],
    logging_n: int = 100,
    max_concurrent_requests: int = 5,
) -> List[IsafEvent]:
    """Query ``model`` for every event and record the answers on the events.

    One request per event is scheduled on a shared HTTP session, and a
    semaphore keeps at most ``max_concurrent_requests`` of them in flight.
    A progress line is printed for every ``logging_n``-th event as its
    request is scheduled. Each result is stored in
    ``event.predictions[model]``.

    Args:
        model: Name of the OpenAI chat model to query.
        events: Events to predict for; they are modified in place.
        logging_n: Print a progress line every this many events.
        max_concurrent_requests: Upper bound on simultaneous requests.

    Returns:
        The same ``events`` list that was passed in.
    """
    async with aiohttp.ClientSession() as session:
        gate = asyncio.Semaphore(max_concurrent_requests)

        async def throttled_query(http_session, target):
            # Wait for a free slot before the request is sent.
            async with gate:
                return await async_query_openai(
                    http_session, target.text, model, max_retries=5
                )

        pending = []
        for position, event in enumerate(events, start=1):
            if position % logging_n == 0:
                print(f"Predicting event {position} of {len(events)} using {model}")
            pending.append(asyncio.ensure_future(throttled_query(session, event)))

        # gather() returns results in scheduling order, so they line up with
        # `events`.
        outcomes = await asyncio.gather(*pending)
        for event, outcome in zip(events, outcomes):
            event.predictions[model] = outcome

    return events


async def main():
    events_4o = await get_gpt_predictions_async(
        "gpt-4o", events, max_concurrent_requests=10
    )
    events_4turbo = await get_gpt_predictions_async(
        "gpt-4-turbo", events_4o, max_concurrent_requests=10
    )
    full_events = await get_gpt_predictions_async(
        "gpt-3.5-turbo", events_4turbo, max_concurrent_requests=10
    )


await main()

So as you can now see, we have three predictions attached to each event.

In [17]:
# Inspect the first event; its `predictions` dict is keyed by model name.
print(events[0])

I have all these predictions living in memory right now so it's probably a good
time to commit these to a dataset and push them to the Hugging Face Hub in case
the notebook crashes or my local machine shuts down or something else
unexpected.

I'll create a function to handle this as we'll be repeating this process for the
other models as well. It's a bit verbose but I thought it preferable so you can
see what's going on.

In [26]:
from datasets import Dataset


def convert_to_dataset(data: List[IsafEvent]) -> Dataset:
    """Build a Hugging Face ``Dataset`` with one row per event.

    Every column holds the same-named attribute of each event, in the order
    the events appear in ``data``.

    Args:
        data: Events to convert.

    Returns:
        A ``Dataset`` whose columns follow the order of ``column_names``
        below.
    """
    # Column order of the resulting dataset; each name is also the attribute
    # read from every event.
    column_names = (
        "name",
        "text",
        "predictions",
        "start_date",
        "province",
        "target_group",
        "event_type",
        "min_killed",
        "min_captured",
        "killq",
        "captureq",
        "killcaptureraid",
        "airstrike",
        "noshotsfired",
        "min_leaders_killed",
        "min_leaders_captured",
    )
    columns = {
        column: [getattr(event, column) for event in data]
        for column in column_names
    }
    return Dataset.from_dict(columns)


def convert_and_push_dataset(events: List[IsafEvent], name: str):
    """Turn the events into a HF Dataset and upload it to the Hub.

    The dataset is pushed to ``strickvl/<name>`` as a private repository via
    a pull request, using the token from the ``HUGGINGFACE_API_KEY``
    environment variable.
    """
    upload_options = {
        "token": os.getenv("HUGGINGFACE_API_KEY"),
        "private": True,
        "create_pr": True,
    }
    repo_id = f"strickvl/{name}"
    convert_to_dataset(events).push_to_hub(repo_id, **upload_options)

A more concise and abstract version of the `convert_to_dataset` function could
be something like:

```python
def convert_to_dataset(data: List[BaseModel]) -> Dataset:
    """Build a ``Dataset`` from any list of Pydantic models of one class.

    One column is created per declared model field. Sets become lists and
    nested models become dicts; every other value is stored unchanged. An
    empty ``data`` list gives an empty dataset.
    """

    def _to_cell(value):
        # Dispatch on the value itself: field annotations such as
        # ``Set[str]`` are typing generics and cannot be compared with
        # ``set`` or passed to ``issubclass``.
        if isinstance(value, (set, frozenset)):
            return list(value)
        if isinstance(value, BaseModel):
            return value.model_dump()
        return value

    if not data:
        return Dataset.from_dict({})

    # `model_fields` is the pydantic v2 field registry; read it from the
    # class rather than the instance.
    field_names = type(data[0]).model_fields
    dataset_dict = {
        field_name: [_to_cell(getattr(item, field_name)) for item in data]
        for field_name in field_names
    }
    return Dataset.from_dict(dataset_dict)
```

But for now let's just push our data to the Hub.

In [None]:
# Upload the events, predictions included, to the
# `strickvl/isafpressreleases_with_preds` dataset repository.
convert_and_push_dataset(events, "isafpressreleases_with_preds")