We first need to take some samples and see how well GPT-4 does at extracting the data.

In [20]:
from rich import print

In [3]:
# get data from datasets
from datasets import load_dataset
import pandas as pd

# Load the "train" split of the ISAF press-release dataset
dataset = load_dataset("strickvl/isafpressreleases", split="train")

# Convert the dataset to a pandas DataFrame
df = pd.DataFrame(dataset)

# Preview the first few rows. The expression is left bare (rather than wrapped
# in print()) so the notebook renders the frame with its rich table display
# instead of wrapped, truncated plain text.
df.head()

Downloading data: 100%|██████████| 4822/4822 [00:10<00:00, 456.45files/s]  
Generating train split: 100%|██████████| 4822/4822 [00:02<00:00, 2232.76 examples/s]


                              name  eventrefnumber  \
0          Taliban Compound Struck  2009-11-CA-056   
1   Militants Detained in Kandahar  2009-11-CA-056   
2      Militants Detained in Khost  2009-12-CA–057   
3     Militants Detained in Wardak  2009-12-CA–057   
4  Insurgents Detained in Kandahar  2009-12-CA-058   

                                                text  StartDate  eventtype  \
0  Dec. 2: Taliban Compound Struck\n\nNEWS RELEAS... 2009-12-01  airstrike   
1  Militants Detained in Kandahar\nNEWS RELEASE I... 2009-12-02  detention   
2  Dec. 3: Militants Detained in Khwost\nNEWS REL... 2009-12-02  detention   
3  Dec. 3: Militants Detained in Wardak\n\n\n\nNE... 2009-12-03  detention   
4  Dec. 4: Insurgents Detained in Kandahar\nISAF ... 2009-12-04  detention   

   province     citydistrict       village targetgroup commander  ...  \
0     Kunar     Dara-ye Noor                   Taliban            ...   
1  Kandahar    Kandahar City                   Taliban      

In [4]:
# Inspect the column names of the loaded DataFrame.
df.columns

Index(['name', 'eventrefnumber', 'text', 'StartDate', 'eventtype', 'province',
       'citydistrict', 'village', 'targetgroup', 'commander', 'position',
       'minkilled', 'mincaptured', 'capturedcharacterisation',
       'killedcharacterisation', 'killq', 'captureq', 'killcaptureraid',
       'airstrike', 'noshotsfired', 'dataprocessed', 'flagged', 'glossarymeta',
       'minleaderskilled', 'minfacilitatorskilled', 'minleaderscaptured',
       'minfacilitatorscaptured', 'leaderq'],
      dtype='object')

In [5]:
# Gather each distinct value of the `eventtype` column, in order of first
# appearance, as a plain Python list.
eventtype_options = (
    df.loc[:, "eventtype"]
    .unique()
    .tolist()
)

print(eventtype_options)

['airstrike', 'detention', 'captureandkill', 'insurgentskilled', 'exchangeoffire', '', 'civiliancasualty', '2010-07-CA-124', 'insurgentskilled;civiliancasualty', 'airstrike;detention', 'detention;airstrike', 'civiliancasualty;airstrike', 'airstrike;civiliancasualty', 'insurgentskilled;detention', 'detention;insurgentskilled']


In [14]:
from datetime import date
from enum import Enum

from pydantic import BaseModel, ConfigDict, Field

# Kinds of event a press release can be classified as. Built with the
# functional Enum API: `type=str` mixes in `str`, and every member's value is
# identical to its name.
EventType = Enum(
    "EventType",
    [
        (label, label)
        for label in (
            "airstrike",
            "detention",
            "captureandkill",
            "insurgentskilled",
            "exchangeoffire",
            "civiliancasualty",
        )
    ],
    type=str,
)

# Province identifiers accepted by the extraction schema. Built with the
# functional Enum API: `type=str` mixes in `str`, and every member's value is
# identical to its (lower-case, underscore-separated) name.
Province = Enum(
    "Province",
    [
        (slug, slug)
        for slug in (
            "badakhshan",
            "badghis",
            "baghlan",
            "balkh",
            "bamyan",
            "day_kundi",
            "farah",
            "faryab",
            "ghazni",
            "ghor",
            "helmand",
            "herat",
            "jawzjan",
            "kabul",
            "kandahar",
            "kapisa",
            "khost",
            "kunar",
            "kunduz",
            "laghman",
            "logar",
            "nangarhar",
            "nimroz",
            "nuristan",
            "paktia",
            "paktika",
            "panjshir",
            "parwan",
            "samangan",
            "sar_e_pol",
            "takhar",
            "uruzgan",
            "wardak",
            "zabul",
        )
    ],
    type=str,
)

class IsafEvent(BaseModel):
    """Structured data extracted from a single ISAF press release.

    Used as the ``response_model`` for the chat-completion calls in this
    notebook. No field declares a default, so every field is required.
    """

    # Pydantic v2 configuration. This replaces the v1-style nested
    # ``class Config``, which is deprecated in v2 and triggers a deprecation
    # warning when the class is defined.
    # NOTE(review): every field below uses a type Pydantic handles natively,
    # so `arbitrary_types_allowed` looks unnecessary — confirm before removing.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    # --- Identification and timing ---
    name: str = Field(description="A title or name for the event")
    start_date: date = Field(description="The start date of the event in YYYY-MM-DD format")
    end_date: date = Field(description="The end date of the event in YYYY-MM-DD format")

    # --- Classification ---
    # NOTE(review): only a single EventType can be stored here; confirm how
    # events that fall into more than one category should be represented.
    event_type: EventType = Field(description="The event type")
    province: Province = Field(description="The province in which the event occurred")
    target_group: str = Field(description="The group that was targetted during the event.")

    # --- Casualty / capture counts and flags ---
    min_killed: int = Field(description="The minimum number of people killed during the event")
    min_captured: int = Field(description="The minimum number of people captured during the event")
    killq: bool = Field(description="Whether someone was killed or not during the event")
    captureq: bool = Field(description="Whether someone was captured or not during the event")
    killcaptureraid: bool = Field(description="Whether the event was a so-called 'kill-capture raid'.")
    airstrike: bool = Field(description="Whether an airstrike was used during the event")
    noshotsfired: bool = Field(description="Whether no shots were fired during the event")
    min_leaders_killed: int = Field(description="The minimum number of leaders killed during the event")
    min_leaders_captured: int = Field(description="The minimum number of leaders captured during the event")



In [15]:
# Take the press-release text from the row labelled 0 as the sample article,
# then echo it as the cell output.
article_text = df.loc[0, "text"]
article_text

"Dec. 2: Taliban Compound Struck\n\nNEWS RELEASE\u2028ISAF Joint Command - Afghanistan\u2028\u20282009-11-CA-056\u2028For Immediate Release\u2028\u2028KABUL, Afghanistan (Dec. 02) - International forces conducted an air strike against a Taliban commander in a remote area of eastern Afghanistan yesterday.\u2028\u2028The Taliban commander was the target of the precision strike in Kunar province's Dara Noor district, which occurred \u2028in an open area away from civilian compounds or infrastructure. \u2028Assessment of the strike continues."

In [16]:
# Prompt sent to the models: the press release text followed by a bulleted
# list of the items to extract (one bullet per line, each newline-terminated).
query = (
    "\n"
    "The following is a press release issued by ISAF (formerly operating in Afghanistan):\n"
    f"{article_text}\n"
    "\n"
    "Please extract the following information from the press release:\n"
    + "".join(
        f"- {item}\n"
        for item in (
            "The name of the event",
            "The start date of the event",
            "The end date of the event",
            "The event type",
            "The province in which the event occurred",
            "The target group of the event",
            "The minimum number of people killed during the event",
            "The minimum number of people captured during the event",
            "Whether someone was killed or not during the event",
            "Whether someone was captured or not during the event",
            "Whether the event was a so-called 'kill-capture raid'",
            "Whether an airstrike was used during the event",
            "Whether no shots were fired during the event",
            "The minimum number of leaders killed during the event",
            "The minimum number of leaders captured during the event",
        )
    )
)

In [21]:
import instructor
import datetime
from openai import OpenAI

# Wrap the OpenAI client with instructor so that `create` accepts
# `response_model`. `instructor.from_openai` is used instead of the legacy
# `instructor.patch` so this cell builds its client the same way as the
# Ollama-backed cell in this notebook. No API key is passed explicitly here.
client = instructor.from_openai(OpenAI(), mode=instructor.Mode.MD_JSON)

# Send the extraction prompt as a single user message and have the reply
# parsed into an `IsafEvent`.
# NOTE(review): the notebook's opening line talks about GPT-4, but this
# request targets gpt-3.5-turbo — confirm which model is intended.
openai_resp = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {
            "role": "user",
            "content": query,
        },
    ],
    response_model=IsafEvent,
)

print(openai_resp)

In [22]:
# Build an instructor-wrapped OpenAI client aimed at a server on
# localhost:11434, so that `create` accepts `response_model`.
# The api_key argument is required by the client but its value is unused.
client = instructor.from_openai(
    OpenAI(api_key="ollama", base_url="http://localhost:11434/v1"),
    mode=instructor.Mode.JSON,
)

# Same prompt as a single user message, answered by the "mixtral" model and
# parsed into an `IsafEvent`.
mixtral_resp = client.chat.completions.create(
    response_model=IsafEvent,
    model="mixtral",
    messages=[{"role": "user", "content": query}],
)
print(mixtral_resp)