In [1]:
# get data from datasets
from datasets import load_dataset
import pandas as pd
from rich import print

# Fetch only the "train" split of the ISAF press-release dataset
dataset = load_dataset(path="strickvl/isafpressreleases", split="train")

# Materialise the rows as a pandas DataFrame for tabular exploration
df = pd.DataFrame(data=dataset)

# Preview the top five rows
print(df.head(n=5))

Resolving data files:   0%|          | 0/4823 [00:00<?, ?it/s]

In [25]:
# Show the column labels available in the DataFrame
df.columns

Index(['name', 'eventrefnumber', 'text', 'StartDate', 'eventtype', 'province',
       'citydistrict', 'village', 'targetgroup', 'commander', 'position',
       'minkilled', 'mincaptured', 'capturedcharacterisation',
       'killedcharacterisation', 'killq', 'captureq', 'killcaptureraid',
       'airstrike', 'noshotsfired', 'dataprocessed', 'flagged', 'glossarymeta',
       'minleaderskilled', 'minfacilitatorskilled', 'minleaderscaptured',
       'minfacilitatorscaptured', 'leaderq'],
      dtype='object')

In [None]:
# Gather every distinct value found in the 'eventtype' column
eventtype_column = df["eventtype"]
eventtype_options = eventtype_column.unique().tolist()

print(eventtype_options)

In [3]:
# Display the Dataset summary (feature names and row count)
dataset

Dataset({
    features: ['name', 'eventrefnumber', 'text', 'StartDate', 'eventtype', 'province', 'citydistrict', 'village', 'targetgroup', 'commander', 'position', 'minkilled', 'mincaptured', 'capturedcharacterisation', 'killedcharacterisation', 'killq', 'captureq', 'killcaptureraid', 'airstrike', 'noshotsfired', 'dataprocessed', 'flagged', 'glossarymeta', 'minleaderskilled', 'minfacilitatorskilled', 'minleaderscaptured', 'minfacilitatorscaptured', 'leaderq'],
    num_rows: 4822
})

In [12]:
# Distinct province labels, printed in sorted order
province_column = df["province"]
province_options = province_column.unique().tolist()
print(sorted(province_options))



# Split the dataset into train/test

In [5]:
import datasets

# Load every available split of the dataset (returned as a dict-like keyed by split name)
dataset = load_dataset("strickvl/isafpressreleases")

# Perform the train-test split: 25% of the rows are held out for testing.
# A fixed seed makes the shuffle deterministic, so re-running this cell
# produces the same partition instead of a fresh random one each time.
train_test_split = dataset["train"].train_test_split(test_size=0.25, seed=42)

# Create a new dataset with the train and test splits
new_dataset = datasets.DatasetDict({
    "train": train_test_split["train"],
    "test": train_test_split["test"]
})

# Push the new dataset to the Hugging Face Hub.
# create_pr=True uploads the change as a pull request on the dataset repo
# rather than committing directly.
new_dataset.push_to_hub("strickvl/isafpressreleases", create_pr=True)

Resolving data files:   0%|          | 0/4823 [00:00<?, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/14.1k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/strickvl/isafpressreleases/commit/1d7154c648657896a1ccddbd13e3074cd97536f6', commit_message='Upload dataset', commit_description='', oid='1d7154c648657896a1ccddbd13e3074cd97536f6', pr_url='https://huggingface.co/datasets/strickvl/isafpressreleases/discussions/2', pr_revision='refs/pr/2', pr_num=2)

# Re-split the dataset with a smaller test set (15% instead of 25%)

In [2]:
import datasets
from datasets import load_dataset

# Load both existing splits so they can be recombined
dataset = load_dataset("strickvl/isafpressreleases")
train_dataset = dataset["train"]
test_dataset = dataset["test"]

# Merge the two splits back into a single dataset before re-partitioning
merged_dataset = datasets.concatenate_datasets([train_dataset, test_dataset])

# Perform the train-test split: 15% of the rows are held out for testing.
# A fixed seed makes the shuffle deterministic, so re-running this cell
# produces the same partition instead of a fresh random one each time.
train_test_split = merged_dataset.train_test_split(test_size=0.15, seed=42)

# Create a new dataset with the train and test splits
new_dataset = datasets.DatasetDict({
    "train": train_test_split["train"],
    "test": train_test_split["test"]
})

# Push the new dataset to the Hugging Face Hub.
# create_pr=True uploads the change as a pull request on the dataset repo
# rather than committing directly.
new_dataset.push_to_hub("strickvl/isafpressreleases", create_pr=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/strickvl/isafpressreleases/commit/28fe9c2ecae245b0af29675ad8b9056938caee85', commit_message='Upload dataset', commit_description='', oid='28fe9c2ecae245b0af29675ad8b9056938caee85', pr_url='https://huggingface.co/datasets/strickvl/isafpressreleases/discussions/3', pr_revision='refs/pr/3', pr_num=3)

# Load dataset into Pydantic models

In [7]:
# get data from datasets
from datasets import load_dataset
import pandas as pd
from rich import print

# Fetch only the "train" split of the ISAF press-release dataset
dataset = load_dataset(path="strickvl/isafpressreleases", split="train")

# Materialise the rows as a pandas DataFrame for tabular access
df = pd.DataFrame(data=dataset)

Downloading readme:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.33M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/453k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3616 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1206 [00:00<?, ? examples/s]

In [8]:
# Display the Dataset summary (feature names and row count) for the train split
dataset

Dataset({
    features: ['name', 'eventrefnumber', 'text', 'StartDate', 'eventtype', 'province', 'citydistrict', 'village', 'targetgroup', 'commander', 'position', 'minkilled', 'mincaptured', 'capturedcharacterisation', 'killedcharacterisation', 'killq', 'captureq', 'killcaptureraid', 'airstrike', 'noshotsfired', 'dataprocessed', 'flagged', 'glossarymeta', 'minleaderskilled', 'minfacilitatorskilled', 'minleaderscaptured', 'minfacilitatorscaptured', 'leaderq'],
    num_rows: 3616
})

## Pydantic models

In [13]:
from pydantic import BaseModel, Field
from datetime import date
from enum import Enum

# String-valued enumeration of the event-type labels accepted by
# IsafEvent.event_type. Mixing in `str` makes every member compare equal to
# its plain string value, so raw strings can be validated against it.
# NOTE(review): presumably mirrors the values of the dataset's 'eventtype'
# column — confirm against df['eventtype'].unique().
class EventType(str, Enum):
    airstrike = "airstrike"
    detention = "detention"
    captureandkill = "captureandkill"
    insurgentskilled = "insurgentskilled"
    exchangeoffire = "exchangeoffire"
    civiliancasualty = "civiliancasualty"

# String-valued enumeration of the province names accepted by
# IsafEvent.province. Values are lowercase; multi-word names are joined with
# underscores (e.g. "day_kundi", "sar_e_pul").
# NOTE(review): assumes the dataset's 'province' strings use exactly this
# casing and spelling — confirm against df['province'].unique().
class Province(str, Enum):
    badakhshan = "badakhshan"
    badghis = "badghis"
    baghlan = "baghlan"
    balkh = "balkh"
    bamyan = "bamyan"
    day_kundi = "day_kundi"
    farah = "farah"
    faryab = "faryab"
    ghazni = "ghazni"
    ghor = "ghor"
    helmand = "helmand"
    herat = "herat"
    jowzjan = "jowzjan"
    kabul = "kabul"
    kandahar = "kandahar"
    kapisa = "kapisa"
    khost = "khost"
    kunar = "kunar"
    kunduz = "kunduz"
    laghman = "laghman"
    logar = "logar"
    nangarhar = "nangarhar"
    nimroz = "nimroz"
    nuristan = "nuristan"
    paktya = "paktya"
    paktika = "paktika"
    panjshir = "panjshir"
    parwan = "parwan"
    samangan = "samangan"
    sar_e_pul = "sar_e_pul"
    takhar = "takhar"
    uruzgan = "uruzgan"
    wardak = "wardak"
    zabul = "zabul"

# Structured record describing a single ISAF event. Every field is required
# (none declares a default) and carries a human-readable description.
# The model is also handed to the LLM clients in this notebook as
# `response_model`, so the field descriptions double as extraction hints.
# NOTE(review): documented with `#` comments rather than a docstring because
# Pydantic may copy a class docstring into the generated schema — confirm
# before converting this to a docstring.
class IsafEvent(BaseModel):
    name: str = Field(description="A title or name for the event")
    start_date: date = Field(description="The start date of the event in YYYY-MM-DD format")
    end_date: date = Field(description="The end date of the event in YYYY-MM-DD format")
    event_type: EventType = Field(description="The event type")
    province: Province = Field(description="The province in which the event occurred")
    target_group: str = Field(description="The group that was targetted during the event.")
    min_killed: int = Field(description="The minimum number of people killed during the event")
    min_captured: int = Field(description="The minimum number of people captured during the event")
    killq: bool = Field(description="Whether someone was killed or not during the event")
    captureq: bool = Field(description="Whether someone was captured or not during the event")
    killcaptureraid: bool = Field(description="Whether the event was a so-called 'kill-capture raid'.")
    airstrike: bool = Field(description="Whether an airstrike was used during the event")
    noshotsfired: bool = Field(description="Whether no shots were fired during the event")
    min_leaders_killed: int = Field(description="The minimum number of leaders killed during the event")
    min_leaders_captured: int = Field(description="The minimum number of leaders captured during the event")

    # NOTE(review): all fields above use str, date, Enum, int or bool, so
    # `arbitrary_types_allowed` looks unnecessary here — confirm before removing.
    # NOTE(review): class-based `Config` is the Pydantic v1 style; under v2 it
    # is reportedly deprecated in favour of `model_config` — confirm against
    # the installed version.
    class Config:
        arbitrary_types_allowed = True


In [10]:
# Text of the press release stored in the row labelled 0
article_text = df.loc[0, "text"]
article_text

'ISAF Joint Command- Afghanistan\n2011-07-S-060\nFor Immediate Release\n\nFinally, in Ghazni district, Ghazni province, an Afghan-led security force detained several suspected insurgents while searching for a Taliban facilitator. The facilitator is responsible for procuring and disseminating weapons and supplies to insurgents in the area. The security force also confiscated vehicle-born improvised explosive device materials, grenades, an AK-47 assault rifle with several magazines and a chest rack.'

## Populating the data as Pydantic models

In [29]:
# Raw StartDate of the first row (evaluated but not rendered: a cell only
# shows the value of its last expression)
df["StartDate"].iloc[0]

# Same value reduced to a plain `datetime.date`
df["StartDate"].iloc[0].to_pydatetime().date()

datetime.date(2011, 7, 18)

In [None]:
from typing import List

# Build validated IsafEvent models from the first three DataFrame rows.
events: List[IsafEvent] = []

# NOTE(review): assumes the raw 'eventtype' / 'province' strings match the
# EventType / Province enum values exactly — confirm, otherwise validation fails.
# head(3) avoids materialising every row just to keep the first three.
for _, row in df.head(3).iterrows():
    start_date = row['StartDate'].to_pydatetime().date()

    # df.columns lists no 'EndDate' column, so indexing it raised KeyError.
    # Fall back to the start date when the value is absent or missing.
    # NOTE(review): treating the event as single-day is an assumption — confirm.
    end_raw = row.get('EndDate')
    end_date = end_raw.to_pydatetime().date() if pd.notna(end_raw) else start_date

    events.append(
        IsafEvent(
            name=row['name'],
            start_date=start_date,
            end_date=end_date,
            event_type=row['eventtype'],
            province=row['province'],
            # The DataFrame columns have no underscores (see df.columns);
            # only the model's field names use snake_case.
            target_group=row['targetgroup'],
            min_killed=row['minkilled'],
            min_captured=row['mincaptured'],
            killq=row['killq'],
            captureq=row['captureq'],
            killcaptureraid=row['killcaptureraid'],
            airstrike=row['airstrike'],
            noshotsfired=row['noshotsfired'],
            min_leaders_killed=row['minleaderskilled'],
            min_leaders_captured=row['minleaderscaptured'],
        )
    )

print(events)

In [None]:
# Extraction prompt: the press-release text is substituted into the template,
# followed by the list of fields the model is asked to pull out of it.
query = """
The following is a press release issued by ISAF (formerly operating in Afghanistan):
{article_text}

Please extract the following information from the press release:
- The name of the event
- The start date of the event
- The end date of the event
- The event type
- The province in which the event occurred
- The target group of the event
- The minimum number of people killed during the event
- The minimum number of people captured during the event
- Whether someone was killed or not during the event
- Whether someone was captured or not during the event
- Whether the event was a so-called 'kill-capture raid'
- Whether an airstrike was used during the event
- Whether no shots were fired during the event
- The minimum number of leaders killed during the event
- The minimum number of leaders captured during the event
""".format(article_text=article_text)

In [None]:
import instructor
import datetime
from openai import OpenAI

# Wrap the OpenAI client so that `create` accepts `response_model`.
# `instructor.from_openai` replaces the legacy `instructor.patch` call so this
# client is built the same way as the localhost-backed client in this notebook.
client = instructor.from_openai(OpenAI(), mode=instructor.Mode.MD_JSON)

# Ask the model to fill in an IsafEvent from the prompt; the response is
# returned as an instance of `response_model` rather than raw text.
openai_resp = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {
            "role": "user",
            "content": query,
        },
    ],
    response_model=IsafEvent,
)

print(openai_resp)

In [None]:
# Point an OpenAI-compatible client at the server listening on localhost:11434
# and wrap it with instructor so `create` understands `response_model`.
local_backend = OpenAI(
    api_key="ollama",  # a key must be supplied, but its value is not used
    base_url="http://localhost:11434/v1",
)
client = instructor.from_openai(local_backend, mode=instructor.Mode.JSON)

# Run the same extraction prompt against the "mixtral" model
mixtral_resp = client.chat.completions.create(
    response_model=IsafEvent,
    model="mixtral",
    messages=[{"role": "user", "content": query}],
)
print(mixtral_resp)