In [2]:
# get data from datasets
from datasets import load_dataset
import pandas as pd
from rich import print

# Load only the "train" split of the ISAF press releases dataset
dataset = load_dataset("strickvl/isafpressreleases", split="train")

# Convert the dataset to a pandas DataFrame; `df` is reused by the exploration cells below
df = pd.DataFrame(dataset)

# Preview the first few rows. Note: `print` here is rich's print (imported above), not the builtin
print(df.head())

In [3]:
# Show the column names available in the DataFrame
df.columns

Index(['name', 'eventrefnumber', 'text', 'StartDate', 'eventtype', 'province',
       'citydistrict', 'village', 'targetgroup', 'commander', 'position',
       'minkilled', 'mincaptured', 'capturedcharacterisation',
       'killedcharacterisation', 'killq', 'captureq', 'killcaptureraid',
       'airstrike', 'noshotsfired', 'dataprocessed', 'flagged', 'glossarymeta',
       'minleaderskilled', 'minfacilitatorskilled', 'minleaderscaptured',
       'minfacilitatorscaptured', 'leaderq'],
      dtype='object')

In [4]:
# Collect the distinct raw values found in the "eventtype" column
eventtype_options = df["eventtype"].unique().tolist()

print(eventtype_options)

In [5]:
# Display the Dataset summary (feature names and number of rows)
dataset

Dataset({
    features: ['name', 'eventrefnumber', 'text', 'StartDate', 'eventtype', 'province', 'citydistrict', 'village', 'targetgroup', 'commander', 'position', 'minkilled', 'mincaptured', 'capturedcharacterisation', 'killedcharacterisation', 'killq', 'captureq', 'killcaptureraid', 'airstrike', 'noshotsfired', 'dataprocessed', 'flagged', 'glossarymeta', 'minleaderskilled', 'minfacilitatorskilled', 'minleaderscaptured', 'minfacilitatorscaptured', 'leaderq'],
    num_rows: 3616
})

In [6]:
# Collect the distinct raw values found in the "province" column
province_options = df["province"].unique().tolist()
# NOTE(review): sorted() needs mutually comparable values; this assumes every entry is a
# string (no None/NaN) — confirm against the data.
print(sorted(province_options))

# Load dataset into Pydantic models

In [7]:
# get data from datasets
from datasets import load_dataset
import pandas as pd
from rich import print

# Reload the "train" split (same call as the first cell), rebinding `dataset`
dataset = load_dataset("strickvl/isafpressreleases", split="train")

# Convert the dataset to a pandas DataFrame; this `df` is the input of the population loop below
df = pd.DataFrame(dataset)

In [8]:
# Display the Dataset summary (feature names and number of rows)
dataset

Dataset({
    features: ['name', 'eventrefnumber', 'text', 'StartDate', 'eventtype', 'province', 'citydistrict', 'village', 'targetgroup', 'commander', 'position', 'minkilled', 'mincaptured', 'capturedcharacterisation', 'killedcharacterisation', 'killq', 'captureq', 'killcaptureraid', 'airstrike', 'noshotsfired', 'dataprocessed', 'flagged', 'glossarymeta', 'minleaderskilled', 'minfacilitatorskilled', 'minleaderscaptured', 'minfacilitatorscaptured', 'leaderq'],
    num_rows: 3616
})

## Pydantic models

In [9]:
from enum import Enum
from typing import Set
from pydantic import BaseModel, Field, field_validator, ValidationInfo
from datetime import date
from typing import Set, Annotated, Optional
from pydantic import BaseModel, Field, validator, ValidationInfo
from datetime import date


class EventType(str, Enum):
    """Event categories that can be assigned to an ISAF press release event.

    The ``str`` mixin makes every member compare equal to its plain string value.
    """
    # NOTE(review): there is no "other" member, although validate_event_type() returns
    # "other" for unrecognised input — confirm whether one is intended.
    airstrike = "airstrike"
    detention = "detention"
    captureandkill = "captureandkill"
    insurgentskilled = "insurgentskilled"
    exchangeoffire = "exchangeoffire"
    civiliancasualty = "civiliancasualty"


class Province(str, Enum):
    """Province identifiers that can be assigned to an event.

    Values are lower-case, with underscores in multi-word names (``day_kundi``,
    ``sar_e_pul``). The ``str`` mixin makes every member compare equal to its
    plain string value.
    """
    # NOTE(review): there is no "other" member, although validate_province() returns
    # "other" for unrecognised input — confirm whether one is intended.
    badakhshan = "badakhshan"
    badghis = "badghis"
    baghlan = "baghlan"
    balkh = "balkh"
    bamyan = "bamyan"
    day_kundi = "day_kundi"
    farah = "farah"
    faryab = "faryab"
    ghazni = "ghazni"
    ghor = "ghor"
    helmand = "helmand"
    herat = "herat"
    jowzjan = "jowzjan"
    kabul = "kabul"
    kandahar = "kandahar"
    kapisa = "kapisa"
    khost = "khost"
    kunar = "kunar"
    kunduz = "kunduz"
    laghman = "laghman"
    logar = "logar"
    nangarhar = "nangarhar"
    nimroz = "nimroz"
    nuristan = "nuristan"
    paktya = "paktya"
    paktika = "paktika"
    panjshir = "panjshir"
    parwan = "parwan"
    samangan = "samangan"
    sar_e_pul = "sar_e_pul"
    takhar = "takhar"
    uruzgan = "uruzgan"
    wardak = "wardak"
    zabul = "zabul"


class TargetGroup(str, Enum):
    """Identifiers for the groups an event may have targeted.

    ``other`` is the catch-all value; validate_target_group() also returns it for
    any unrecognised input. The ``str`` mixin makes every member compare equal to
    its plain string value.
    """
    # Short codes (aq, hig, let, ...) are kept exactly as spelled here; their expansions
    # are not documented in this notebook.
    taliban = "taliban"
    haqqani = "haqqani"
    criminals = "criminals"
    aq = "aq"
    hig = "hig"
    let = "let"
    imu = "imu"
    judq = "judq"
    iju = "iju"
    hik = "hik"
    ttp = "ttp"
    other = "other"


def validate_event_type(value: str):
    """Normalise an event-type label.

    Args:
        value: Raw event-type string; matching is case-insensitive.

    Returns:
        The lower-cased label when it is one of the recognised event types,
        otherwise the string ``"other"``.
    """
    recognised = {
        "airstrike",
        "detention",
        "captureandkill",
        "insurgentskilled",
        "exchangeoffire",
        "civiliancasualty",
    }
    lowered = value.lower()
    return lowered if lowered in recognised else "other"


def validate_province(value: str):
    """Normalise a province label.

    Args:
        value: Raw province string; matching is case-insensitive.

    Returns:
        The lower-cased label when it is one of the recognised provinces,
        otherwise the string ``"other"``.
    """
    recognised = {
        "badakhshan", "badghis", "baghlan", "balkh", "bamyan", "day_kundi",
        "farah", "faryab", "ghazni", "ghor", "helmand", "herat",
        "jowzjan", "kabul", "kandahar", "kapisa", "khost", "kunar",
        "kunduz", "laghman", "logar", "nangarhar", "nimroz", "nuristan",
        "paktya", "paktika", "panjshir", "parwan", "samangan", "sar_e_pul",
        "takhar", "uruzgan", "wardak", "zabul",
    }
    lowered = value.lower()
    return lowered if lowered in recognised else "other"


def validate_target_group(value: str):
    """Normalise a target-group label.

    Args:
        value: Raw target-group string; matching is case-insensitive.

    Returns:
        The lower-cased label when it is one of the recognised groups
        (``"other"`` is itself a recognised value), otherwise ``"other"``.
    """
    recognised = {
        "taliban", "haqqani", "criminals", "aq", "hig", "let",
        "imu", "judq", "iju", "hik", "ttp", "other",
    }
    lowered = value.lower()
    return lowered if lowered in recognised else "other"


class IsafEvent(BaseModel):
    """Structured record of a single event described in an ISAF press release.

    The three set-valued fields (``event_type``, ``province`` and
    ``target_group``) are normalised during validation: every member is passed
    through ``validate_event_type`` / ``validate_province`` /
    ``validate_target_group``, which lower-case the value and map anything
    outside the recognised vocabulary to ``"other"``.
    """

    # Pydantic v2 style configuration (replaces the deprecated nested ``class Config``).
    model_config = {"arbitrary_types_allowed": True}

    name: str = Field(
        description="A title or name for the event which summarises the event as a headline"
    )
    text: Optional[str] = Field(description="The full text of the press release")
    start_date: date = Field(
        description="The start date of the event in YYYY-MM-DD format"
    )
    event_type: Set[str] = Field(
        description="The event type. Can be multiple types."
    )
    province: Set[str] = Field(
        description="The province in which the event occurred. Can be multiple provinces."
    )
    target_group: Set[str] = Field(
        description="The group that was targetted during the event. Can be multiple groups."
    )
    min_killed: int = Field(
        description="The minimum number of people killed during the event"
    )
    min_captured: int = Field(
        description="The minimum number of people captured during the event"
    )
    killq: bool = Field(
        description="Whether someone was killed or not during the event"
    )
    captureq: bool = Field(
        description="Whether someone was captured or not during the event"
    )
    killcaptureraid: bool = Field(
        description="Whether the event was a so-called 'kill-capture raid'."
    )
    airstrike: bool = Field(
        description="Whether an airstrike was used during the event"
    )
    noshotsfired: bool = Field(
        description="Whether no shots were fired during the event"
    )
    min_leaders_killed: int = Field(
        description="The minimum number of leaders killed during the event"
    )
    min_leaders_captured: int = Field(
        description="The minimum number of leaders captured during the event"
    )

    # Why field validators: ``Field(validator=...)`` is not a Pydantic v2 feature. The
    # callable was only stored as extra field metadata and never executed, so the
    # normalisers silently did nothing. Each validator below runs after the value has
    # been coerced to ``Set[str]`` and applies the normaliser to every member.
    @field_validator("event_type")
    @classmethod
    def normalise_event_type(cls, values: Set[str]) -> Set[str]:
        """Apply ``validate_event_type`` to every member of ``event_type``."""
        return {validate_event_type(item) for item in values}

    @field_validator("province")
    @classmethod
    def normalise_province(cls, values: Set[str]) -> Set[str]:
        """Apply ``validate_province`` to every member of ``province``."""
        return {validate_province(item) for item in values}

    @field_validator("target_group")
    @classmethod
    def normalise_target_group(cls, values: Set[str]) -> Set[str]:
        """Apply ``validate_target_group`` to every member of ``target_group``."""
        return {validate_target_group(item) for item in values}

## Populating the data as Pydantic models

In [10]:
from typing import List

# Build one validated IsafEvent per DataFrame row.
events: List[IsafEvent] = []

# iterrows() is consumed lazily; wrapping it in list() only materialised every
# (index, row) pair up front without changing the result.
for _row_index, row in df.iterrows():
    # These three columns are split on commas, then each piece is trimmed and lower-cased.
    event_types = {
        eventtype.strip().lower() for eventtype in row["eventtype"].split(",")
    }
    provinces = {province.strip().lower() for province in row["province"].split(",")}
    target_groups = {
        target_group.strip().lower() for target_group in row["targetgroup"].split(",")
    }

    events.append(
        IsafEvent(
            name=row["name"],
            text=row["text"],
            # Keep only the calendar date of the timestamp.
            start_date=row["StartDate"].to_pydatetime().date(),
            event_type=event_types,
            province=provinces,
            target_group=target_groups,
            min_killed=int(row["minkilled"]),
            min_captured=int(row["mincaptured"]),
            # Flag columns are compared against the string "true" to obtain booleans.
            killq=row["killq"] == "true",
            captureq=row["captureq"] == "true",
            killcaptureraid=row["killcaptureraid"] == "true",
            airstrike=row["airstrike"] == "true",
            noshotsfired=row["noshotsfired"] == "true",
            min_leaders_killed=int(row["minleaderskilled"]),
            min_leaders_captured=int(row["minleaderscaptured"]),
        )
    )

# Show the first two events as a quick sanity check.
print(events[:2])

In [11]:
from rich import inspect

# inspect(events[0], methods=True)

# NOTE: only the last expression of a cell is displayed, so the value of this line is not shown
events[0].text
# Serialise the first event to a JSON string (this is the displayed cell output)
events[0].model_dump_json()

'{"name":"Several suspected insurgents detained in Ghazni","text":"ISAF Joint Command- Afghanistan\\n2011-07-S-060\\nFor Immediate Release\\n\\nFinally, in Ghazni district, Ghazni province, an Afghan-led security force detained several suspected insurgents while searching for a Taliban facilitator. The facilitator is responsible for procuring and disseminating weapons and supplies to insurgents in the area. The security force also confiscated vehicle-born improvised explosive device materials, grenades, an AK-47 assault rifle with several magazines and a chest rack.","start_date":"2011-07-18","event_type":["detention"],"province":["ghazni"],"target_group":["taliban"],"min_killed":0,"min_captured":3,"killq":false,"captureq":true,"killcaptureraid":true,"airstrike":false,"noshotsfired":false,"min_leaders_killed":0,"min_leaders_captured":0}'

# Create a new dataset for finetuning

In [None]:
from datasets import Dataset, DatasetDict, load_dataset
import pandas as pd
from rich import print
from typing import List
import json

# Load the dataset without a `split=` argument so it can be indexed by split name
# below (dataset["train"], dataset["test"]); this rebinds the earlier `dataset` variable
dataset = load_dataset("strickvl/isafpressreleases")

def process_dataframe(df: pd.DataFrame) -> List[dict]:
    """Convert raw press-release rows into text / JSON-string pairs.

    Every row is first validated by building an ``IsafEvent`` from it, then
    serialised with ``model_dump_json()``. This is the Pydantic v2 API used
    elsewhere in this notebook; the v1-style ``.json()`` call is deprecated.

    Args:
        df: DataFrame providing the columns read below: ``name``, ``text``,
            ``StartDate``, ``eventtype``, ``province``, ``targetgroup``,
            ``minkilled``, ``mincaptured``, ``killq``, ``captureq``,
            ``killcaptureraid``, ``airstrike``, ``noshotsfired``,
            ``minleaderskilled`` and ``minleaderscaptured``.

    Returns:
        One dict per row, in row order, with the keys ``"text"`` (the press
        release text) and ``"json_string"`` (the complete event serialised as
        JSON, including its ``text`` field).
    """
    events: List[IsafEvent] = []

    # iterrows() is consumed lazily; the row index is not needed.
    for _, row in df.iterrows():
        # These three columns are split on commas, then each piece is trimmed and lower-cased.
        event_types = {
            eventtype.strip().lower() for eventtype in row["eventtype"].split(",")
        }
        provinces = {
            province.strip().lower() for province in row["province"].split(",")
        }
        target_groups = {
            target_group.strip().lower()
            for target_group in row["targetgroup"].split(",")
        }

        events.append(
            IsafEvent(
                name=row["name"],
                text=row["text"],
                # Keep only the calendar date of the timestamp.
                start_date=row["StartDate"].to_pydatetime().date(),
                event_type=event_types,
                province=provinces,
                target_group=target_groups,
                min_killed=int(row["minkilled"]),
                min_captured=int(row["mincaptured"]),
                # Flag columns are compared against the string "true" to obtain booleans.
                killq=row["killq"] == "true",
                captureq=row["captureq"] == "true",
                killcaptureraid=row["killcaptureraid"] == "true",
                airstrike=row["airstrike"] == "true",
                noshotsfired=row["noshotsfired"] == "true",
                min_leaders_killed=int(row["minleaderskilled"]),
                min_leaders_captured=int(row["minleaderscaptured"]),
            )
        )

    return [
        {
            "text": event.text,
            "json_string": event.model_dump_json(),
        }
        for event in events
    ]

# Convert each split to a DataFrame
# NOTE(review): assumes the loaded dataset exposes both a "train" and a "test" split — confirm
train_df = pd.DataFrame(dataset["train"])
test_df = pd.DataFrame(dataset["test"])

train_data = process_dataframe(train_df)
test_data = process_dataframe(test_df)

# Create Datasets from the processed lists of {"text", "json_string"} dicts
train_dataset = Dataset.from_list(train_data)
test_dataset = Dataset.from_list(test_data)

# Create a DatasetDict with the train and test splits
dataset_dict = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

# Push the dataset to the Hugging Face Hub (remote side effect)
# NOTE(review): presumably requires being authenticated with write access to this repo — verify
dataset_dict.push_to_hub("strickvl/isaf_press_releases_ft")

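# Write JSONL data file

Each line of the output file holds one record in the `conversations` format shown below. The example comes from an unrelated Honeycomb query task and is included only to illustrate the structure.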

```json
{
    "conversations": [
        {
            "from": "system",
            "value": "Honeycomb is an observability platform that allows you to write queries to inspect trace data. You are an assistant that takes a natural language query (NLQ) and a list of valid columns and produce a Honeycomb query."
        },
        {
            "from": "human",
            "value": "\n\nNLQ: \"group by HTTP method\"\n\nColumns: ['query_string_num_tokens', 'query_string_length', 'data_queries', 'http.target', 'task.id', 'trace_root.http.target', 'topic', 'http.host', 'total_hits', 'db.user', 'domain_types', 'db.name', 'graphql.document', 'history', 'http.scheme', 'http.method', 'frontend.version', 'disposition_for_dBVVysC8x4Ymwg9rtjMckgw9', 'db.system', 'event_name', 'organization', 'auth.logout', 'organizations', 'name', 'net.transport', 'db.operation', 'disposition_for_UvsPPBVUn9FDuzDjsjYCqopq', 'disposition_for_1RUGSd7GdnP5tuKdgqBRZUm2', 'process.pid', 'disposition_for_6uyAoBc3PuvEcTTPFgPM3Rtk', 'exception.stacktrace', 'data_ingestion_individuals_count', 'disposition_for_qrnUBUz8YBfNX7Liekq6nKi3', 'task_type.type', 'disposition_for_JQDNbuUdaQcEbEwQNxUbV5EF', 'disposition_for_rAcWoXfbHw4eWoJFH4ZcY8ue', 'disposition_for_eShqQoC9jUi9VQBidpp2oXHP', 'parent_name', 'template', 'graphql.operation.name', 'span.num_links', 'disposition_for_kNSPtvsCWkDoEyFP2QE6VPmQ', 'disposition_for_UUqf9L1qkFxDNEvcgsVMA2yy', 'disposition_for_vwbbN76HZ7uitLubvkUjPFQE', 'disposition_for_aAto1pGrdF5RunpSX8sY5hvn', 'disposition_for_UbKCMdnkPQ6TuHrfdBo5juZu', 'disposition_for_QfrvmoHxSgLPJXPKZCrZfGo8', 'disposition_for_NoKSSruBRCX6UG28PzmkybUd', 'disposition_for_UZAqvZ5XVBZjKKWuMeRkRayS', 'organization_token', 'duration_ms', 'trace.parent_id', 'db.statement', 'exception.message', 'error', 'service.name', 'http.status_code', 'http.route']"
        },
        {
            "from": "gpt",
            "value": "\n{\"breakdowns\": [\"http.method\"], \"calculations\": [{\"op\": \"COUNT\"}], \"time_range\": 7200}"
        }
    ]
}
```

In [12]:
import os
from datasets import load_dataset
import pandas as pd
from rich import print
from typing import List
import json

# Load the dataset without a `split=` argument; only dataset["train"] is used below
dataset = load_dataset("strickvl/isafpressreleases")
# Output location of the JSONL file (a relative path), read by write_data_to_jsonl()
target_file_path = "../data/isaf_press_releases_ft.jsonl"

def write_data_to_jsonl(df: pd.DataFrame, target_path: Optional[str] = None) -> None:
    """Write one conversation-style training record per row to a JSONL file.

    Every row is validated by building an ``IsafEvent`` from it. Each event
    then becomes a ``{"conversations": [...]}`` record with a fixed system
    message, a human message carrying the press release text, and a "gpt"
    message carrying the event serialised as JSON without its ``text`` field.

    Args:
        df: DataFrame providing the columns read below: ``name``, ``text``,
            ``StartDate``, ``eventtype``, ``province``, ``targetgroup``,
            ``minkilled``, ``mincaptured``, ``killq``, ``captureq``,
            ``killcaptureraid``, ``airstrike``, ``noshotsfired``,
            ``minleaderskilled`` and ``minleaderscaptured``.
        target_path: Destination file. When omitted, the module-level
            ``target_file_path`` is used, as before.

    Returns:
        None. The destination file is overwritten if it already exists.
    """
    output_path = target_file_path if target_path is None else target_path

    # The system message is identical for every record, so define it once.
    system_prompt = "You are an expert at identifying events in a press release. You are precise and always make sure you are correct, drawing inference from the text of the press release. event_types = ['airstrike', 'detention', 'captureandkill', 'insurgentskilled', 'exchangeoffire', 'civiliancasualty'], provinces = ['badakhshan', 'badghis', 'baghlan', 'balkh', 'bamyan', 'day_kundi', 'farah', 'faryab', 'ghazni', 'ghor', 'helmand', 'herat', 'jowzjan', 'kabul', 'kandahar', 'kapisa', 'khost', 'kunar', 'kunduz', 'laghman', 'logar', 'nangarhar', 'nimroz', 'nuristan', 'paktya', 'paktika', 'panjshir', 'parwan', 'samangan', 'sar_e_pul', 'takhar', 'uruzgan', 'wardak', 'zabul'], target_groups = ['taliban', 'haqqani', 'criminals', 'aq', 'hig', 'let', 'imu', 'judq', 'iju', 'hik', 'ttp', 'other']"

    # Validate every row before touching the file, so a bad row cannot leave a
    # truncated or half-written output behind.
    events: List[IsafEvent] = []
    for _, row in df.iterrows():
        # These three columns are split on commas, then each piece is trimmed and lower-cased.
        event_types = {
            eventtype.strip().lower() for eventtype in row["eventtype"].split(",")
        }
        provinces = {
            province.strip().lower() for province in row["province"].split(",")
        }
        target_groups = {
            target_group.strip().lower()
            for target_group in row["targetgroup"].split(",")
        }

        events.append(
            IsafEvent(
                name=row["name"],
                text=row["text"],
                # Keep only the calendar date of the timestamp.
                start_date=row["StartDate"].to_pydatetime().date(),
                event_type=event_types,
                province=provinces,
                target_group=target_groups,
                min_killed=int(row["minkilled"]),
                min_captured=int(row["mincaptured"]),
                # Flag columns are compared against the string "true" to obtain booleans.
                killq=row["killq"] == "true",
                captureq=row["captureq"] == "true",
                killcaptureraid=row["killcaptureraid"] == "true",
                airstrike=row["airstrike"] == "true",
                noshotsfired=row["noshotsfired"] == "true",
                min_leaders_killed=int(row["minleaderskilled"]),
                min_leaders_captured=int(row["minleaderscaptured"]),
            )
        )

    # os.makedirs("") raises, so only create the parent directory when the
    # path actually has a directory component.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit encoding keeps the output independent of the platform default.
    with open(output_path, "w", encoding="utf-8") as f:
        for event in events:
            record = {
                "conversations": [
                    {"from": "system", "value": system_prompt},
                    {
                        "from": "human",
                        "value": f"PRESS RELEASE TEXT: {event.text}",
                    },
                    {
                        "from": "gpt",
                        "value": event.model_dump_json(exclude={"text"}),
                    },
                ]
            }
            f.write(json.dumps(record) + "\n")

# Only the "train" split is converted and written to the JSONL file
train_df = pd.DataFrame(dataset["train"])
write_data_to_jsonl(train_df)
