# This script converts the raw training data that ChatGPT created into two formats: JSON and BIO (B-/I-/O token tags).

In [None]:
import argparse
import re
import pandas as pd
import numpy as np

def _split_on_spaces(text):
    """Split ``text`` on single spaces, dropping empty tokens.

    Empty strings appear when the text has leading, trailing or repeated
    spaces; they carry no word and must not end up as rows of the BIO output.
    """
    return [token for token in text.split(" ") if token]


def _find_covering_annotation(position, annotations):
    """Return the first annotation tuple whose tag span contains ``position``.

    Returns None when ``position`` lies outside every annotated tag.
    """
    for annotation in annotations:
        match = annotation[0]
        if match.start() <= position < match.end():
            return annotation
    return None


def _format_reference_date(date_today):
    """Return ``date_today`` as a JSON-serialisable value (string or None)."""
    if date_today is None:
        return None
    # Date-like objects (anything exposing strftime) are rendered as a plain
    # ISO day so the stored value does not carry a time component such as
    # "2025-06-28 00:00:00".
    if hasattr(date_today, "strftime"):
        return date_today.strftime("%Y-%m-%d")
    return str(date_today)


def convert_to_bio_and_json(user_message, chatbot_message, annotations, sep_token="<USER_MESSAGE>", date_today=None):
    """Build the BIO token list and the JSON annotation records for one sample.

    Args:
        user_message: The user's reply, still containing the inline tags.
        chatbot_message: The untagged chatbot question shown before the reply.
        annotations: Iterable of ``(match, annotation_type, start_index,
            end_index, true_value)`` tuples, where ``match`` is the
            ``re.Match`` of the whole tag inside ``user_message``.
        sep_token: Token inserted between the chatbot and the user tokens.
        date_today: Reference date stored with LAST_SPRAY_DATE / PLANT_DATE
            annotations. Stored as None when not supplied.

    Returns:
        ``(bio_line, values)``: ``bio_line`` is a list of ``(token, label)``
        pairs (chatbot tokens, separator, user tokens) and ``values`` is a
        list of dicts, one per annotation, in order of appearance.
    """
    # Chatbot tokens are never annotated, so they are all labelled "O".
    bio_line = [(token, "O") for token in _split_on_spaces(chatbot_message)]
    bio_line.append((sep_token, "O"))

    values = []
    position = 0
    while position < len(user_message):
        covering = _find_covering_annotation(position, annotations)

        if covering is None:
            # Plain text: consume up to the next space (or the end of the line).
            next_whitespace = user_message.find(" ", position)
            if next_whitespace == -1:
                next_whitespace = len(user_message)
            token = user_message[position:next_whitespace]
            if token:
                bio_line.append((token, "O"))
            position = next_whitespace + 1
            continue

        match, annotation_type, start_index, end_index, true_value = covering

        # Text between the opening tag's '>' and the closing tag's '<'.
        surface_value = re.search(r">(.*)<", match.group()).group(1)
        for i, token in enumerate(_split_on_spaces(surface_value)):
            prefix = "B" if i == 0 else "I"
            bio_line.append((token, f"{prefix}-{annotation_type}"))

        # Jump past the closing tag and the single character that follows it.
        # NOTE(review): that character is a separator space or trailing
        # punctuation such as "."; punctuation is therefore dropped, not
        # emitted as a token. Kept as in the original behaviour.
        position = match.end() + 1

        value = {
            "start_index": start_index,
            "end_index": end_index,
            "true_value": true_value,
            "surface_value": surface_value,
            "annotation_type": annotation_type}

        if annotation_type in ("LAST_SPRAY_DATE", "PLANT_DATE"):
            value["date_today"] = _format_reference_date(date_today)

        values.append(value)

    return bio_line, values


def parse_line(user_message, chatbot_message, date_today):
    """Extract the inline tags from ``user_message`` and convert the sample.

    Recognised tags are ``<TIMEX3 ...>``, ``<LOCATION>`` and ``<POTATO>``.
    A TIMEX3 tag is labelled with its ``option`` attribute when present
    (otherwise "TIMEX3") and its true value is the ``value`` attribute; the
    other tags use the tag name as label and the tag-stripped text as value.

    Args:
        user_message: The user's reply containing inline tags.
        chatbot_message: The chatbot question preceding the reply.
        date_today: Reference date forwarded to ``convert_to_bio_and_json``.

    Returns:
        The ``(bio_line, values)`` tuple from ``convert_to_bio_and_json``.

    Raises:
        ValueError: If a TIMEX3 tag has no ``value`` attribute.
    """
    annotations = []

    for tag in ("TIMEX3", "LOCATION", "POTATO"):
        pattern = rf'<{tag}\b[^>]*?>.*?<\/{tag}>'
        for match in re.finditer(pattern, user_message):
            tagged_text = match.group()
            if tag == "TIMEX3":
                # The label is kept in its own variable: overwriting the loop
                # variable would make later TIMEX3 matches take the wrong branch.
                option = re.search(r'option="([^"]+)"', tagged_text)
                label = option.group(1) if option else tag
                value = re.search(r'value="([^"]+)"', tagged_text)
                if value is None:
                    raise ValueError(f"TIMEX3 tag without a value attribute: {tagged_text}")
                # NOTE(review): these offsets are relative to the tag text,
                # whereas the other tags use offsets into user_message.
                # Left unchanged; confirm what consumers expect.
                start_index = value.start()
                end_index = value.end()
                true_value = value.group(1)
            else:
                label = tag
                true_value = re.sub(r"</?[^>]+>", "", tagged_text)
                start_index = match.start()
                end_index = match.end()
            annotations.append((match, label, start_index, end_index, true_value))

    return convert_to_bio_and_json(user_message, chatbot_message, annotations, date_today=date_today)

# Worked example: one chatbot question and one tagged user reply.
chatbot_message = "When did you last spray your potatoes?"
# The reply carries inline <POTATO> and <TIMEX3> tags; the TIMEX3 tag's
# option attribute names the date slot and its value attribute holds an ISO date.
user_message = 'I sprayed my <POTATO>Ndamira</POTATO> potatoes <TIMEX3 type="DATE" option="LAST_SPRAY_DATE" value="2025-05-19">5 weeks ago</TIMEX3>.'
# Reference date handed to parse_line as its third argument.
date_today = "2025-06-28"
# Last expression of the cell, so the notebook displays the returned (bio_line, values) tuple.
parse_line(user_message, chatbot_message, date_today)

([('When', 'O'),
  ('did', 'O'),
  ('you', 'O'),
  ('last', 'O'),
  ('spray', 'O'),
  ('your', 'O'),
  ('potatoes?', 'O'),
  ('<USER_MESSAGE>', 'O'),
  ('I', 'O'),
  ('sprayed', 'O'),
  ('my', 'O'),
  ('Ndamira', 'B-POTATO'),
  ('potatoes', 'O'),
  ('5', 'B-LAST_SPRAY_DATE'),
  ('weeks', 'I-LAST_SPRAY_DATE'),
  ('ago', 'I-LAST_SPRAY_DATE')],
 [{'start_index': 13,
   'end_index': 37,
   'true_value': 'Ndamira',
   'surface_value': 'Ndamira',
   'annotation_type': 'POTATO'},
  {'start_index': 45,
   'end_index': 63,
   'true_value': '2025-05-19',
   'surface_value': '5 weeks ago',
   'annotation_type': 'LAST_SPRAY_DATE',
   'date_today': '2025-06-28'}])

In [95]:
import glob
import os

# Load every raw CSV, assign each row to a train/valid/test split, and write
# the combined table to ner.csv.

# glob returns files in arbitrary order; sorting keeps the row order, and with
# it the ids assigned below, stable between runs.
infiles = sorted(glob.glob("../data/english_examples/ner_raw_data/*.csv"))
if not infiles:
    raise FileNotFoundError("no raw CSV files found in ../data/english_examples/ner_raw_data")

outfolder = "../data/english_examples/ner"
# Create the output folder up front so the to_csv call below cannot fail on a fresh checkout.
os.makedirs(outfolder, exist_ok=True)

df_input = pd.concat([pd.read_csv(infile) for infile in infiles])

# Seeded generator so the 80/10/10 split is reproducible across runs.
split_rng = np.random.default_rng(42)
df_input["dataset"] = split_rng.choice(("train", "valid", "test"), size=len(df_input), p=(0.8, 0.1, 0.1))
df_input["date_today"] = pd.to_datetime("2025-06-28")
outfile = os.path.join(outfolder, "ner.csv")
df_input["id"] = range(len(df_input))
# reset_index keeps the per-file row number as an "index" column and gives the
# concatenated frame a unique 0..n-1 index.
df_input = df_input.reset_index()
df_input.to_csv(outfile)
df_input

Unnamed: 0,index,chatbot,user,dataset,date_today,id
0,0,When did you last spray your potatoes?,"I sprayed them <TIMEX3 type=""DATE"" option=""LAS...",train,2025-06-28,0
1,1,When did you plant your potatoes?,"<TIMEX3 type=""DATE"" option=""PLANT_DATE"" value=...",train,2025-06-28,1
2,2,When did you last spray your potatoes?,"I sprayed them <TIMEX3 type=""DATE"" option=""LAS...",valid,2025-06-28,2
3,3,When did you last spray your potatoes?,"I sprayed them <TIMEX3 type=""DATE"" option=""LAS...",train,2025-06-28,3
4,4,When did you plant your potatoes?,"<TIMEX3 type=""DATE"" option=""PLANT_DATE"" value=...",train,2025-06-28,4
...,...,...,...,...,...,...
19995,4995,Where is your farm located?,In <LOCATION>Gicumbi</LOCATION>.,train,2025-06-28,19995
19996,4996,When did you plant your potatoes?,"<TIMEX3 type=""DATE"" option=""PLANT_DATE"" value=...",train,2025-06-28,19996
19997,4997,Where is your farm located?,In <LOCATION>Rulindo</LOCATION>.,train,2025-06-28,19997
19998,4998,Which potato variety do you plant?,I plant <POTATO>German Butterball</POTATO>.,test,2025-06-28,19998


In [98]:
# Convert every row to BIO-tagged tokens and a JSON annotation record, then
# write both outputs (bios.csv and ner.json).
import json

df_output = []
json_output = []
for ix, row in df_input.iterrows():
    bio, annotations = parse_line(row["user"], row["chatbot"], row["date_today"])
    # One output row per token: sentence id, word, BIO label, dataset split.
    df_output.extend([ix, word, label, row["dataset"]] for word, label in bio)

    # Strip the inline tags to recover the plain user sentence.
    input_sentence = re.sub(r'<.*?>', '', row["user"])
    json_output.append({
        # Plain int so the id is always JSON-serialisable.
        "sample_id": int(row["id"]),
        "input_sentence": input_sentence,
        # The sentence preceding the user's reply is the chatbot question;
        # this previously duplicated input_sentence.
        "preceeding_sentence": row["chatbot"],
        "annotations": annotations
    })

df_output = pd.DataFrame(df_output, columns=["sentence_id", "words", "labels", "dataset"])
outfile = os.path.join(outfolder, "bios.csv")
df_output.to_csv(outfile)
print("wrote " + outfile)

outfile = os.path.join(outfolder, "ner.json")
# Context manager guarantees the file is flushed and closed.
with open(outfile, "w", encoding="utf-8") as json_file:
    json.dump(json_output, json_file, indent=4)
print("wrote " + outfile)

df_output

wrote ../data/english_examples/ner/bios.csv
wrote ../data/english_examples/ner/ner.json


Unnamed: 0,sentence_id,words,labels,dataset
0,0,When,O,train
1,0,did,O,train
2,0,you,O,train
3,0,last,O,train
4,0,spray,O,train
...,...,...,...,...
229899,19999,sprayed,O,train
229900,19999,them,O,train
229901,19999,3,B-LAST_SPRAY_DATE,train
229902,19999,weeks,I-LAST_SPRAY_DATE,train
