# Text Classification - NER Inference with GPT-4o-mini Finetuned


(OpenAI Batch API)

---

---

## $\color{blue}{Sections:}$

* Preamble
1.   Admin
2.   Data
3.   Prompt
4.   JSONL
5.   Check Datasets
6.   Create batch job


## $\color{blue}{Preamble:}$

In this section we create a batch job with OpenAI to get preferential prices in exchange for a non-instantaneous response. We get NER inferences for all the data.


## $\color{blue}{Admin}$
* Install relevant Libraries
* Import relevant Libraries

In [None]:
%%capture
!pip install tiktoken openai cohere

In [None]:
!pip install openai==1.55.3 httpx==0.27.2 --force-reinstall --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.7/57.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m389.6/389.6 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.9/100.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.4/70.4 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m351.8/351.8 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m431.7/431.7 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
pip install dill

Collecting dill
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Downloading dill-0.3.9-py3-none-any.whl (119 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/119.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m112.6/119.4 kB[0m [31m14.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.4/119.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dill
Successfully installed dill-0.3.9


In [None]:
import openai
import re
import pandas as pd
import requests
import json
from google.colab import drive
from google.colab import userdata
from collections import defaultdict
import os
import dill

## $\color{blue}{Data}$

* Connect to Drive
* Load the pickled train, dev and test DataFrames

In [None]:
drive.mount("/content/drive")
%cd '/content/drive/MyDrive'

Mounted at /content/drive
/content/drive/MyDrive


In [None]:
import pandas as pd

# Folder (relative to the Drive directory selected above) that holds the pickled splits.
path = 'class/datasets/'

# NOTE(review): unpickling can run arbitrary code; only load files you created yourself.
df_train = pd.read_pickle(filepath_or_buffer=path + 'df_train_augmentation_ft')
df_dev = pd.read_pickle(filepath_or_buffer=path + 'df_dev_augmentation_ft')
df_test = pd.read_pickle(filepath_or_buffer=path + 'df_test_augmentation_ft')


In [None]:
df_dev.columns

Index(['master', 'book_idx', 'chapter_idx', 'content', 'vanilla_embedding.1',
       'direct_ft_augmented_embedding'],
      dtype='object')

# $\color{blue}{Prompt}$

----

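The Batch API requires the requests to be uploaded in the JSONL format shown below.
Each request payload carries a system message (definition of the LLM role) and a user message (input prompt). An assistant message (expected output) is only needed for fine-tuning data, so it is not included in these inference requests.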

In [None]:
# Few-shot prompt template for the NER tagging task. The single `{}` placeholder
# after "###Text / Input:" is filled with one passage through `prompt.format(...)`.
# Entities are marked inline as `@@<span>##Person` / `@@<span>##Location`, as in
# the worked examples inside the template.
# NOTE(review): this text (including the "identicle" spelling) is sent to the
# model verbatim; presumably it matches the prompt the fine-tuned model was
# trained with, so confirm before editing any character of it.
prompt = """The task is to label the Location and Person entities in the given ###Text section, Following the format in the ###Examples section.
The output should be identicle to the input with the exception of the Person and Location tags if required.

###Examples
Input: “Is it John of Tuam?”   “Are you sure of that now?” asked Mr Fogarty dubiously. “I thought it was some Italian or American.”
Output: “Is it @@John of Tuam##Person ?”   “Are you sure of that now?” asked @@Mr Fogarty##Person dubiously. “I thought it was some Italian or American.”

Input: sibly there were several others. He personally, being of a sceptical bias, believed and didn’t make the smallest bones about saying so either that man or men in the plural were always hanging around on the waiting list about a lady,
Output: sibly there were several others. He personally, being of a sceptical bias, believed and didn’t make the smallest bones about saying so either that man or men in the plural were always hanging around on the waiting list about a lady,

Input: Now to the historical, for as Madam Mina write not in her stenography, I must, in my cumbrous old fashion, that so each day of us may not go unrecorded. We got to the Borgo Pass just after sunrise yesterday morning.
Output: Now to the historical, for as @@Madam Mina##Person write not in her stenography, I must, in my cumbrous old fashion, that so each day of us may not go unrecorded. We got to the @@Borgo Pass##Location  just after sunrise yesterday morning.

**DON'T LABEL PRONOUNS AS PERSON**

###Text
Input: {}
Output:"""


# System-role message placed first in the `messages` list of every request.
system_message = """You are an excellent linguist."""


Example of the expected `.jsonl` request format (one JSON object per line):

```json
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-3.5-turbo-0125", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}

{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-3.5-turbo-0125", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
```

In [None]:
def format_data(df, set):
  """
  Build one OpenAI Batch API chat-completion request per row of a dataframe.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame with a 'content' column holding the text to be labelled.
  set : str
      Name of the split (e.g. "train"). It is prefixed to the row position to
      form each request's custom_id. The name shadows the builtin only inside
      this function and is kept so existing callers keep working.

  Returns
  -------
  dataset : list
      List of request dicts (custom_id, method, url, body), in row order.
  """
  dataset = []
  # Walk the column by position. The previous `df.loc[i]` was a label lookup
  # driven by range(n): it raised KeyError, or silently picked a different row,
  # whenever the index was not exactly 0..n-1 (e.g. after filtering or shuffling).
  for i, content in enumerate(df['content']):
    point = {"custom_id": set + str(i),
             "method": "POST",
             "url": "/v1/chat/completions",
             "body": {"model": "ft:gpt-4o-mini-2024-07-18:personal::AbTiRIUJ",
                      "messages": [{"role": "system" , "content" : system_message},
                                   {"role": "user" , "content" : prompt.format(content)}
                      ]
             }
    }

    dataset.append(point)

  return dataset

def save_to_jsonl(dataset, file_path):
  """
  Write a dataset to disk in JSON Lines format.

  Every element of `dataset` is serialised with `json.dumps` and written on its
  own line. An existing file at `file_path` is overwritten.

  Parameters
  ----------
  dataset : list
      List of dicts containing datapoint information.
  file_path : str
      File path to save to.

  Returns
  -------
  None
  """
  with open(file_path, "w") as out_file:
    # Lazily serialise one record per output line.
    out_file.writelines(json.dumps(record) + '\n' for record in dataset)

##### $\color{red}{To-File}$


In [None]:
# One request list per split; the `set` label prefixes every custom_id.
train_dataset = format_data(df=df_train, set="train")
dev_dataset = format_data(df=df_dev, set="dev")
test_dataset = format_data(df=df_test, set="test")


In [None]:
len(train_dataset)

20474

In [None]:
# Persist each split's requests as a .jsonl file ready for upload.
save_to_jsonl(dataset=train_dataset, file_path="class/datasets/train_augmented_openai_ner_inf.jsonl")
save_to_jsonl(dataset=dev_dataset, file_path="class/datasets/dev_augmented_openai_ner_inf.jsonl")
save_to_jsonl(dataset=test_dataset, file_path="class/datasets/test_augmented_openai_ner_inf.jsonl")

In [None]:
train_dataset[4]

{'custom_id': 'train4',
 'method': 'POST',
 'url': '/v1/chat/completions',
 'body': {'model': 'ft:gpt-4o-mini-2024-07-18:personal::AbTiRIUJ',
  'messages': [{'role': 'system', 'content': 'You are an excellent linguist.'},
   {'role': 'user',
    'content': "The task is to label the Location and Person entities in the given ###Text section, Following the format in the ###Examples section.\nThe output should be identicle to the input with the exception of the Person and Location tags if required.\n\n###Examples\nInput: “Is it John of Tuam?”   “Are you sure of that now?” asked Mr Fogarty dubiously. “I thought it was some Italian or American.”\nOutput: “Is it @@John of Tuam##Person ?”   “Are you sure of that now?” asked @@Mr Fogarty##Person dubiously. “I thought it was some Italian or American.”\n\nInput: sibly there were several others. He personally, being of a sceptical bias, believed and didn’t make the smallest bones about saying so either that man or men in the plural were always han

# $\color{blue}{Check - Datasets}$

In [None]:
# Get example
def message_check(file_path, ind):
  """
  Print the messages of one request stored in a jsonl file.

  Parameters
  ----------
  file_path : str
      Path to jsonl file.
  ind : int
      Position of the request to show (0-based line number).

  Returns
  -------
  None
  """
  # Load the dataset
  with open(file_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]

  # Dataset size, then the requested example. The label reports the index that
  # is actually shown; it used to read "First example:" whatever `ind` was.
  print("Num examples:", len(dataset))
  print(f"Example {ind}:")
  for message in dataset[ind]['body']["messages"]:
    print(message)

In [None]:
message_check("class/datasets/train_augmented_openai_ner_inf.jsonl",5050)

Num examples: 20474
First example:
{'role': 'system', 'content': 'You are an excellent linguist.'}
{'role': 'user', 'content': "The task is to label the Location and Person entities in the given ###Text section, Following the format in the ###Examples section.\nThe output should be identicle to the input with the exception of the Person and Location tags if required.\n\n###Examples\nInput: “Is it John of Tuam?”   “Are you sure of that now?” asked Mr Fogarty dubiously. “I thought it was some Italian or American.”\nOutput: “Is it @@John of Tuam##Person ?”   “Are you sure of that now?” asked @@Mr Fogarty##Person dubiously. “I thought it was some Italian or American.”\n\nInput: sibly there were several others. He personally, being of a sceptical bias, believed and didn’t make the smallest bones about saying so either that man or men in the plural were always hanging around on the waiting list about a lady,\nOutput: sibly there were several others. He personally, being of a sceptical bias, 

In [None]:
# Format error checks
def check_errors(file_path):
  """
  Check a request jsonl file for message-format problems before it is sent to OpenAI.

  Every line is parsed as JSON and the `body` -> `messages` list of each request
  is validated. A count per error type is printed, or "No errors found" when the
  file is clean.

  Parameters
  ----------
  file_path : str
      Path to the jsonl file.

  Returns
  -------
  None
  """
  with open(file_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]

  format_errors = defaultdict(int)

  for ex in dataset:
    if not isinstance(ex, dict):
      format_errors["data_type"] += 1
      continue

    # A request without a dict "body" previously crashed the whole check with
    # AttributeError (`None.get`). Count it as an error so the scan completes.
    ex_body = ex.get("body", None)
    if not isinstance(ex_body, dict):
      format_errors["missing_body"] += 1
      continue

    messages = ex_body.get("messages", None)
    if not messages:
      format_errors["missing_messages_list"] += 1
      continue

    for message in messages:
      # Anything that is not a dict cannot be inspected with `.get` below.
      if not isinstance(message, dict):
        format_errors["message_data_type"] += 1
        continue

      if "role" not in message or "content" not in message:
        format_errors["message_missing_key"] += 1

      if any(k not in ("role", "content", "name", "function_call") for k in message):
        format_errors["message_unrecognized_key"] += 1

      if message.get("role", None) not in ("system", "user", "assistant", "function"):
        format_errors["unrecognized_role"] += 1

      content = message.get("content", None)
      function_call = message.get("function_call", None)

      # Content must be a string, and may be empty only if a function call is present.
      if (not content and not function_call) or not isinstance(content, str):
        format_errors["missing_content"] += 1

  if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
      print(f"{k}: {v}")
  else:
    print("No errors found")

In [None]:
check_errors("class/datasets/train_augmented_openai_ner_inf.jsonl")

No errors found


# $\color{blue}{Create-Batch-Job}$

##### $\color{red}{Load-File}$

In [None]:
endpoint = "https://api.openai.com/v1/files" # endpoint for files

# API key is fetched through google.colab's `userdata` rather than hard-coded here.
key = userdata.get('OPENAI_API_KEY')

# Bearer-token header passed to the upload requests.
headers = {'Authorization': f"Bearer {key}"}

def upload_file(file_path, endpoint, headers):
  """
  Send a local file to the OpenAI file store, tagged with purpose 'batch'.

  Parameters
  ----------
  file_path : str
      Path to the jsonl file.
  endpoint : str
      Use 'https://api.openai.com/v1/files'.
  headers : dict
      Use {'Authorization': f"Bearer {key}"}.

  Returns
  -------
  response : json
      Parsed JSON body of OpenAI's reply describing the upload.
  """
  form_fields = {'purpose': 'batch'}
  with open(file_path, 'rb') as upload:
    reply = requests.post(endpoint, headers=headers, files={'file': upload}, data=form_fields)
  # NOTE(review): the HTTP status is not checked here; an error reply is returned as-is.
  return reply.json()

In [None]:
# Upload the request file of each split; the replies carry the file ids used for the batch jobs.
train_file_response = upload_file(file_path="class/datasets/train_augmented_openai_ner_inf.jsonl", endpoint=endpoint, headers=headers)
dev_file_response = upload_file(file_path="class/datasets/dev_augmented_openai_ner_inf.jsonl", endpoint=endpoint, headers=headers)
test_file_response = upload_file(file_path="class/datasets/test_augmented_openai_ner_inf.jsonl", endpoint=endpoint, headers=headers)

In [None]:
train_file_response

{'object': 'file',
 'id': 'file-8q4ANgJKt7bfyoo5XqGgby',
 'purpose': 'batch',
 'filename': 'train_augmented_openai_ner_inf.jsonl',
 'bytes': 44377016,
 'created_at': 1742482189,
 'expires_at': None,
 'status': 'processed',
 'status_details': None}

In [None]:
dev_file_response

{'object': 'file',
 'id': 'file-MNQD39tDEDjDVaMjNBW8wi',
 'purpose': 'batch',
 'filename': 'dev_augmented_openai_ner_inf.jsonl',
 'bytes': 1616290,
 'created_at': 1742482190,
 'expires_at': None,
 'status': 'processed',
 'status_details': None}

In [None]:
test_file_response

{'object': 'file',
 'id': 'file-DLe5iMf1HJAJBPSPNeEK1x',
 'purpose': 'batch',
 'filename': 'test_augmented_openai_ner_inf.jsonl',
 'bytes': 1087735,
 'created_at': 1742482191,
 'expires_at': None,
 'status': 'processed',
 'status_details': None}

In [None]:
# Re-bind the upload responses to the file ids shown in the outputs above, keeping
# only the 'id' key that the batch-creation cell reads.
# NOTE(review): presumably lets later cells run without re-uploading — confirm.
train_file_response = {'id':'file-8q4ANgJKt7bfyoo5XqGgby'}
dev_file_response = {'id': 'file-MNQD39tDEDjDVaMjNBW8wi'}
test_file_response = {'id':'file-DLe5iMf1HJAJBPSPNeEK1x'}

##### $\color{red}{Create-Jobs}$

In [None]:
from openai import OpenAI
client = OpenAI(api_key= userdata.get('OPENAI_API_KEY'))


def _create_ner_batch(file_response, description):
  """Submit one chat-completions batch job (24h completion window) for an uploaded file."""
  return client.batches.create(
      input_file_id=file_response['id'],
      endpoint="/v1/chat/completions",
      completion_window="24h",
      metadata={
        "description": description
      }
  )


# One job per split, submitted in the order train, dev, test.
train_batch_object = _create_ner_batch(train_file_response, "train_augmented_ner_responses")
dev_batch_object = _create_ner_batch(dev_file_response, "dev_augmented_ner_responses")
test_batch_object = _create_ner_batch(test_file_response, "test_augmented_ner_responses")

Inspect the metadata of the newly created batch jobs.

In [None]:
train_batch_object

Batch(id='batch_67dc2b890a4c8190bd4156bb4240aeca', completion_window='24h', created_at=1742482313, endpoint='/v1/chat/completions', input_file_id='file-8q4ANgJKt7bfyoo5XqGgby', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1742568713, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'train_augmented_ner_responses'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))

In [None]:
dev_batch_object

Batch(id='batch_67dc2b8929fc819097fdab12ca753efc', completion_window='24h', created_at=1742482313, endpoint='/v1/chat/completions', input_file_id='file-MNQD39tDEDjDVaMjNBW8wi', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1742568713, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev_augmented_ner_responses'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))

In [None]:
test_batch_object

Batch(id='batch_67dc2b89670c8190bbebaada409fb945', completion_window='24h', created_at=1742482313, endpoint='/v1/chat/completions', input_file_id='file-DLe5iMf1HJAJBPSPNeEK1x', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1742568713, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'test_augmented_ner_responses'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))

Collect the batch IDs and output file IDs.

In [None]:
# Batch job ids, used to retrieve each job's status.
train_batch_id = train_batch_object.id
dev_batch_id = dev_batch_object.id
test_batch_id = test_batch_object.id

# Output file ids as recorded on the objects returned at creation time.
# NOTE(review): those objects show output_file_id=None (status 'validating'), so
# these stay None unless re-read from a freshly retrieved batch.
train_batch_output_file = train_batch_object.output_file_id
dev_batch_output_file = dev_batch_object.output_file_id
test_batch_output_file = test_batch_object.output_file_id


Retrieve the current status of each batch job.

In [None]:
client.batches.retrieve(train_batch_id)

Batch(id='batch_67dc2b890a4c8190bd4156bb4240aeca', completion_window='24h', created_at=1742482313, endpoint='/v1/chat/completions', input_file_id='file-8q4ANgJKt7bfyoo5XqGgby', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1742568713, failed_at=None, finalizing_at=None, in_progress_at=1742482317, metadata={'description': 'train_augmented_ner_responses'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=20474))

In [None]:
client.batches.retrieve(dev_batch_id)

Batch(id='batch_67dc2b8929fc819097fdab12ca753efc', completion_window='24h', created_at=1742482313, endpoint='/v1/chat/completions', input_file_id='file-MNQD39tDEDjDVaMjNBW8wi', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1742568713, failed_at=None, finalizing_at=None, in_progress_at=1742482314, metadata={'description': 'dev_augmented_ner_responses'}, output_file_id=None, request_counts=BatchRequestCounts(completed=131, failed=0, total=746))

In [None]:
client.batches.retrieve(test_batch_id)

Batch(id='batch_67dc2b89670c8190bbebaada409fb945', completion_window='24h', created_at=1742482313, endpoint='/v1/chat/completions', input_file_id='file-DLe5iMf1HJAJBPSPNeEK1x', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1742568713, failed_at=None, finalizing_at=None, in_progress_at=1742482314, metadata={'description': 'test_augmented_ner_responses'}, output_file_id=None, request_counts=BatchRequestCounts(completed=161, failed=0, total=500))

Once the OpenAI batch jobs have completed, save their output files in the misc folder and load them as shown below.
get_files JSON

In [None]:
path = "class/misc/"
def get_jsonl(path):
  """Read a UTF-8 JSON Lines file and return its records as a list, one parsed object per line."""
  records = []
  with open(path, 'r', encoding='utf-8') as handle:
    for raw_line in handle:
      records.append(json.loads(raw_line))
  return records

# Parsed batch results for each split, one dict per output line.
train_output = get_jsonl(path=path + "batch_train_augmented_output.jsonl")
dev_output = get_jsonl(path=path + "batch_dev_augmented_output.jsonl")
test_output = get_jsonl(path=path + "batch_test_augmented_output.jsonl")

In [None]:
train_output[0]

{'id': 'batch_req_67dc378716208190a708417982fcd5a4',
 'custom_id': 'train0',
 'response': {'status_code': 200,
  'request_id': '3b47872434d61b1011aa49b682c45dfc',
  'body': {'id': 'chatcmpl-BDBXq8K2NVNAK0jbA42xcU4zuxlkw',
   'object': 'chat.completion',
   'created': 1742482342,
   'model': 'ft:gpt-4o-mini-2024-07-18:personal::AbTiRIUJ',
   'choices': [{'index': 0,
     'message': {'role': 'assistant',
      'content': 'Halted, he peered down the dark winding stairs and called out coarsely:   —Come up, @@Kinch##Person ! Come up, you fearful jesuit!   Solemnly he came forward and mounted the round gunrest. He faced about and blessed gravely thrice the tower, the surrounding land and the awaking mountains.',
      'refusal': None,
      'annotations': []},
     'logprobs': None,
     'finish_reason': 'stop'}],
   'usage': {'prompt_tokens': 453,
    'completion_tokens': 73,
    'total_tokens': 526,
    'prompt_tokens_details': {'cached_tokens': 0, 'audio_tokens': 0},
    'completion_token

In [None]:
def _contents_in_request_order(output, prefix):
  """
  Return the assistant message of every batch result, ordered by request number.

  Results are matched on `custom_id` (prefix + row position, the scheme used when
  the requests were built) instead of on line order, because the Batch API does
  not guarantee that output lines come back in the order they were submitted.

  Parameters
  ----------
  output : list
      Parsed lines of a batch output file.
  prefix : str
      Split name used in the custom ids ("train", "dev" or "test").

  Returns
  -------
  list
      Message contents for prefix0, prefix1, ... in that order.

  Raises
  ------
  ValueError
      If any id in prefix0 .. prefix(len(output)-1) is absent from `output`.
  """
  by_id = {obj['custom_id']: obj['response']['body']['choices'][0]['message']["content"]
           for obj in output}
  wanted = [prefix + str(i) for i in range(len(output))]
  missing = [custom_id for custom_id in wanted if custom_id not in by_id]
  if missing:
    # Fail loudly rather than attach responses to the wrong rows.
    raise ValueError(f"{len(missing)} result(s) missing from the batch output, e.g. {missing[:5]}")
  return [by_id[custom_id] for custom_id in wanted]


train_responses = _contents_in_request_order(train_output, "train")
dev_responses = _contents_in_request_order(dev_output, "dev")
test_responses = _contents_in_request_order(test_output, "test")

In [None]:
# Attach the model outputs to their split as a new 'ner_responses' column.
for frame, responses in ((df_train, train_responses),
                         (df_dev, dev_responses),
                         (df_test, test_responses)):
  frame['ner_responses'] = responses

In [None]:
path = 'class/datasets/'
# Written back under the same file names the splits were loaded from, so the
# stored pickles are overwritten with the versions carrying 'ner_responses'.
for frame, file_name in ((df_train, 'df_train_augmentation_ft'),
                         (df_dev, 'df_dev_augmentation_ft'),
                         (df_test, 'df_test_augmentation_ft')):
  frame.to_pickle(path + file_name)