# Text Classification - Finetune GPT-4o-mini


---

---

## $\color{blue}{Sections:}$

* Preamble
1.   Admin
2.   Data
3.   Prompt
4.   JSONL
5.   Check Datasets
6.   Create OpenAI Finetuned Model

## $\color{blue}{Preamble:}$

This notebook converts the dataset to JSONL, uploads it to OpenAI, and starts a fine-tuning job on GPT-4o-mini.

## $\color{blue}{Admin}$
* Install relevant Libraries
* Import relevant Libraries

In [None]:
%%capture
!pip install tiktoken openai cohere

In [None]:
pip install dill

Collecting dill
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Downloading dill-0.3.9-py3-none-any.whl (119 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/119.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m112.6/119.4 kB[0m [31m4.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.4/119.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dill
Successfully installed dill-0.3.9


In [None]:
import openai
import re
import pandas as pd
import requests
import json
from google.colab import drive
from google.colab import userdata
from collections import defaultdict
import os
import dill

## $\color{blue}{Data}$

* Connect to Drive
* Load the train, dev and test splits as pandas DataFrames

In [None]:
drive.mount("/content/drive")
%cd '/content/drive/MyDrive'

Mounted at /content/drive
/content/drive/MyDrive


In [None]:
# Load the pre-split train / dev / test sets from pickle files under `path`.
# `path` is relative to the current working directory.
# NOTE: pd.read_pickle can execute arbitrary code while unpickling, so only
# load files that come from a trusted source.
import pandas as pd
path = "class/datasets/" # modify path
df_train = pd.read_pickle(path + "df_train")
df_dev = pd.read_pickle(path + "df_dev")
df_test = pd.read_pickle(path + "df_test")

# $\color{blue}{JSONL}$

----

The fine-tuning API requires the data to be uploaded in JSONL format, with one training example per line.
Each example requires a system message (the definition of the LLM's role), a user message (the input prompt), and an assistant message (the expected output).

In [None]:
system_message = """
You are required to complete the NLP task of text classification.
You must provide a single word response from one of the possible categories.
You will provide a one and only one response which must be from the given categories in the categories list.""".strip()

prompt = """####Task:
The task is to predict the the correct book from the categories below given a short input of text.
Telemachia, Odyssey, Nostros are from James Joyce's Ulysses, and Dubliners was also written by Joyce. Dracula is by Bram Stoker, and Republic is by Plato.
After reading the Input select a single response from the Categories.

####Categories:
Telemachia
Odyssey
Nostros
Dubliners
Dracula
Republic

###Input:
{}

###Classification
"""

def format_data(df):
  """
  Convert a dataframe of labelled passages into chat-format fine-tuning records.

  Parameters
  ----------
  df : pandas.DataFrame
      Must provide a 'content' column (input text inserted into the prompt)
      and a 'book' column (expected assistant answer).

  Returns
  -------
  dataset : list
      One dict per row, in row order, of the form
      {"messages": [system message, user message, assistant message]}.
  """
  # Walk the two columns by position instead of looking rows up with df.loc[i]:
  # label lookups only work when the index is exactly 0..n-1, so a shuffled,
  # filtered or split dataframe would raise KeyError or pick the wrong row.
  dataset = []
  for content, book in zip(df['content'], df['book']):
    dataset.append({
      "messages": [
        {"role": "system", "content": system_message},
        {"role": "user", "content": prompt.format(content)},
        {"role": "assistant", "content": book},
      ]
    })
  return dataset

def save_to_jsonl(dataset, file_path):
  """
  Write a dataset to disk as JSON Lines (one JSON object per line).

  Parameters
  ----------
  dataset : list
      List of dicts containing datapoint information.
  file_path : str
      File path to save to. An existing file at this path is overwritten.

  Returns
  -------
  None
  """
  # Explicit UTF-8 so the written file matches the encoding used by the JSONL
  # readers in this notebook instead of depending on the platform default.
  with open(file_path, "w", encoding="utf-8") as file:
    for data in dataset:
      file.write(json.dumps(data) + "\n")


##### $\color{red}{To-File}$


In [None]:
# Build chat-format records for the train and dev splits, then write each one
# to its own .jsonl file (df_test is not converted in this cell).
train_dataset = format_data(df_train)
dev_dataset = format_data(df_dev)
save_to_jsonl(train_dataset, "class/datasets/train_openai_book_ft.jsonl")
save_to_jsonl(dev_dataset, "class/datasets/dev_openai_book_ft.jsonl")

In [None]:
train_dataset[0]

{'messages': [{'role': 'system',
   'content': 'You are required to complete the NLP task of text classification.\nYou must provide a single word response from one of the possible categories.\nYou will provide a one and only one response which must be from the given categories in the categories list.'},
  {'role': 'user',
   'content': "####Task:\nThe task is to predict the the correct book from the categories below given a short input of text. \nTelemachia, Odyssey, Nostros are from James Joyce's Ulysses, and Dubliners was also written by Joyce. Dracula is by Bram Stoker, and Republic is by Plato. \nAfter reading the Input select a single response from the Categories.\n\n####Categories:\nTelemachia\nOdyssey\nNostros\nDubliners\nDracula\nRepublic\n\n###Input: \n“Is it John of Tuam?”   “Are you sure of that now?” asked Mr Fogarty dubiously. “I thought it was some Italian or American.”   “John of Tuam,” repeated Mr Cunningham, “was the man.”   He drank and the other gentlemen followed hi

# $\color{blue}{Check - Datasets}$

In [None]:
# Get example
def message_check(file_path, ind):
  """
  Print the number of examples in a jsonl file and the messages of one example.

  Parameters
  ----------
  file_path : str
      Path to jsonl file.
  ind : int
      Index of the example to print (list indexing, so negative values count
      from the end).

  Returns
  -------
  None

  Raises
  ------
  IndexError
      If ind is outside the range of examples in the file.
  """
  # Load the dataset
  with open(file_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]

  # Initial dataset stats
  print("Num examples:", len(dataset))
  # Label the heading with the requested index so it only reads
  # "First example:" when the first example is really the one shown.
  print("First example:" if ind == 0 else f"Example {ind}:")
  for message in dataset[ind]["messages"]:
    print(message)

In [None]:
# Format error checks
def check_errors(file_path):
  """
  Check if there are any errors in file that will cause OpenAI training process to fail.

  Every line of the file is parsed as one JSON example. Problems are tallied
  per category and a summary is printed.

  Parameters
  ----------
  file_path : str
      Path to the jsonl file.

  Returns
  -------
  None
  """
  with open(file_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]

  format_errors = defaultdict(int)

  for ex in dataset:
    if not isinstance(ex, dict):
      format_errors["data_type"] += 1
      continue

    messages = ex.get("messages", None)
    # "messages" must be a non-empty list; any other value cannot be walked
    # message by message.
    if not messages or not isinstance(messages, list):
      format_errors["missing_messages_list"] += 1
      continue

    for message in messages:
      # A message that is not a dict (e.g. a bare string) has no .get(); report
      # it as a data-type error instead of crashing with AttributeError.
      if not isinstance(message, dict):
        format_errors["data_type"] += 1
        continue

      if "role" not in message or "content" not in message:
        format_errors["message_missing_key"] += 1

      if any(k not in ("role", "content", "name", "function_call") for k in message):
        format_errors["message_unrecognized_key"] += 1

      if message.get("role", None) not in ("system", "user", "assistant", "function"):
        format_errors["unrecognized_role"] += 1

      content = message.get("content", None)
      function_call = message.get("function_call", None)

      # Content must be a string; it may only be empty when a function_call
      # is present.
      if (not content and not function_call) or not isinstance(content, str):
        format_errors["missing_content"] += 1

    if not any(isinstance(message, dict) and message.get("role", None) == "assistant" for message in messages):
      format_errors["example_missing_assistant_message"] += 1

  if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
      print(f"{k}: {v}")
  else:
    print("No errors found")

In [None]:
# Spot check: print the example at index 10 of the training file.
message_check("class/datasets/train_openai_book_ft.jsonl",10)

Num examples: 12000
First example:
{'role': 'system', 'content': 'You are required to complete the NLP task of text classification.\nYou must provide a single word response from one of the possible categories.\nYou will provide a one and only one response which must be from the given categories in the categories list.'}
{'role': 'user', 'content': "####Task:\nThe task is to predict the the correct book from the categories below given a short input of text. \nTelemachia, Odyssey, Nostros are from James Joyce's Ulysses, and Dubliners was also written by Joyce. Dracula is by Bram Stoker, and Republic is by Plato. \nAfter reading the Input select a single response from the Categories.\n\n####Categories:\nTelemachia\nOdyssey\nNostros\nDubliners\nDracula\nRepublic\n\n###Input: \nMight be still up. Call to the hospital to see. Hope she’s over. Long day I’ve had. Martha, the bath, funeral, house of Keyes, museum with those goddesses, Dedalus’ song. Then that bawler in Barney Kiernan’s. Got my 

In [None]:
# Validate the message format of the training file before uploading it.
check_errors("class/datasets/train_openai_book_ft.jsonl")

No errors found


In [None]:
# Validate the message format of the dev file before uploading it.
check_errors("class/datasets/dev_openai_book_ft.jsonl")

No errors found


# $\color{blue}{Create-OpenAi-Finetuned-Model}$

##### $\color{red}{Load-File}$

In [None]:
endpoint = "https://api.openai.com/v1/files" # endpoint for files

# Read the API key from google.colab's userdata store instead of hard-coding it.
key = userdata.get('OPENAI_API_KEY')

# Bearer-token authorization header passed to the file uploads below.
headers = {'Authorization': f"Bearer {key}"}

def upload_file(file_path, endpoint, headers):
  """
  Send a local file to the OpenAI files endpoint, tagged for fine-tuning.

  Parameters
  ----------
  file_path : str
      Path to the jsonl file to upload.
  endpoint : str
      Use 'https://api.openai.com/v1/files'.
  headers : dict
      Use {'Authorization': f"Bearer {key}"}.

  Returns
  -------
  response : dict
      Decoded JSON body of the reply. The HTTP status is not checked here.
  """
  form_fields = {'purpose': 'fine-tune'}
  # The file is opened in binary mode and stays open only for the duration of
  # the POST; it is sent as the multipart field named 'file'.
  with open(file_path, 'rb') as handle:
    reply = requests.post(endpoint, headers=headers, files={'file': handle}, data=form_fields)
  return reply.json()

In [None]:
# Upload the training file; the 'id' in the response is what the fine-tuning
# job payload references as its training_file.
train_file_response = upload_file("class/datasets/train_openai_book_ft.jsonl", endpoint, headers)

In [None]:
train_file_response

{'object': 'file',
 'id': 'file-YGu1oQ3iKOLlrt2rldNnQsFk',
 'purpose': 'fine-tune',
 'filename': 'train_openai_book_ft.jsonl',
 'bytes': 13000841,
 'created_at': 1732275093,
 'status': 'processed',
 'status_details': None}

In [None]:
# Upload the dev file; the 'id' in the response is what the fine-tuning job
# payload references as its validation_file.
dev_file_response = upload_file("class/datasets/dev_openai_book_ft.jsonl", endpoint, headers)

In [None]:
dev_file_response

{'object': 'file',
 'id': 'file-TkDHbJgzX5JpvhyGUVEeXzvO',
 'purpose': 'fine-tune',
 'filename': 'dev_openai_book_ft.jsonl',
 'bytes': 1040084,
 'created_at': 1732275104,
 'status': 'processed',
 'status_details': None}

##### $\color{red}{Create-Models}$

In [None]:
URL = "https://api.openai.com/v1/fine_tuning/jobs" # endpoint


# NOTE(review): this rebinds the `headers` name that the upload cells pass to
# upload_file. Re-running an upload after this cell would send the JSON
# Content-Type together with a multipart file body — presumably that breaks
# the upload; confirm, or use a distinct name for the job headers.
headers = {
  "Content-Type": "application/json",
  "Authorization": f"Bearer {key}"
}

In [None]:
# Job request: the uploaded file ids plus the base model to fine-tune.
# No hyperparameters are passed in this payload.
payload = {
  "training_file": train_file_response['id'],
  "validation_file": dev_file_response['id'],
  "model": "gpt-4o-mini-2024-07-18"
}
finetune_response = requests.post(URL, json=payload, headers=headers)
# The response body is decoded without checking the HTTP status, so inspect
# finetune_meta before relying on it (an error reply presumably decodes to a
# dict as well — verify).
finetune_meta = json.loads(finetune_response.content)

In [None]:
finetune_meta

{'object': 'fine_tuning.job',
 'id': 'ftjob-tInjl4yG1WbcwMgNQ4UkuKe4',
 'model': 'gpt-4o-mini-2024-07-18',
 'created_at': 1732275127,
 'finished_at': None,
 'fine_tuned_model': None,
 'organization_id': 'org-4bBdSgsciB8iKzeJ61GgVdXt',
 'result_files': [],
 'status': 'validating_files',
 'validation_file': 'file-TkDHbJgzX5JpvhyGUVEeXzvO',
 'training_file': 'file-YGu1oQ3iKOLlrt2rldNnQsFk',
 'hyperparameters': {'n_epochs': 'auto',
  'batch_size': 'auto',
  'learning_rate_multiplier': 'auto'},
 'trained_tokens': None,
 'error': {},
 'user_provided_suffix': None,
 'seed': 717072433,
 'estimated_finish': None,
 'integrations': []}