# Text Classification - Inference GPT-4o-mini Finetuned


## $\color{blue}{Sections:}$
* Preamble
* Admin - importing libraries
* Data - Load dataframes
* Payload
* Inference
* Score

## $\color{blue}{Preamble:}$

Connecting to the OpenAI API and running inference with the fine-tuned GPT-4o-mini model.


## $\color{blue}{Admin:}$

In [None]:
# Colab-only helpers: `drive` mounts Google Drive, `userdata` reads notebook secrets.
from google.colab import drive
from google.colab import userdata

# Mount Google Drive inside the Colab VM.
drive.mount("/content/drive")
# Make MyDrive the working directory; the relative dataset `path` used below resolves against it.
%cd '/content/drive/MyDrive/'

Mounted at /content/drive
/content/drive/MyDrive


In [None]:
import json
import requests

## $\color{blue}{Data:}$

In [None]:
import pandas as pd

# Directory holding the pickled train/dev/test dataframes, relative to the
# current working directory. Kept as a plain string because it is concatenated
# with file names when reading and when saving.
path = "class/datasets/" # modify path

# NOTE: unpickling can execute arbitrary code - only load files you trust.
df_train, df_dev, df_test = (
    pd.read_pickle(path + split_name)
    for split_name in ("df_train", "df_dev", "df_test")
)

In [None]:
# List the dev-set columns to see which prediction columns are already present.
df_dev.columns

Index(['index', 'master', 'book_idx', 'book', 'chapter_idx', 'chapter',
       'author', 'content', 'vanilla_embedding', 'vanilla_preds',
       'vanilla_pseudo_book', 'vanilla_moe_e2e_soft_preds',
       'vanilla_moe_e2e_soft_pseudo_book', 'vanilla_moe_e2e_hard_preds',
       'vanilla_moe_e2e_hard_pseudo_book', 'vanilla_moe_e2e_soft_forest_preds',
       'vanilla_moe_e2e_soft_forest_pseudo_book', 'vanilla_moe_hard_pre_preds',
       'vanilla_moe_hard_pre_pseudo_book', 'vanilla_embedding.1',
       'direct_ft_preds', 'direct_ft_pseudo_book', 'ft_embedding',
       'embedding_ft_preds', 'embedding_ft_pseudo_book', 'direct_ft_moe_preds',
       'direct_ft_moe_pseudo_book', 'ft_embedding_pal', 'mistral_ots_book',
       'mistral_ft_book'],
      dtype='object')

In [None]:
# Placeholder text column (presumably for the fine-tuned model's predicted book;
# nothing in this notebook fills it - TODO confirm where it is populated).
# Only create it when it is missing: df_dev is written back over its own pickle
# later on, so unconditionally blanking the column would erase any values that
# were stored there by an earlier run.
if "gpt_4o_mini_ft_book" not in df_dev.columns:
  df_dev["gpt_4o_mini_ft_book"] = [""] * df_dev.shape[0]

## $\color{blue}{Payload:}$

In [None]:
# Chat-completions endpoint that every classification request is POSTed to.
URL = "https://api.openai.com/v1/chat/completions"

# System instruction sent with every request; it tells the model to answer with
# exactly one of the category names.
system_message = """
You are required to complete the NLP task of text classification.
You must provide a single word response from one of the possible categories.
You will provide a one and only one response which must be from the given categories.""".strip()

# API key is read from the Colab secret store, so it never appears in the notebook.
key = userdata.get('OPENAI_API_KEY')

# Identifier of the fine-tuned GPT-4o-mini checkpoint to query.
model = "ft:gpt-4o-mini-2024-07-18:personal::AWMnZPNs:ckpt-step-750"

# Request body shared by all calls; `messages` is replaced per row during inference.
payload = {
    "model": model,
    "messages": [{"role": "system", "content": system_message}],
    "temperature": 0,        # sampling temperature
    "top_p": 1.0,            # nucleus-sampling probability mass
    "n": 1,                  # number of completions to generate per request
    "stream": False,         # return the whole completion in a single response
    "presence_penalty": 0,   # no penalty for tokens that already appeared
    "frequency_penalty": 0,  # no penalty for frequently repeated tokens
}

# HTTP headers: JSON body plus bearer-token authentication.
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {key}",
}

## $\color{blue}{Inference:}$

In [None]:
# parse the response json
def get_predicted(response):
  """Extract the predicted label text from an OpenAI chat-completions response.

  Args:
    response: HTTP response object whose ``content`` attribute holds the JSON
      body returned by the endpoint (e.g. a ``requests.Response``).

  Returns:
    The ``content`` of the first choice's message. String values are returned
    with surrounding whitespace removed so they can be matched against the
    category names; a non-string value (e.g. ``None``) is returned unchanged.

  Raises:
    ValueError: if the body is not valid JSON (raised by ``json.loads``).
    RuntimeError: if the body contains no ``choices`` (for instance an API
      error payload); the message includes the API's error text when present.
  """
  out_dict = json.loads(response.content)

  # Error bodies carry an "error" object instead of "choices"; surface the
  # API's own message rather than an opaque KeyError.
  choices = out_dict.get("choices") if isinstance(out_dict, dict) else None
  if not choices:
    error = out_dict.get("error") if isinstance(out_dict, dict) else None
    detail = error.get("message") if isinstance(error, dict) else error
    raise RuntimeError(f"OpenAI response has no choices: {detail or out_dict}")

  content = choices[0]['message']['content']
  # Stray whitespace/newlines around the label would break the label lookup.
  return content.strip() if isinstance(content, str) else content

In [None]:
# Prompt template for a single classification request; the `{}` placeholder is
# filled with the passage text via str.format.
# NOTE(review): the category is spelled "Nostros" here, while the responses
# recorded further down in this notebook use "Nostos". The wording (including
# the doubled "the the") is left untouched because it is sent verbatim to the
# fine-tuned model - confirm which text the model was trained with before editing.
prompt = """####Task:
The task is to predict the the correct book from the categories below given a short input of text.
Telemachia, Odyssey, Nostros, and Dubliners were written by Joyce. Dracula is by Bram Stoker, and Republic is by Plato.
After reading the Input select a single response from the Categories.

####Categories:
Telemachia
Odyssey
Nostros
Dubliners
Dracula
Republic

###Input:
{}

###Classification
"""


In [None]:
# Sanity check: render the template with a dummy input to verify its layout.
print(prompt.format("dog"))

####Task:
The task is to predict the the correct book from the categories below given a short input of text.
Telemachia, Odyssey, Nostros, and Dubliners were written by Joyce. Dracula is by Bram Stoker, and Republic is by Plato.
After reading the Input select a single response from the Categories.

####Categories:
Telemachia
Odyssey
Nostros
Dubliners
Dracula
Republic

###Input:
dog

###Classification



In [None]:
# Classify every dev-set passage with one chat-completions request per row.
# `responses` is filled by row position, matching how the list is later
# assigned back onto df_dev (list assignment is positional, not label-based).
n = df_dev.shape[0]
responses = [""] * n
for i, content in enumerate(df_dev["content"]):
  if i % 20 == 0:
    print(i)  # progress marker every 20 requests
  # Rebuild the message list from scratch so prompts never accumulate across rows.
  payload['messages'] = [
      {"role": "system", "content": system_message},
      {'role': 'user', 'content': prompt.format(content)},
  ]
  try:
    response = requests.post(URL, headers=headers, json=payload, stream=False, timeout=80) # send request
    responses[i] = get_predicted(response) # extract content
  except Exception as err:
    # Best effort: keep going after a failed request, but record which row
    # failed and why. Catching Exception (instead of a bare except) lets a
    # keyboard/kernel interrupt actually stop the loop.
    responses[i] = "fail"
    print(f"fail (row {i}): {err!r}")


0
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840
860
880
900
920
940
960


In [None]:
# Show the HTTP response object left over from the final request of the loop.
response

<Response [200]>

## $\color{blue}{Score:}$

In [None]:
# Lookup table from the label text returned by the model to the numeric book id
# that predictions are compared against (`book_idx`).
conv = {
    None: "unknown",   # a response without text content
    "Telemachia": 0,
    "Odyssey": 1,
    "Nostos": 2,       # spelling the model actually returns
    "Nostros": 2,      # spelling used in the prompt's category list
    "Dubliners": 3,
    "Dracula": 4,
    "Republic": 5,
}


In [None]:
from collections import Counter

# Tally how often each label was returned - a quick look at the prediction
# distribution and at any unexpected label strings.
label_counts = Counter(responses)
print(label_counts)

Counter({'Odyssey': 289, 'Dracula': 233, 'Nostos': 201, 'Republic': 141, 'Dubliners': 77, 'Telemachia': 23})


In [None]:
# Store the raw label strings, then translate them to numeric book ids via `conv`.
df_dev["gpt_4o_mini_book_checkpoint"] = responses
df_dev["gpt_4o_mini_book_checkpoint"] = df_dev["gpt_4o_mini_book_checkpoint"].map(conv)

# Series.map yields NaN for any label that has no key in `conv`; those rows can
# never equal `book_idx`, so they silently count as wrong. Report them instead.
unmapped_labels = {label for label in responses if label not in conv}
if unmapped_labels:
  print(f"WARNING: responses without an entry in conv: {sorted(unmapped_labels, key=str)}")

In [None]:
# Accuracy on the dev set: fraction of rows whose mapped prediction equals `book_idx`.
is_correct = df_dev["gpt_4o_mini_book_checkpoint"] == df_dev["book_idx"]
is_correct.sum() / df_dev.shape[0]

0.7531120331950207

In [None]:
# Write df_dev, including the columns added in this notebook, back to the same
# pickle file it was loaded from (the existing file is overwritten).
df_dev.to_pickle(path + "df_dev")

In [None]:
# Display the raw list of labels returned by the model, in row order.
responses

['Odyssey',
 'Odyssey',
 'Odyssey',
 'Odyssey',
 'Dubliners',
 'Dracula',
 'Dracula',
 'Odyssey',
 'Telemachia',
 'Odyssey',
 'Dracula',
 'Republic',
 'Dubliners',
 'Nostos',
 'Republic',
 'Dubliners',
 'Odyssey',
 'Nostos',
 'Nostos',
 'Odyssey',
 'Nostos',
 'Republic',
 'Dracula',
 'Dubliners',
 'Republic',
 'Republic',
 'Dracula',
 'Odyssey',
 'Odyssey',
 'Nostos',
 'Telemachia',
 'Odyssey',
 'Dracula',
 'Dracula',
 'Odyssey',
 'Republic',
 'Dracula',
 'Dubliners',
 'Nostos',
 'Nostos',
 'Republic',
 'Dracula',
 'Dracula',
 'Odyssey',
 'Republic',
 'Republic',
 'Dracula',
 'Nostos',
 'Dracula',
 'Republic',
 'Dracula',
 'Republic',
 'Republic',
 'Dracula',
 'Odyssey',
 'Odyssey',
 'Odyssey',
 'Republic',
 'Odyssey',
 'Dracula',
 'Dracula',
 'Dracula',
 'Dracula',
 'Odyssey',
 'Odyssey',
 'Nostos',
 'Odyssey',
 'Dracula',
 'Republic',
 'Nostos',
 'Telemachia',
 'Odyssey',
 'Dracula',
 'Republic',
 'Odyssey',
 'Republic',
 'Odyssey',
 'Telemachia',
 'Odyssey',
 'Nostos',
 'Odyssey',
 