In [16]:
import json
import random
from functools import lru_cache

import openai
import tiktoken
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
def print_overview(values, name):
  print(f"\n #### Distribution of {name}:")
  print(f"min / max: {min(values)}, {max(values)}")
  print(f"mean / median: {np.mean(values)}, {np.median(values)}")
  print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

In [8]:
dataset = []
with open('data/train-3.jsonl') as jsonfile:
    data = json.loads(jsonfile.readline())
    dataset.append(data)

In [9]:
encoding = tiktoken.get_encoding("cl100k_base")
def from_message_num_tokens(messages, tokens_per_message=3, tokens_per_name=1):
  num_tokens = 0
  for message in messages:
    num_tokens += tokens_per_message
    for key, value in message.items():
      num_tokens += len(encoding.encode(value))
      if key=="name":
        num_tokens += tokens_per_name

  num_tokens +=3
  return num_tokens

def from_message_num_assistant_tokens(messages):
  num_tokens = 0
  for message in messages:
    if message["role"] == "assistant":
      num_tokens +=len(encoding.encode(message["content"]))

  return num_tokens

In [10]:
# tokens counts and warnings - from OpenAI cookbook

n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(from_message_num_tokens(messages))
    assistant_message_lens.append(from_message_num_assistant_tokens(messages))

print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)

print_overview(n_messages, "num_messages_per_example")
print_overview(convo_lens, "num_total_tokens_per_example")

print_overview(assistant_message_lens, "num_assistant_tokens_per_example")

n_too_long = sum(l > 4096 for l in convo_lens)
print(f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning")


Num examples missing system message: 0
Num examples missing user message: 0

 #### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

 #### Distribution of num_total_tokens_per_example:
min / max: 299, 299
mean / median: 299.0, 299.0
p5 / p95: 299.0, 299.0

 #### Distribution of num_assistant_tokens_per_example:
min / max: 8, 8
mean / median: 8.0, 8.0
p5 / p95: 8.0, 8.0

0 examples may be over the 4096 token limit, they will be truncated during fine-tuning


In [11]:
# Pricing and default n_epochs estimate

MAX_TOKENS_PER_EXAMPLE = 4096

TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_epochs = TARGET_EPOCHS
n_train_examples = len(dataset)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")
print("See pricing page to estimate total costs")


Dataset has ~299 tokens that will be charged for during training
By default, you'll train for 25 epochs on this dataset
By default, you'll be charged for ~7475 tokens
See pricing page to estimate total costs


In [14]:
from openai import OpenAI

client = OpenAI(
    api_key="",
)

In [42]:


training_dataset_file_name = 'data/train.jsonl'

training_response = client.files.create(
    file=Path(training_dataset_file_name),
    purpose="fine-tune"
)

In [43]:
training_response

FileObject(id='file-k54BIkStiPr2Khg6sGV5Kgxn', bytes=1351520, created_at=1721969868, filename='train.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [44]:
training_file_id = training_response.id
training_file_id

'file-k54BIkStiPr2Khg6sGV5Kgxn'

In [45]:
validation_dataset_file_name="data/test.jsonl"
validation_response = client.files.create(
    file=Path(validation_dataset_file_name),
    purpose="fine-tune"
)

In [46]:
print(validation_response)
validation_file_id = validation_response.id

FileObject(id='file-88elaXqwYaUBarM9cLDOzZy2', bytes=207738, created_at=1721969874, filename='test.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)


In [48]:
response = client.fine_tuning.jobs.create(
    model = "gpt-3.5-turbo",
    training_file = training_file_id,
    validation_file = validation_file_id,
    suffix="court-laws-ipc"
)

In [49]:
response

FineTuningJob(id='ftjob-2Jds92lM9vs4nbeeXbbsjJ31', created_at=1721969925, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-u6yUY9mn0ScUXenpviwSxtMf', result_files=[], seed=2035424682, status='validating_files', trained_tokens=None, training_file='file-k54BIkStiPr2Khg6sGV5Kgxn', validation_file='file-88elaXqwYaUBarM9cLDOzZy2', estimated_finish=None, integrations=[], user_provided_suffix='court-laws-ipc')

In [50]:
job_id = response.id
job_id

'ftjob-2Jds92lM9vs4nbeeXbbsjJ31'

In [52]:
client.fine_tuning.jobs.list(limit=5)

SyncCursorPage[FineTuningJob](data=[FineTuningJob(id='ftjob-2Jds92lM9vs4nbeeXbbsjJ31', created_at=1721969925, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=3, batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-u6yUY9mn0ScUXenpviwSxtMf', result_files=[], seed=2035424682, status='running', trained_tokens=None, training_file='file-k54BIkStiPr2Khg6sGV5Kgxn', validation_file='file-88elaXqwYaUBarM9cLDOzZy2', estimated_finish=None, integrations=[], user_provided_suffix='court-laws-ipc'), FineTuningJob(id='ftjob-Cq00yFywXyHqvE6pgjljvSXx', created_at=1721969885, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=3, batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-u6yUY9mn0ScUXenpviwSxtMf', result_

In [53]:
 client.fine_tuning.jobs.retrieve(job_id)

FineTuningJob(id='ftjob-2Jds92lM9vs4nbeeXbbsjJ31', created_at=1721969925, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=3, batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-u6yUY9mn0ScUXenpviwSxtMf', result_files=[], seed=2035424682, status='running', trained_tokens=None, training_file='file-k54BIkStiPr2Khg6sGV5Kgxn', validation_file='file-88elaXqwYaUBarM9cLDOzZy2', estimated_finish=None, integrations=[], user_provided_suffix='court-laws-ipc')

In [58]:
job_response = client.fine_tuning.jobs.list_events(fine_tuning_job_id=job_id, limit=10)
events = job_response.data
events

[FineTuningJobEvent(id='ftevent-6ClWlnp5d9LzBCK2rYljvxyp', created_at=1721969937, level='info', message='Fine-tuning job started', object='fine_tuning.job.event', data=None, type='message'),
 FineTuningJobEvent(id='ftevent-LN7Y5aqdyUOWA36r1di2KbuP', created_at=1721969929, level='info', message='Files validated, moving job to queued state', object='fine_tuning.job.event', data={}, type='message'),
 FineTuningJobEvent(id='ftevent-PBpsmcjvao7VWxF5oUr5n0VS', created_at=1721969925, level='info', message='Validating training file: file-k54BIkStiPr2Khg6sGV5Kgxn and validation file: file-88elaXqwYaUBarM9cLDOzZy2', object='fine_tuning.job.event', data={}, type='message'),
 FineTuningJobEvent(id='ftevent-EuOUz4RFlchJ5EjMrKdHZrNM', created_at=1721969925, level='info', message='Created fine-tuning job: ftjob-2Jds92lM9vs4nbeeXbbsjJ31', object='fine_tuning.job.event', data={}, type='message')]

In [90]:
@lru_cache()
def get_testing_file_data():
    test_data = []
    with open('../data/test.jsonl', 'r') as test_json:
        lines = test_json.readlines()    
        for i, line in enumerate(lines):
            test_data.append(json.loads(line))
    return test_data

def convert_raw_json_to_openai_format(raw_content):
    return dict({
        "system": "Process the following legal case details and provide the relevant sections of the Indian Penal Code.",
        "prompt": " ".join(raw_content["text"]),
        "expected": raw_content['labels']
    })
def get_random_line_from_testing_file():
    test_data = get_testing_file_data()
    index = random.randint(0, len(test_data))
    return convert_raw_json_to_openai_format(test_data[index])

In [91]:
random_sample = get_random_line_from_testing_file()

In [92]:
response = client.fine_tuning.jobs.retrieve(job_id)
response

FineTuningJob(id='ftjob-2Jds92lM9vs4nbeeXbbsjJ31', created_at=1721969925, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-3.5-turbo-0125:dview-technologies-private-limited:court-laws-ipc:9p7jDn7d', finished_at=1721971461, hyperparameters=Hyperparameters(n_epochs=3, batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-u6yUY9mn0ScUXenpviwSxtMf', result_files=['file-0hc4vQGJUNryNvr5cc9a9Mia'], seed=2035424682, status='succeeded', trained_tokens=948966, training_file='file-k54BIkStiPr2Khg6sGV5Kgxn', validation_file='file-88elaXqwYaUBarM9cLDOzZy2', estimated_finish=None, integrations=[], user_provided_suffix='court-laws-ipc')

## Fine Tuning Second Time

In [93]:
fine_tune_model_id = response.fine_tuned_model
fine_tune_model_id

'ft:gpt-3.5-turbo-0125:dview-technologies-private-limited:court-laws-ipc:9p7jDn7d'

In [94]:
test_messages = []
test_messages.append({"role": "system", "content": random_sample["system"]})
test_messages.append({"role": "user", "content": random_sample["prompt"]})
print(test_messages)

[{'role': 'system', 'content': 'Process the following legal case details and provide the relevant sections of the Indian Penal Code.'}, {'role': 'user', 'content': 'P.W.1 was working as a driver in Metropolitan Transport Corporation, P.W.2 was working as a conductor in Bus Route No.12 B. On 08.01.2008 at about 4.30 P.M., while driving the bus near Ayyapanthangal all the accused came in a motor cycle and dashed against the bus. Thereafter, all the accused came inside the bus and A2/Appellant, attacked P.W.1 in his right chick and shoulder. P.W.2 came there to prevented them; A3 attacked him with the helmet and caused bleeding injuries. Thereafter, P.W.1 along with co-workers went to the police station and given a complaint. A2 in S.C.No.120 of 2009 on the file of the Additional & Sessions Judge \x16 Fast Track Court No.II, Poonamallee, is the appellant herein. Totally there are 3 accused and they stood charged for the offences under Sections 341, 294 B and 332 IPC. A3 separately stood c

In [95]:
response = client.chat.completions.create(
    model = fine_tune_model_id,
    messages = test_messages
)

print(response.choices[0].message.content)

Section 332 in The Indian Penal Code, Section 294(b) in The Indian Penal Code, Section 341 in The Indian Penal Code


In [98]:
",".join(random_sample["expected"])

'Section 294 in The Indian Penal Code,Section 332 in The Indian Penal Code,Section 341 in The Indian Penal Code'

In [99]:
training_dataset_file_name = 'data/train-2.jsonl'

training_response = client.files.create(
    file=Path(training_dataset_file_name),
    purpose="fine-tune"
)
training_file_id = training_response.id
training_file_id

'file-97Sxxz7ExRkViin1lH3HYxVJ'

In [100]:
validation_dataset_file_name="data/test-2.jsonl"
validation_response = client.files.create(
    file=Path(validation_dataset_file_name),
    purpose="fine-tune"
)
validation_file_id = validation_response.id

In [101]:
response = client.fine_tuning.jobs.create(
    model = fine_tune_model_id,
    training_file = training_file_id,
    validation_file = validation_file_id,
    suffix="court-laws-ipc-v2"
)
fine_tune_model_id = response.fine_tuned_model
fine_tune_model_id

In [102]:
print(fine_tune_model_id)

None


In [103]:
response

FineTuningJob(id='ftjob-l083Zot7gsrNgMycnkQJv5qD', created_at=1721981301, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='ft:gpt-3.5-turbo-0125:dview-technologies-private-limited:court-laws-ipc:9p7jDn7d', object='fine_tuning.job', organization_id='org-u6yUY9mn0ScUXenpviwSxtMf', result_files=[], seed=1557254246, status='validating_files', trained_tokens=None, training_file='file-97Sxxz7ExRkViin1lH3HYxVJ', validation_file='file-2pqJ9c4CZqSNqbyXzs4ubgs7', estimated_finish=None, integrations=[], user_provided_suffix='court-laws-ipc-v2')

## Fine Tuning Third Time

In [27]:
fine_tune_model_id = "ft:gpt-3.5-turbo-0125:dview-technologies-private-limited:court-laws-ipc-v2:9pAVgTwG"

In [17]:
training_dataset_file_name = 'data/train-3.jsonl'

training_response = client.files.create(
    file=Path(training_dataset_file_name),
    purpose="fine-tune"
)
training_file_id = training_response.id
training_file_id

'file-b8urY1PssYlrHIJxBBSQTNc0'

In [18]:
validation_dataset_file_name="data/test-3.jsonl"
validation_response = client.files.create(
    file=Path(validation_dataset_file_name),
    purpose="fine-tune"
)
validation_file_id = validation_response.id

In [28]:
response = client.fine_tuning.jobs.create(
    model = fine_tune_model_id,
    training_file = training_file_id,
    validation_file = validation_file_id,
    suffix="court-laws-ipc-v3"
)

In [24]:
response

FineTuningJob(id='ftjob-4jf0jElypvHUhoRVRgNwBrVN', created_at=1723016260, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='ft:gpt-3.5-turbo-0125:dview-technologies-private-limited:court-laws-ipc-v2:9pAVgTwG', object='fine_tuning.job', organization_id='org-u6yUY9mn0ScUXenpviwSxtMf', result_files=[], seed=367027819, status='validating_files', trained_tokens=None, training_file='file-b8urY1PssYlrHIJxBBSQTNc0', validation_file='file-QDP1zAZdAXlbVmcPmXlpvfRi', estimated_finish=None, integrations=[], user_provided_suffix='court-laws-ipc-v2')