In [2]:

%reload_ext autoreload
%autoreload 2

from pprint import pprint
import csv
from copy import deepcopy
import json
import pathlib
from pathlib import Path
from openai import OpenAI
from dotenv import load_dotenv
import fine_tune.annotation_utils as a_utils
import fine_tune.llm_utils as llm_utils
import fine_tune.message_utils as m_utils
from fine_tune.env import (
    BRAT_DATA_PATH,
)
from job_preset import JOB_PRESET
import os

load_dotenv()


True

## Load evaluation dataset

### Load from preset

In [3]:
# Directory name of the fine-tune job to evaluate; it is passed to both
# llm_utils loaders below.
# job_desc_dir = None
job_desc_dir = 'fine_tune-2024-09-22-23-42-28-gpt-4o-mini-2024-07-18'

# Description stored for that job; it is also the key used to look up JOB_PRESET.
job_desc = llm_utils.load_fine_tune_description(job_desc_dir)

job_preset = JOB_PRESET[job_desc]

# Build the dataset through the preset's own loader and message builder, then
# unpack what load_eval_info returns for this job: going by the names, the
# three splits plus the fine-tuned model id.
data_entities = job_preset.load_data()
all_data = job_preset.as_training_data(data_entities)
training_set, validation_set, test_set, fine_tuned_model_id = llm_utils.load_eval_info(job_desc_dir, all_data=all_data)

# Sanity check: job description, model id, test-set size and the first test
# example (the system prompt makes this output long).
job_desc, fine_tuned_model_id, len(test_set), test_set[0]

('action-seg-v2-d3',
 'ft:gpt-4o-mini-2024-07-18:rui:action-seg-v2-d3:AAPffHbH',
 133,
 {'messages': [{'role': 'system',
    'content': 'You are an annotation expert. You will be given a segment of a privacy policy of a web or mobile application, and will be asked to annotate actions in it.\n\nIMPORTANT: Filtering Out General Phrases\nBefore annotating, carefully check each potential data entity. DO NOT annotate sentences that do not provide specific data types or purpose types.\nExamples of general phrases to omit include, but are not limited to:\n\n"the information we collect about you"\n"other data"\n"any information"\n"other purposes"\n"purposes described in our policy"\n\nIf a sentence does not clearly indicate a specific type of personal data, DO NOT include it in your annotations.\n\nData usage context refers to the (core phrases within) sentences that mention the actions that are being taken with the PERSONAL DATA OF THE USER which is being mentioned in one of the following con

### Or, load without a preset

In [36]:
# Manual alternative to the preset cell: enable exactly ONE loader and ONE
# message builder below and leave the others commented out.
# NOTE: this cell overwrites `all_data` and `job_desc` if the preset cell
# was run before it.

## For (segment, data_span)
# data_entities = a_utils.load_data_entities_of_segments()

# data_entities = [segment for segment in data_entities if segment['entities']]

# For data span
# all_data = m_utils.as_training_data_for_data_span_of_segment(data_entities)
# all_data = m_utils.as_training_data_for_data_span_of_segment_1_1(data_entities)
# For data classification
# all_data = m_utils.as_training_data_for_data_classification_of_segment(data_entities)
# For data classification (gradual, level 0)
# all_data = m_utils.as_training_data_for_data_classification_of_segment_gradual(data_entities)

## For (segment, sentence, data_span)
# data_entities = a_utils.load_data_entities_of_sentences()
# For data span of sentence
# all_data = m_utils.as_training_data_for_data_span_of_sentence(data_entities)  # data_entity-seg_sent_data-v2
# all_data = m_utils.as_training_data_for_data_span_of_sentence_1(data_entities)
# all_data = m_utils.as_training_data_for_data_span_of_sentence_only(data_entities)  # data_entity-sent_data-ver2


## For (sentence, purpose_span)
# purpose_entities = a_utils.load_purpose_entities_of_sentences()
# For purpose span of sentence
# all_data = m_utils.as_training_data_for_purpose_span_of_sentence_only(purpose_entities)


## For (segment, (sentence, action_type, text))
# action_entities = a_utils.load_actions_of_segments()
# For action type of sentences
# all_data = m_utils.as_training_data_for_action_span_for_segment(action_entities)

## For (segment, sentence, action_type, text)
# Currently active configuration: sentence-level action annotations.
action_entities = a_utils.load_and_get(a_utils.get_actions_of_sentences)
# For action type of sentence
all_data = m_utils.as_training_data_for_action_span_of_sentence_only(action_entities)


# Label for this configuration; the evaluation cell picks it up as `desc`.
job_desc = 'action-sent'

# Number of examples that were built.
len(all_data)

1067

#### Load from a previous fine-tune job

In [50]:
# Fine-tune job whose recorded evaluation info is loaded against the current `all_data`.
# job_desc_dir = None
job_desc_dir = 'fine_tune-2024-09-18-19-35-56-gpt-4o-mini-2024-07-18'

(
    training_set,
    validation_set,
    test_set,
    fine_tuned_model_id,
) = llm_utils.load_eval_info(job_desc_dir, all_data=all_data)

# Size of each split, in (training, validation, test) order.
tuple(len(split) for split in (training_set, validation_set, test_set))

(40, 8, 1019)

#### Or, use the entire dataset

In [37]:

# Evaluate on every example instead of a held-out split.
# `test_set` is bound to the same object as `all_data`, not a copy.
test_set = all_data

# Dataset size and the first example (the system prompt makes this output long).
len(test_set), test_set[0]

(1067,
 {'messages': [{'role': 'system',
    'content': 'You are an annotation expert. You will be given a segment of a privacy policy of a web or mobile application, and will be asked to annotate actions in it.\n\nIMPORTANT: Filtering Out General Phrases\nBefore annotating, carefully check each potential data entity. DO NOT annotate sentences that do not provide specific data types or purpose types.\nExamples of general phrases to omit include, but are not limited to:\n\n"the information we collect about you"\n"other data"\n"any information"\n"other purposes"\n"purposes described in our policy"\n\nIf a sentence does not clearly indicate a specific type of personal data, DO NOT include it in your annotations.\n\nData usage context refers to the (core phrases within) sentences that mention the actions that are being taken with the PERSONAL DATA OF THE USER which is being mentioned in one of the following context types:\n1. first-party-collection-use - the policy segment mentions collect

## Run model evaluation

In [4]:

# Use the fine-tuned model id if an earlier cell defined one; otherwise fall
# back to the base model. Uncomment one of the overrides to force a model.
model_id = globals().get('fine_tuned_model_id', 'gpt-4o-mini-2024-07-18')
# model_id = '4.0Ultra'
# model_id = 'gpt-4o-2024-08-06'
# model_id = 'ft:gpt-4o-2024-08-06:rui:test:A8rFT3EN'

# Run description: the job description when one was loaded, else None.
desc = globals().get('job_desc')

# Uncomment to skip the first examples of the test set.
# test_set = test_set[284:]

# Echo the configuration the evaluation will run with.
model_id, desc, len(test_set)

('ft:gpt-4o-mini-2024-07-18:rui:action-seg-v2-d3:AAPffHbH',
 'action-seg-v2-d3',
 133)

In [5]:
# Prompt = every message except the last; reference answer = content of the last message.
messages_list = [example['messages'][:-1] for example in test_set]
correct_outputs = [example['messages'][-1]['content'] for example in test_set]

# Query the model with batch mode off.
dir_name, obj_model_outputs = llm_utils.query_llm(
    model_id,
    messages_list,
    correct_outputs=correct_outputs,
    batch=False,
    desc=desc,
)
dir_name, len(obj_model_outputs)

# Batch variant, kept for reference; not used for some tasks because of the rate limit.
# dir_name, batch_job = llm_utils.query_llm(
#     model_id,
#     messages_list,
#     correct_outputs=correct_outputs,
#     batch=True,
#     desc=desc,
# )
# dir_name, batch_job

100%|██████████| 133/133 [01:12<00:00,  1.85it/s]


('eval-2024-09-23-21-51-37-ft:gpt-4o-mini-2024-07-18:rui:action-seg-v2-d3:AAPffHbH',
 133)

In [37]:
# `batch_job` is only defined when the query cell was run in its batch=True
# variant (currently commented out). With batch=False there is nothing to wait
# for, so guard on the name to keep Restart & Run All from failing with a
# NameError in this cell.
batch_result = (
    llm_utils.wait_for_batch_job_finish(batch_job.id)
    if 'batch_job' in globals()
    else None
)
# llm_utils.retrieve_batch_query_result()
# llm_utils.combine_batch_query_result()

# Displays whatever the wait call returned, or nothing when no batch job was submitted.
batch_result

Batch(id='batch_JqFQSt6gdH8BBa7BHgVYpeFH', completion_window='24h', created_at=1726578398, endpoint='/v1/chat/completions', input_file_id='file-bWlE4mEreTE1ns9Va1l1Gr69', object='batch', status='failed', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=Errors(data=[BatchError(code='token_limit_exceeded', line=None, message='Enqueued token limit reached for gpt-4o-2024-08-06 in organization org-B2C2pNzAq4paAOvhIYdFJlSv. Limit: 90,000 enqueued tokens. Please try again once some in_progress batches have been completed.', param=None)], object='list'), expired_at=None, expires_at=1726664798, failed_at=1726578399, finalizing_at=None, in_progress_at=None, metadata={'description': 'data_span-seg_entity-ver2'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))