In [38]:
%reload_ext autoreload
%autoreload 2

from pprint import pprint
import csv
from copy import deepcopy
import json
import pathlib
from pathlib import Path
from openai import OpenAI
from dotenv import load_dotenv
import numpy as np
import fine_tune.annotation_utils as a_utils
import fine_tune.llm_utils as llm_utils
import fine_tune.message_utils as m_utils
from fine_tune.env import (
    BRAT_DATA_PATH,
)
from fine_tune import utils
import split_training_data as std
from job_preset import JOB_PRESET
import os

load_dotenv()


True

## Load training job

### Load from preset

In [41]:
# Name of the predefined job configuration to run.
job_desc = 'purpose_span-sent_entity-v2-d4'

# The preset bundles the three steps needed to prepare a job: loading the
# annotated entities, turning them into chat-style training examples, and
# choosing which examples go to the training / validation sets.
job_preset = JOB_PRESET[job_desc]

data_entities = job_preset.load_data()
all_data = job_preset.as_training_data(data_entities)
(
    training_set_indices,
    validation_set_indices,
) = job_preset.training_data_splitter(data_entities)

# Model to fine-tune; swap the active line to switch base models.
# base_model = 'gpt-4o-mini-2024-07-18'
base_model = 'gpt-4o-2024-08-06'

# The job name doubles as the suffix passed to the fine-tuning call.
suffix = job_desc

# Quick sanity check: configuration, dataset sizes, and one sample example.
(
    job_desc,
    len(data_entities),
    len(training_set_indices),
    len(validation_set_indices),
    base_model,
    all_data[10],
)

('purpose_span-sent_entity-v2-d4',
 1067,
 120,
 30,
 'gpt-4o-2024-08-06',
 {'messages': [{'role': 'system',
    'content': 'You are an annotation expert. You will be given a segment of a privacy policy of a web or mobile application, and will be asked to annotate purpose entities in it.\n\nIMPORTANT: Filtering Out General Phrase\nBefore annotating, carefully check each potential purpose entity. DO NOT annotate general phrases that do not provide specific purpose types.\nExamples of general phrases to omit include, but are not limited to:\n\n"other purposes"\n"purposes described in our policy"\n\nPurpose entities are phrases in segment text that refer to the purposes for which USER\'S PERSONAL DATA will be used, collected, processed, protected, shared with the third parties, etc. The purpose entity must be mentioned in one of the following context types:\n1. first-party-collection-use - the policy segment mentions collection, usage, or processing of this datum by the first party (the a

### Or, custom load (alternative to the preset; overwrites the variables set above)

#### Load brat dataset

In [2]:

import dataclasses

from pybrat.parser import (
    BratParser,
    Entity,
    Event,
    Example,
    Relation,
)

# Build the BRAT parser. `error="ignore"` is the active choice; the strict
# default constructor is kept below as a toggle.
# NOTE(review): presumably "ignore" skips malformed annotations instead of
# raising — confirm against the pybrat documentation.
brat = BratParser(error="ignore")
# brat = BratParser()

# Parse every annotated document found under the configured BRAT data directory.
annotations = brat.parse(BRAT_DATA_PATH)


In [3]:
# Derive the per-example entity records from the parsed annotations.
# The sentence-level extractor is active; the segment-level one is kept as a toggle.
data_entities = a_utils.get_data_entities_of_sentences(annotations)
# data_entities = a_utils.get_data_entities_of_segments(annotations)

# Show how many records were produced, plus one sample record.
(len(data_entities), data_entities[10])

(1067,
 {'segment': 'INTRODUCTION\n\nDomains Holdings Group Limited (the "Company"), offers a global network of content-rich web, mobile and social sites, including user acquisition and media buying (including paid search, display, mobile and social advertising), content optimization (including key word search, off-page and on-page optimization), and affiliate program, both in the web and mobile ecosystem (including through websites or mobile apps operated by Company; collectively, the "Services").\nThe Company is deeply committed to safeguard the privacy expectations of its end users ("User(s)", "you" or "your"). Accordingly, we have put in place this Privacy Policy, which outlines our data protection practices, including how we collect, use, disclose and protect your Personal Information, as well as your rights with respect to your Personal Information.\nIn this Privacy Policy, you can learn about:\nWHAT INFORMATION WE COLLECT?\nMINORS\nHOW WE USE THE COLLECTED INFORMATION?\nMARKETIN

#### Select training data

In [4]:
# Choose which records are used for training and which for validation.
# `better_split` is active; `simple_split` is kept as a toggle.
(
    training_set_indices,
    validation_set_indices,
) = std.better_split(data_entities)
# training_set_indices, validation_set_indices = std.simple_split(data_entities)

# Report the size of each split.
(len(training_set_indices), len(validation_set_indices))

(40, 8)

#### Set parameters

In [92]:
# Turn the entity records into chat-format training examples.
# Exactly one converter is active; the rest are earlier variants kept as toggles.
all_data = m_utils.as_training_data_for_data_span_of_sentence_only(data_entities)
# all_data = m_utils.as_training_data_for_data_span_of_segment(data_entities)
# all_data = m_utils.as_training_data_for_data_span_of_segment_1_1(data_entities)
# all_data = m_utils.as_training_data_for_data_span_of_sentence(data_entities)
# all_data = m_utils.as_training_data_for_data_span_of_sentence_1(data_entities)  # Haven't used here
# all_data = m_utils.as_training_data_for_data_classification_of_segment(data_entities)
# all_data = m_utils.as_training_data_for_data_classification_of_segment_gradual(data_entities)

# Model to fine-tune; swap the active line to switch base models.
base_model = 'gpt-4o-mini-2024-07-18'
# base_model = 'gpt-4o-2024-08-06'

# Suffix passed to the fine-tuning call.
suffix = 'data_entity-sent_data-ver2'
# suffix = 'sent-data-entities-30-train-5-val-with-empty'

# The job description reuses the suffix.
job_desc = suffix

# Preview the first generated training example.
all_data[0]

{'messages': [{'role': 'system',
   'content': 'You are an annotation expert. You will be given a segment of a privacy policy of a web or mobile application, and will be asked to annotate entities in it.\nPlease annotate the given sentence of privacy policy for data entities, and adhere to the following guidelines:\n\nIMPORTANT: Filtering Out General Phrases\nBefore annotating, carefully check each potential data entity. DO NOT annotate general phrases that do not provide specific data types.\nExamples of general phrases to omit include, but are not limited to:\n\n"the information we collect about you"\n"other data"\n"any information"\n\nIf a phrase does not clearly indicate a specific type of personal data, DO NOT include it in your annotations.\n\nData entities are refers to the phrases that mention PERSONAL DATA OF THE USER which is being mentioned in one of the following context types:\n1. first-party-collection-use - the policy segment mentions collection, usage, or processing of 

## Fine-tune model with specified training data

In [42]:
# Launch the fine-tuning job. According to the recorded log output, this builds
# the training/validation files locally, uploads them, creates the job, and
# saves a job description.
job_desc_dir, fine_tune_job, test_set = llm_utils.fine_tune_with_data(
    all_data,
    training_set_indices,
    validation_set_indices,
    basemodel=base_model,
    fine_tune_args=dict(suffix=suffix),
    desc=job_desc,
)

# Show where the job was recorded, the job object, and the held-out examples
# (in the recorded run: 917 = 1067 total - 120 training - 30 validation).
(job_desc_dir, fine_tune_job, len(test_set), test_set[0])

Constructing training and validation data files (locally)...
Uploading training and validation data files...
Data files uploading (API calls returned).
Creating fine-tuning job...
Created fine-tuning job.
Job description saved.


(PosixPath('../out/fine_tune-2024-10-01-15-08-03-gpt-4o-2024-08-06'),
 FineTuningJob(id='ftjob-yyumApfpV4ITCmtd9fTc37Xl', created_at=1727791686, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-4o-2024-08-06', object='fine_tuning.job', organization_id='org-B2C2pNzAq4paAOvhIYdFJlSv', result_files=[], seed=1649472628, status='validating_files', trained_tokens=None, training_file='file-uepY3AjPsSu01K8fSmlqvpwk', validation_file='file-Jxbkh69ufyTbSKsrsenaXiqw', estimated_finish=None, integrations=[FineTuningJobWandbIntegrationObject(type='wandb', wandb=FineTuningJobWandbIntegration(project='renyuneyun-university-of-oxford', entity=None, name=None, tags=None, run_id='ftjob-yyumApfpV4ITCmtd9fTc37Xl'))], user_provided_suffix='purpose_span-sent_entity-v2-d4'),
 917,
 {'messages': [{'role': 'system',
    'content': 'You are an annotation expe

In [9]:
# Interactive post-mortem debugger for the most recent traceback. The recorded
# session below was stopped inside `llm_utils.fine_tune_with_data`. This is a
# manual debugging aid only; it is not a step of the fine-tuning workflow.
%debug

> [0;32m/home/ryey/workspaces/oxford/PP-DToU/fine-tune/src/llm_utils.py[0m(50)[0;36mfine_tune_with_data[0;34m()[0m
[0;32m     48 [0;31m    [0mfl[0m[0;34m.[0m[0msymlink_to[0m[0;34m([0m[0mjob_desc_dir[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     49 [0;31m[0;34m[0m[0m
[0m[0;32m---> 50 [0;31m    [0mtraining_set[0m [0;34m=[0m [0;34m[[0m[0mall_data[0m[0;34m[[0m[0mi[0m[0;34m][0m [0;32mfor[0m [0mi[0m [0;32min[0m [0mtraining_set_indices[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     51 [0;31m    [0mvalidation_set[0m [0;34m=[0m [0;34m[[0m[0mall_data[0m[0;34m[[0m[0mi[0m[0;34m][0m [0;32mfor[0m [0mi[0m [0;32min[0m [0mvalidation_set_indices[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     52 [0;31m    [0mtest_set[0m [0;34m=[0m [0;34m[[0m[0mall_data[0m[0;34m[[0m[0mi[0m[0;34m][0m [0;32mfor[0m [0mi[0m [0;32min[0m [0mset[0m[0;34m([0m[0mrange[0m[0;34m([0m[0mlen[0m[0;34m([0m

In [41]:
# Query the fine-tuning job without waiting for it to finish; the returned
# job object is displayed as the cell output.
# NOTE(review): which job is inspected, and what "clean up" removes, is decided
# inside llm_utils — confirm there before relying on it.
llm_utils.await_fine_tune_finish_and_clean_up(
    wait_for_job_completion=False,
)

FineTuningJob(id='ftjob-61c0FVgYkOfjZaMwRpOjNOzo', created_at=1726265855, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-4o-2024-08-06:rui:30-train-5-val-with-empty-from-api:A790JXld', finished_at=1726266269, hyperparameters=Hyperparameters(n_epochs=3, batch_size=1, learning_rate_multiplier=2), model='gpt-4o-2024-08-06', object='fine_tuning.job', organization_id='org-B2C2pNzAq4paAOvhIYdFJlSv', result_files=['file-1TTTIJ6t7DVXg51yYafkwd4J'], seed=1504396448, status='succeeded', trained_tokens=68019, training_file='file-iYtsamFpHaR3FyqGhHWNeMVm', validation_file='file-C1S6NC5e9VWZrwgmuFzkDxVR', estimated_finish=None, integrations=[], user_provided_suffix='30_train-5_val-with_empty-from_api')

In [31]:
# ID of the fine-tuned model selected in this cell.
# The earlier candidate below used to be assigned and then immediately
# overwritten (a dead store); it is kept as a comment for reference only.
# fine_tuned_model_id = 'ft:gpt-4o-mini-2024-07-18:rui:30_train-5_val-content_only-from_api:A6KhgqGM'
fine_tuned_model_id = 'ft:gpt-4o-mini-2024-07-18:rui:30-train-5-val-content-only-from-api:A6cig7w6'