In [1]:
%reload_ext autoreload
%autoreload 2

from pprint import pprint
import csv
from copy import deepcopy
import json
import pathlib
from pathlib import Path
from openai import OpenAI
from dotenv import load_dotenv
import src.annotation_utils as a_utils
import src.llm_utils as llm_utils
import src.stats_utils as s_utils
from src.env import (
    BRAT_DATA_PATH,
    F_DATA_CATEGORY_DEFINITION as DATA_DEF_FILE
)
import os

# Load variables from a local `.env` file into the process environment
# (python-dotenv). NOTE(review): presumably this supplies the OpenAI API
# credentials for the `OpenAI` client imported above -- confirm.
load_dotenv()


True

## Load saved queries (for model evaluation)

In [2]:
# Which saved evaluation run to load.
#   None -> whatever `llm_utils.load_saved_llm_queries` picks by default
#           (its selection rule is not visible in this notebook; the recorded
#           output below shows a 'gpt-4o-2024-08-06' run was loaded).
# Runs that were inspected previously (set `eval_dir` to one of these to
# reproduce them):
#   'eval-2024-09-14-15-17-57-gpt-4o-mini-2024-07-18'
#   'eval-2024-09-17-22-35-35-gpt-4o-2024-08-06'
#       NOTE(review): this one was referenced through an absolute path under
#       `fine-tune/out/exp/` -- confirm how the loader resolves relative names.
eval_dir = None

# The loader's result unpacks directly into (description, queries); no
# intermediate list is needed.
desc, saved_queries = llm_utils.load_saved_llm_queries(eval_dir)

# Quick sanity check: run description, number of queries, and the first query.
desc, len(saved_queries), saved_queries[0]

({'model': 'gpt-4o-2024-08-06', 'description': 'purpose_span-sent_entity'},
 1067,
 {'input': [{'role': 'system',
    'content': 'You are an annotation expert. You will be given a segment of a privacy policy of a web or mobile application, and will be asked to annotate purpose entities in it.\n\nIMPORTANT: Filtering Out General Phrase\nBefore annotating, carefully check each potential purpose entity. DO NOT annotate general phrases that do not provide specific purpose types.\nExamples of general phrases to omit include, but are not limited to:\n\n"other purposes"\n"purposes described in our policy"\n\nPurpose entities are phrases in segment text that refer to the purposes for which USER\'S PERSONAL DATA will be used, collected, processed, protected, shared with the third parties, etc. The purpose entity must be mentioned in one of the following context types:\n1. first-party-collection-use - the policy segment mentions collection, usage, processing, storage, retention, deletion, or pro

## Calculate statistics for model performance

In [7]:
# Score the evaluation loaded above and print the summary.
# NOTE(review): 0.3 is an untuned-looking magic number; consider moving it to
# a named constant in a config cell once its meaning is confirmed against
# `src.stats_utils`.
s_utils.calc_and_print_statistics(
    desc,
    saved_queries,
    try_heuristic_parse=True,
    lcs_threshold=0.3,
)

Stat for eval with desc: {'model': 'gpt-4o-2024-08-06', 'description': 'purpose_span-sent_entity'}
  1049 valid datapoints, avg. precission, recall, f1: [0.80795824 0.81436205 0.80934799]
  20 (ought to be) non-empty datapoints, avg. precission, recall, f1: [0.42740944 0.76328958 0.50030217]
  1029 (ought to be) empty datapoints, avg. precission, recall, f1: [0.81535471 0.81535471 0.81535471]
  18 datapoints are not valid (e.g. not JSON; malformed model output)
  {109: ('```json\n'
       '[\n'
       '    {\n'
       '        "type": "first-party-collection-use",\n'
       '        "text": "providing you with promotional materials, concerning '
       'the Services as well as products, services, websites and applications '
       'which relate to: (i) other companies within the Company; or (ii) the '
       "Company's business partners and affiliates (collectively: "
       '\\"Marketing Affiliates\\"), which we believe may interest you"\n'
       '    },\n'
       '    {\n'
       ' 

In [4]:
# For query '2024-09-13-00-44-00-6cbf289f-798c-46ff-8d72-221b0da1099e'
# For model 'ft:gpt-4o-mini-2024-07-18:rui:30-train-5-val-content-only-from-api:A6cig7w6'
#
# NOTE(review): this cell depends on hidden kernel state. `np`,
# `result_score_list`, `non_empty_result_score_list`,
# `empty_result_score_list` and `failed` are not defined or imported anywhere
# in the visible notebook, so the cell fails on Restart & Run All. The same
# three-way summary appears in the output of
# `s_utils.calc_and_print_statistics` above, which presumably supersedes this
# cell -- confirm, then drop it or regenerate it from the saved query.
#
# Prints mean (precision, recall, f1) over: all valid datapoints, datapoints
# whose expected answer is non-empty, and datapoints whose expected answer is
# empty; then shows the datapoints that could not be parsed.
# print(np.mean(result_score_list, axis=0))
print(f"{len(result_score_list)} valid datapoints, avg. precission, recall, f1:", np.mean(result_score_list, axis=0))
print(f"{len(non_empty_result_score_list)} (ought to be) non-empty datapoints, avg. precission, recall, f1:", np.mean(non_empty_result_score_list, axis=0))
print(f"{len(empty_result_score_list)} (ought to be) empty datapoints, avg. precission, recall, f1:", np.mean(empty_result_score_list, axis=0))
print(f"{len(failed)} datapoints are not valid JSON")
failed

146 valid datapoints, avg. precission, recall, f1: [0.02842466 0.02796804 0.02663079]
10 (ought to be) non-empty datapoints, avg. precission, recall, f1: [0.415      0.40833333 0.38880952]
136 (ought to be) empty datapoints, avg. precission, recall, f1: [0. 0. 0.]
0 datapoints are not valid JSON


{}

In [30]:
# For query '2024-09-13-11-25-49-a393b3ea-6ae0-4433-8e92-610bd63e1305'
# For model 'ft:gpt-4o-mini-2024-07-18:rui:30-train-5-val-with-empty-from-api:A6o1jAxy'
#
# NOTE(review): copy of the reporting cell for query
# '2024-09-13-00-44-00-...'; only the header comments differ. It relies on
# `np`, `result_score_list`, `non_empty_result_score_list`,
# `empty_result_score_list` and `failed`, none of which is defined or imported
# in the visible notebook, so it cannot run on a fresh kernel.

# Mean (precision, recall, f1) over all / expected-non-empty / expected-empty
# datapoints, followed by the datapoints whose model output failed to parse.
print(f"{len(result_score_list)} valid datapoints, avg. precission, recall, f1:", np.mean(result_score_list, axis=0))
print(f"{len(non_empty_result_score_list)} (ought to be) non-empty datapoints, avg. precission, recall, f1:", np.mean(non_empty_result_score_list, axis=0))
print(f"{len(empty_result_score_list)} (ought to be) empty datapoints, avg. precission, recall, f1:", np.mean(empty_result_score_list, axis=0))
print(f"{len(failed)} datapoints are not valid JSON")
failed

144 valid datapoints, avg. precission, recall, f1: [0.78908855 0.80578704 0.78842856]
21 (ought to be) non-empty datapoints, avg. precission, recall, f1: [0.2680358  0.38253968 0.26351015]
123 (ought to be) empty datapoints, avg. precission, recall, f1: [0.87804878 0.87804878 0.87804878]
2 datapoints are not valid JSON


{54: ('[] reflections of persona data entities []', '[]'),
 67: ('["name": "Device ID", "type": "identification numbers associated with your device", "type": "personal information"]',
  '["geographical location data", "similar information regarding the location of your mobile device", "location data", "web request", "browser type", "referring / exit pages and URLs", "domain names", "landing pages"]')}

In [24]:
def compare_two_evaluations(eval1, eval2):
    """Split two evaluation runs into shared queries and run-specific queries.

    Two queries are treated as the same datapoint when both the string form of
    ``query['input'][1]`` and the string form of ``query['correct_output']``
    match. (``input[1]`` is presumably the user message that follows the
    system prompt -- confirm against the saved-query schema.)

    Args:
        eval1: ``(description, saved_queries)`` pair for the first run.
        eval2: ``(description, saved_queries)`` pair for the second run.
            Each ``saved_queries`` is an iterable of dicts providing the
            ``'input'`` and ``'correct_output'`` entries used for matching.

    Returns:
        A 4-tuple of lists
        ``(common_queries1, common_queries2, unique_queries1, unique_queries2)``.
        ``common_queries1[i]`` and ``common_queries2[i]`` are the two runs'
        records for the same datapoint. The common lists and
        ``unique_queries1`` follow the order of first appearance in ``eval1``;
        ``unique_queries2`` follows the order of first appearance in
        ``eval2``. The ordering is therefore reproducible across kernel
        sessions, which keeps positional indices reported downstream stable.

    Side effects:
        Prints the number of common keys and of keys unique to each run.

    Note:
        If a run contains several queries with the same key, only the last
        one is kept for that key.
    """
    _desc1, saved_queries1 = eval1
    _desc2, saved_queries2 = eval2

    def to_reverse_map(saved_queries):
        """Index queries by their (input[1], correct_output) string key."""
        return {
            (str(query['input'][1]), str(query['correct_output'])): query
            for query in saved_queries
        }

    reverse_map1 = to_reverse_map(saved_queries1)
    reverse_map2 = to_reverse_map(saved_queries2)

    # Walk the insertion-ordered dicts rather than building sets: set iteration
    # order over string tuples changes with hash randomisation between runs.
    common_keys = [key for key in reverse_map1 if key in reverse_map2]
    unique_keys1 = [key for key in reverse_map1 if key not in reverse_map2]
    unique_keys2 = [key for key in reverse_map2 if key not in reverse_map1]

    print(len(common_keys), len(unique_keys1), len(unique_keys2))

    # Both common lists are built from the same key sequence so they stay
    # aligned index-by-index.
    common_queries1 = [reverse_map1[key] for key in common_keys]
    common_queries2 = [reverse_map2[key] for key in common_keys]
    unique_queries1 = [reverse_map1[key] for key in unique_keys1]
    unique_queries2 = [reverse_map2[key] for key in unique_keys2]

    return common_queries1, common_queries2, unique_queries1, unique_queries2


# Saved evaluation runs to compare pairwise (exactly two are unpacked below).
queries_to_load = [
    # '2024-09-13-00-44-00-6cbf289f-798c-46ff-8d72-221b0da1099e',
    '2024-09-13-11-25-49-a393b3ea-6ae0-4433-8e92-610bd63e1305',
    '2024-09-13-14-48-28-ft:gpt-4o-mini-2024-07-18:rui:30-train-5-val-no-empty-from-api:A70TvfoL',
]
eval1, eval2 = [llm_utils.load_saved_llm_queries(query) for query in queries_to_load[:2]]
common_queries1, common_queries2, unique_queries1, unique_queries2 = compare_two_evaluations(eval1, eval2)

# Score each run on the shared datapoints first (directly comparable), then on
# the datapoints only that run saw. The function is referenced through
# `s_utils`, the only place it is imported from in this notebook; a bare
# `calc_and_print_statistics` name is not defined on a fresh kernel.
s_utils.calc_and_print_statistics(eval1[0], common_queries1)
s_utils.calc_and_print_statistics(eval2[0], common_queries2)
s_utils.calc_and_print_statistics(eval1[0], unique_queries1)
s_utils.calc_and_print_statistics(eval2[0], unique_queries2)

134 12 12
Stat for eval with desc: {'model': 'ft:gpt-4o-mini-2024-07-18:rui:30-train-5-val-with-empty-from-api:A6o1jAxy'}
  133 valid datapoints, avg. precission, recall, f1: [0.81938179 0.8245614  0.82119585]
  10 (ought to be) non-empty datapoints, avg. precission, recall, f1: [0.09777778 0.16666667 0.12190476]
  123 (ought to be) empty datapoints, avg. precission, recall, f1: [0.87804878 0.87804878 0.87804878]
  1 datapoints are not valid JSON
  {53: ('[] reflections of persona data entities []', '[]')}
Stat for eval with desc: {'model': 'ft:gpt-4o-mini-2024-07-18:rui:30-train-5-val-no-empty-from-api:A70TvfoL'}
  134 valid datapoints, avg. precission, recall, f1: [0.02860697 0.03358209 0.03059701]
  10 (ought to be) non-empty datapoints, avg. precission, recall, f1: [0.38333333 0.45       0.41      ]
  124 (ought to be) empty datapoints, avg. precission, recall, f1: [0. 0. 0.]
  0 datapoints are not valid JSON
  {}
Stat for eval with desc: {'model': 'ft:gpt-4o-mini-2024-07-18:rui:30

In [22]:
# NOTE(review): leftover post-mortem debugging session. The recorded output
# shows a `JSONDecodeError("Extra data")` raised in `json/decoder.py` for the
# model output '[] reflections of persona data entities []'. This cell only
# works right after an exception in the same kernel, so remove it (and its
# output) before sharing the notebook.
%debug

> [0;32m/usr/lib/python3.12/json/decoder.py[0m(340)[0;36mdecode[0;34m()[0m
[0;32m    338 [0;31m        [0mend[0m [0;34m=[0m [0m_w[0m[0;34m([0m[0ms[0m[0;34m,[0m [0mend[0m[0;34m)[0m[0;34m.[0m[0mend[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    339 [0;31m        [0;32mif[0m [0mend[0m [0;34m!=[0m [0mlen[0m[0;34m([0m[0ms[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 340 [0;31m            [0;32mraise[0m [0mJSONDecodeError[0m[0;34m([0m[0;34m"Extra data"[0m[0;34m,[0m [0ms[0m[0;34m,[0m [0mend[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    341 [0;31m        [0;32mreturn[0m [0mobj[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    342 [0;31m[0;34m[0m[0m
[0m
'[] reflections of persona data entities []'
'[] reflections of persona data entities []'
'[] reflections of persona data entities []'
'[] reflections of persona data entities []'
