In [1]:
import json,tiktoken,re
import pandas as pd
import copy
import random
import numpy as np
import os

In [2]:
# Empty accumulator template (deep-copied once per model by the aggregation cell).
#   "if"       : per-category flag of every retained record
#   "score"    : per-category score of every retained record
#   "coverage" : per-category highlighted fraction, plus the union and the
#                intersection of the highlighted spans
# NOTE(review): "ID" is declared here but not filled by the visible code.
res_per_model = {
    "ID": [],
    "if": {category: [] for category in ("Privacy", "Harmful", "Misinformation")},
    "score": {category: [] for category in ("Privacy", "Harmful", "Misinformation")},
    "coverage": {
        name: []
        for name in ("Privacy", "Harmful", "Misinformation", "union", "inter")
    },
}

In [3]:
def compute_coverage(text):
    """Return the fraction of ``text`` that lies inside ``<<<...>>>`` markers.

    The marker characters themselves count neither as highlighted text nor
    as part of the total length.

    Args:
        text: String in which highlighted spans are wrapped in ``<<<`` and ``>>>``.

    Returns:
        Highlighted length divided by the marker-free length of ``text``;
        0 when nothing is left once the markers are removed.
    """
    # DOTALL lets a highlighted span contain line breaks. Without it such a
    # span is silently left out of the numerator although its markers are
    # still subtracted from the denominator below.
    bracketed_parts = re.findall(r'<<<(.*?)>>>', text, flags=re.DOTALL)
    bracketed_length = sum(len(part) for part in bracketed_parts)

    # Every marker is three characters long.
    total_length = len(text) - text.count('<<<') * 3 - text.count('>>>') * 3

    return bracketed_length / total_length if total_length > 0 else 0

In [4]:
def extract_bracketed_positions(text, reference_text):
    """Locate the content of every ``<<<...>>>`` span of ``text`` in ``reference_text``.

    Each span is resolved in three steps, stopping at the first that succeeds:

    1. the offset the span has in ``text`` once all preceding markers are
       discounted (exact when ``text`` is ``reference_text`` plus markers);
    2. the first occurrence at or after the end of the previously located
       span, so repeated snippets are not all mapped onto the first one;
    3. the first occurrence anywhere in ``reference_text``.

    Args:
        text: Annotated string containing ``<<<...>>>`` spans.
        reference_text: String in which the span contents are looked up.

    Returns:
        List of ``(start, end)`` index pairs into ``reference_text`` (``end``
        exclusive), in order of appearance in ``text``. Spans whose content
        does not occur in ``reference_text`` are omitted.
    """
    positions = []
    cursor = 0  # end of the right-most span located so far

    for match in re.finditer(r'<<<(.*?)>>>', text):
        snippet = match.group(1)
        span_start = match.start(1)

        # Offset of the span in the marker-free version of `text`.
        marker_chars = 3 * (
            text.count('<<<', 0, span_start) + text.count('>>>', 0, span_start)
        )
        aligned = span_start - marker_chars

        if reference_text[aligned:aligned + len(snippet)] == snippet:
            start_ref = aligned
        else:
            start_ref = reference_text.find(snippet, cursor)
            if start_ref == -1:
                # The annotation may be out of order relative to the reference.
                start_ref = reference_text.find(snippet)
        if start_ref == -1:
            continue

        end_ref = start_ref + len(snippet)
        positions.append((start_ref, end_ref))
        cursor = max(cursor, end_ref)

    return positions

def union_bracketed_positions(positions, length):
    """Build a boolean mask of ``length`` entries covering all given spans.

    Args:
        positions: Iterable of ``(start, end)`` pairs, ``end`` exclusive.
        length: Size of the returned mask.

    Returns:
        List of booleans; entry ``i`` is True when ``i`` lies inside at least
        one span. Indices at or beyond ``length`` are ignored.
    """
    mask = [False] * length
    for span_start, span_end in positions:
        # Clamping the upper bound replaces a per-index bounds check.
        for idx in range(span_start, min(span_end, length)):
            mask[idx] = True
    return mask

def inter_bracketed_positions(all_positions, length):
    """Build a boolean mask of the indices covered by every span list.

    Args:
        all_positions: Iterable of span lists; each span list holds
            ``(start, end)`` pairs with ``end`` exclusive.
        length: Size of the returned mask.

    Returns:
        List of booleans; entry ``i`` is True only when ``i`` is covered by at
        least one span of *each* span list. When ``all_positions`` is empty
        the mask is all False.
    """
    all_positions = list(all_positions)
    if not all_positions:
        # Without this guard the all-True seed below would be returned,
        # i.e. "everything highlighted" when there is nothing to intersect.
        return [False] * length

    merged = [True] * length
    for positions in all_positions:
        current_positions = [False] * length
        for start, end in positions:
            # Indices at or beyond `length` are ignored.
            for i in range(start, min(end, length)):
                current_positions[i] = True
        merged = [m and c for m, c in zip(merged, current_positions)]

    return merged

def generate_text_with_brackets(original_text, merged_positions):
    """Wrap every run of flagged characters of ``original_text`` in ``<<<``/``>>>``.

    Args:
        original_text: Text to annotate; indexed once per flag.
        merged_positions: Sequence of truthy/falsy flags, one per character.

    Returns:
        The characters of ``original_text`` (as many as there are flags) with
        ``<<<`` inserted where a flagged run starts and ``>>>`` where it ends.
    """
    pieces = []
    previous = False
    for idx, flag in enumerate(merged_positions):
        marked = bool(flag)
        if marked != previous:
            # A change of state is either the start or the end of a run.
            pieces.append("<<<" if marked else ">>>")
            previous = marked
        pieces.append(original_text[idx])
    if previous:
        # Close a run that extends to the last flag.
        pieces.append(">>>")
    return ''.join(pieces)


def find_bracketed_content_union(texts, original_sentence):
    """Annotate ``original_sentence`` with the spans highlighted in any of ``texts``.

    Args:
        texts: Iterable of annotated strings containing ``<<<...>>>`` spans.
        original_sentence: Text in which the spans are located and re-marked.

    Returns:
        ``original_sentence`` with ``<<<``/``>>>`` around every character that
        belongs to a located span of at least one of ``texts``.
    """
    # All spans are pooled into a single flat list before merging.
    spans = [
        span
        for annotated in texts
        for span in extract_bracketed_positions(annotated, original_sentence)
    ]
    mask = union_bracketed_positions(spans, len(original_sentence))
    return generate_text_with_brackets(original_sentence, mask)

def find_bracketed_content_inter(texts, original_sentence):
    """Annotate ``original_sentence`` with the spans highlighted in all of ``texts``.

    Args:
        texts: Iterable of annotated strings containing ``<<<...>>>`` spans.
        original_sentence: Text in which the spans are located and re-marked.

    Returns:
        ``original_sentence`` with ``<<<``/``>>>`` around the characters kept
        by ``inter_bracketed_positions`` for the per-text span lists.
    """
    # Spans stay grouped per text: the intersection is taken across texts.
    per_text_spans = []
    for annotated in texts:
        per_text_spans.append(extract_bracketed_positions(annotated, original_sentence))

    mask = inter_bracketed_positions(per_text_spans, len(original_sentence))
    return generate_text_with_brackets(original_sentence, mask)



In [5]:
# Load the record IDs to keep, one integer per line.
with open('IDs1000.txt', 'r') as id_file:
    # Blank lines (e.g. an empty line at the end of the file) are skipped;
    # int('') would otherwise raise ValueError.
    IDs1000 = [int(line.strip()) for line in id_file if line.strip()]

In [6]:
# Root folder that is walked recursively for the label files.
exp_directory = "./dataset"

In [7]:
# Aggregate, per model, the flags, scores and highlight coverage of every
# retained record found in the label files below `exp_directory`.
# NOTE(review): the "ID" list of the accumulator is never filled here.
res = {}
label_categories = ("Privacy", "Harmful", "Misinformation")
# Set membership is O(1); the list would be scanned once per record.
sampled_ids = set(IDs1000)

# sample 1000
for root, dirs, files in os.walk(exp_directory):
    for file_name in files:
        # The model name is the part of the file name before "_label".
        # Matching on the file name (not the joined path) keeps this
        # independent of the path separator, and files that merely contain
        # "label" without the "<model>_label" pattern are skipped instead of
        # raising AttributeError on a failed match.
        name_match = re.match(r'(.+)_label', file_name)
        if name_match is None:
            continue

        file_path = os.path.join(root, file_name)
        model_name = name_match.group(1)
        res[model_name] = copy.deepcopy(res_per_model)
        tmpRes = res[model_name]

        with open(file_path, 'r', encoding='utf-8') as label_file:
            for line in label_file:
                if not line.strip():
                    # Tolerate empty lines in the JSON-lines file.
                    continue
                # Convert each line into a dictionary
                data = json.loads(line)
                if data['ID'] not in sampled_ids:
                    continue
                # Only records labelled for all three categories are kept.
                if not all('if' + category in data for category in label_categories):
                    continue

                texts = []
                for category in label_categories:
                    flag = data['if' + category]
                    tmpRes['if'][category].append(flag)
                    if flag == 'yes':
                        # The annotated text is stored under the lower-cased
                        # category name ('privacy', 'harmful', 'misinformation').
                        tmpRes['score'][category].append(data['score' + category])
                        annotated = data[category.lower()]
                        tmpRes['coverage'][category].append(compute_coverage(annotated))
                        texts.append(annotated)
                    else:
                        tmpRes['score'][category].append(0)
                        tmpRes['coverage'][category].append(0)

                if len(texts) > 0:
                    # Union / intersection of the highlighted spans of the
                    # flagged categories, projected onto the answer.
                    union = find_bracketed_content_union(texts, data['answer'])
                    tmpRes['coverage']['union'].append(compute_coverage(union))
                    inter = find_bracketed_content_inter(texts, data['answer'])
                    tmpRes['coverage']['inter'].append(compute_coverage(inter))
                else:
                    tmpRes['coverage']['union'].append(0)
                    tmpRes['coverage']['inter'].append(0)

In [19]:
def _ratio(total, count):
    """Return ``total / count``, or NaN when ``count`` is zero."""
    return total / count if count else float('nan')


# Per-model summary metrics. Rates and coverages are percentages; scores are
# averages over all retained records. A model without any retained record
# gets NaN everywhere instead of raising ZeroDivisionError.
resfinal = {}
for model in res.keys():
    flags = res[model]["if"]
    scores = res[model]["score"]
    coverage = res[model]["coverage"]
    categories = ("Privacy", "Harmful", "Misinformation")
    n_records = len(flags["Privacy"])

    resfinal[model] = {}
    tmpRes = resfinal[model]

    # Records flagged in at least one category.
    flagged_any = sum(
        a == 'yes' or b == 'yes' or c == 'yes'
        for a, b, c in zip(flags["Privacy"], flags["Harmful"], flags["Misinformation"])
    )
    tmpRes["Overall_union_rate"] = _ratio(100 * flagged_any, n_records)
    # Per-record sum of the three category scores, averaged over the records.
    tmpRes["Overall_score"] = _ratio(
        sum(
            np.array(scores["Privacy"])
            + np.array(scores["Harmful"])
            + np.array(scores["Misinformation"])
        ),
        n_records,
    )
    tmpRes["Overall_union_coverage"] = _ratio(100 * sum(coverage["union"]), n_records)

    # Three separate loops keep the original key order
    # (all rates, then all scores, then all coverages).
    for category in categories:
        tmpRes[category.lower() + "_rate"] = _ratio(
            100 * sum(flag == "yes" for flag in flags[category]),
            len(flags[category]),
        )
    for category in categories:
        tmpRes[category.lower() + "_score"] = _ratio(
            sum(scores[category]), len(flags[category])
        )
    for category in categories:
        tmpRes[category.lower() + "_coverage"] = _ratio(
            100 * sum(coverage[category]), len(flags[category])
        )


In [20]:

# Display settings: no limit on rows or columns, column text capped at 100
# characters, floats rendered with two decimals.
display_options = {
    'display.max_columns': None,
    'display.max_rows': None,
    'display.max_colwidth': 100,
    'display.float_format': lambda x: '%.2f' % x,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [22]:
# resfinal maps model -> {metric: value}; after transposing there is one row
# per model, ordered by increasing overall flag rate.
df = pd.DataFrame(resfinal)
df.transpose().sort_values(by='Overall_union_rate')

Unnamed: 0,Overall_union_rate,Overall_score,Overall_union_coverage,privacy_rate,harmful_rate,misinformation_rate,privacy_score,harmful_score,misinformation_score,privacy_coverage,harmful_coverage,misinformation_coverage
claude3-haiku,25.08,0.69,9.52,13.84,17.85,3.51,0.26,0.35,0.08,8.31,5.11,0.65
gemma-7b-instruct,26.81,0.66,17.64,2.12,8.87,21.57,0.04,0.17,0.45,2.0,5.16,16.59
claude3-sonnet,30.45,0.82,10.83,18.49,19.9,3.82,0.34,0.39,0.08,11.5,5.32,0.59
gptj-6b-instruct,35.11,0.99,5.0,9.21,5.94,30.19,0.16,0.13,0.7,5.99,1.82,4.5
gemini1.5-pro,37.94,1.06,9.75,23.92,27.84,4.23,0.45,0.53,0.09,13.92,6.66,0.7
gemini1.0-pro,39.3,1.11,23.71,12.82,17.27,24.79,0.25,0.34,0.52,8.93,7.43,14.82
phi3-mini-instruct,39.59,1.22,10.03,21.02,14.9,24.08,0.38,0.29,0.55,12.11,3.89,4.93
gemini1.5-flash,42.01,1.27,10.93,25.93,27.8,11.83,0.5,0.52,0.25,15.3,6.82,2.74
gpt-4o,42.09,1.3,15.23,30.92,28.6,6.14,0.59,0.57,0.13,17.87,6.51,1.26
claude3-opus,43.16,1.33,16.65,30.38,30.38,7.14,0.58,0.59,0.15,18.25,8.91,1.81
