In [2]:
import pandas as pd
import re, string
import numpy as np
import math
from tqdm import tqdm 
from experiments.RQ1.utils import get_chatgpt_response, get_atomic_facts_gpt

In [3]:
# get_chatgpt_response('hello')

In [13]:
model_filter = {
    'FacEval': ['bart_large', 'co-ref bart large', 'condigsum bart large','gpt4-32k-0613','mv-bart_large', 'alpaca-13b'],
    'SAMSum': ['BART', 'CODS', 'MV-BART', 'UniLM', 'gpt4-32k-0613', 'alpaca-13b'],
    'DialogueSum': ['BART', 'CODS', 'MV-BART', 'UniLM', 'gpt4-32k-0613', 'alpaca-13b']

}
def read_filter(filename):
    df = pd.read_csv(filename)
    print(len(df))
    df_datasets = []
    unique_datasets = list(set(df['origin'].values))
    # print(unique_datasets)
    for dataset in unique_datasets: 
        df_origin = df[df['origin'] == dataset]
        df_origin = df_origin[df_origin['model'].isin(model_filter[dataset])]
        # print(len(df_origin))
        unique_docids = list(set(df_origin['docid'].values))
        #### test ###
        num_models = []
        
        for udocid in unique_docids:
            df_docid = df_origin[df_origin['docid'] == udocid]
            num_models.append(len(list(set(df_docid['model'].values))))
        # print(df_docid)
        
        assert(len(set(num_models)) == 1) 
        df_datasets.append(df_origin)
    df_filtered = pd.concat(df_datasets)
    assert(len(df_filtered) <= len(df))
    return df_filtered


In [14]:
num_samples = 4
def make_incontext_examples(df, num_samples):
    each_sample = int(num_samples/2)
    num_errors = []
    print('Before sampling', len(df))
    for nonfactual_spans in list(df['nonfactual_spans'].values):
        nonfactual_spans = eval(nonfactual_spans)
        num_errors += [len(nonfactual_spans)]
    df['Errors'] = num_errors
    df_fewshot_spans_one = df[df['Errors'] == 1].sample(each_sample )
    df_fewshot_spans_two = df[df['Errors'] > 1].sample(each_sample )
    df_fewshot_nospans = df[df['Errors'] == 0].sample(1)
    df_fewshot =pd.concat([df_fewshot_spans_one, df_fewshot_spans_two, df_fewshot_nospans])
    df_fewshot = df_fewshot.sample(len(df_fewshot))
    df_fewshot.to_csv('fewshot_examples.csv')

    for idx, row in df_fewshot.iterrows():
        df = df.drop(idx)
    print('After sampling', len(df))
    return df

def make_fewshot_str(df_fewshot, instruction_template, prompt_template, source_type = 'Dialogue'):
    fewshot_prompt_strs = []
    for idx, row in df_fewshot.iterrows():
        if source_type == 'Dialogue':
            source = row['dialogue']
        else:
            source = row['dialogue_atomic_facts']
        summ = row['summary']
        inconsistent_spans = eval(row['nonfactual_spans'])
        inconsistent_spans = '\n'.join(inconsistent_spans)
        inconsistent_spans = inconsistent_spans if inconsistent_spans else "None"
        instruction = ''
        if idx == list(df_fewshot.index.values)[0]:
            instruction = instruction_template.format(source_type = source_type)
        
        prompt = prompt_template.format(instruction = instruction,
                                        source_type = source_type,
                                        source = source,
                                        summ = summ)
        prompt += ' '+ inconsistent_spans
        fewshot_prompt_strs.append(prompt)
    return '\n'.join(fewshot_prompt_strs)

In [15]:
import spacy
from thefuzz import fuzz
def postprocess(text):
    text = text.strip(string.punctuation).lower()
    return text
    

def calculate_f1(matched, pred_spans, annotated_spans):
    found_pred_spans = set([each[0] for each in matched])
    found_rec_spans = set([each[1] for each in matched])
    
    precision = len(found_pred_spans)/len(pred_spans)
    recall  = len(found_rec_spans)/len(annotated_spans)
    
    if precision + recall > 0:
        f1_score = (2 * precision * recall)/(precision + recall)
    else:
        f1_score = 0
    return f1_score
    
def get_f1_scores(all_pred_spans, nonfactual_spans):
    
    matched_pred_fuzzy = []
    matched_pred_exact = []
    for pred_span in all_pred_spans:
        for ref_span in nonfactual_spans:
            fuzzy_score = fuzz.partial_ratio(pred_span, ref_span)
            # print(pred_span, ref_span, fuzzy_score)
            if fuzzy_score > 80:
                matched_pred_fuzzy.append((pred_span, ref_span))
            if pred_span == ref_span:
                matched_pred_exact.append((pred_span, ref_span))
            
    matched_pred_fuzzy = list(set(matched_pred_fuzzy))
    matched_pred_exact = list(set(matched_pred_exact))
    # print('MATCHED LEN AND STRICT', matched_pred_fuzzy, matched_pred_exact)
    # print('PRED, ANN', all_pred_spans, nonfactual_spans)
    
    f1_score_lenient = calculate_f1(matched_pred_fuzzy, all_pred_spans, nonfactual_spans)
    f1_score_strict = calculate_f1(matched_pred_exact, all_pred_spans, nonfactual_spans)
    return f1_score_lenient, f1_score_strict
    
def gpt_text_inconsistent(gpt_response):
    inconsistent_phrases = ['no inconsisten', 'none', '[]', 'is consistent']
    if not gpt_response.strip():
        return True
    
    elif [each  for each in inconsistent_phrases if each in gpt_response.lower()]:
        return True
    return False


def get_gpt_span_scores(prompt, nonfactual_spans):
    nlp = spacy.load('en_core_web_sm')
    row_f1_scores = []
    row_acc_scores = []
    
    gpt_response = get_chatgpt_response(prompt)
    all_pred_spans = gpt_response.split('\n')
    
    if gpt_text_inconsistent(gpt_response):
        if not nonfactual_spans:
            f1_score_lenient = 1
            f1_score_strict = 1
        else:
            f1_score_lenient = 0
            f1_score_strict = 0
            
    else:
        if not nonfactual_spans:
            f1_score_lenient = 0
            f1_score_strict = 0
        else:
            f1_score_lenient, f1_score_strict = get_f1_scores(all_pred_spans, nonfactual_spans)
            
    return f1_score_lenient, f1_score_strict, all_pred_spans

In [19]:
from tqdm import tqdm
def get_zero_shot_scores(df, source_type = 'Dialogue'):
    span_f1_lenient_scores = []
    span_f1_strict_scores = []
    gpt_responses = []
    
    # for idx, row in tqdm(df.iterrows(), total = df.shape[0]):
    row_idx = 0
    # for idx, row in tqdm(df.iterrows(), total = df.shape[0]):
    for idx, row in df.iterrows():
        if source_type == 'Dialogue':
            source = row['dialogue']
        else:
            source = row['dialogue_atomic_facts']

        summ = row['summary']
        
        nonfactual_spans = eval(row['nonfactual_spans'])
        # print('PROMPT', prompt_dlg)
        # print('ANNOTATED', nonfactual_spans)
        
        instruction = instruction_template.format(source_type = source_type)
        
        prompt = prompt_template.format(instruction = instruction,
                                        source_type = source_type,
                                        source = source,
                                        summ = summ) 
            
        
        span_f1_lenient, span_f1_strict, gpt_response= get_gpt_span_scores(prompt, nonfactual_spans)
        print(row_idx)
        if row_idx % row_idx == 0:
            
            print(prompt)
            print('PRED', gpt_response)
            print('ANN', nonfactual_spans)
            print(span_f1_lenient, span_f1_strict)
            print('***')
        row_idx += 1 
        span_f1_lenient_scores.append(span_f1_lenient)
        span_f1_strict_scores.append(span_f1_strict)
        gpt_responses.append(gpt_response)
    return span_f1_lenient_scores, span_f1_strict_scores, gpt_responses

In [18]:
read_path = '/home/ramprasad.sa/factual_evaluation_source_based/annotations/xformer_llm_annotated.csv'
df_filtered = read_filter(read_path)
df_filtered = df_filtered[df_filtered['origin'] != 'FacEval']
df_filtered

2034


Unnamed: 0.1,Unnamed: 0,docid,model,nonfactual_spans,evidence,summary,factual_error,error_type,dialogue,origin,dialogue_atomic_facts
40,40,13681241,MV-BART,[],[],Callum is still busy and sorry.,0,[],Jair: Still busy?\r\nCallum: Yes a little sorr...,SAMSum,Jair asks if Callum is still busy.\nCallum con...
41,41,13681241,CODS,[],[],Callum is busy.,0,[],Jair: Still busy?\r\nCallum: Yes a little sorr...,SAMSum,Jair asks if Callum is still busy.\nCallum con...
42,42,13681241,UniLM,[],[],Callum is still busy .,0,[],Jair: Still busy?\r\nCallum: Yes a little sorr...,SAMSum,Jair asks if Callum is still busy.\nCallum con...
43,43,13681241,BART,[],[],Callum is busy.,0,[],Jair: Still busy?\r\nCallum: Yes a little sorr...,SAMSum,Jair asks if Callum is still busy.\nCallum con...
44,44,13681241,gpt4-32k-0613,[],[],"Jair checks if Callum is still busy, and Callu...",0,[],Jair: Still busy?\r\nCallum: Yes a little sorr...,SAMSum,Jair asks if Callum is still busy.\nCallum con...
...,...,...,...,...,...,...,...,...,...,...,...
2023,2023,test_339,CODS,"[""and #Person1#'s mom "", '#Person1# and #Perso...",[],#Person1# and #Person2# are talking about what...,1,"['Intrinsic_Error', 'Intrinsic_Error']","#Person1#: Okay, next question. If Eric asked ...",DialogueSum,Person1 is asking the next question.\nPerson1 ...
2024,2024,test_339,UniLM,['person1 and person2 '],[],person1 and person2 are asking each other abou...,1,['Intrinsic_Error'],"#Person1#: Okay, next question. If Eric asked ...",DialogueSum,Person1 is asking the next question.\nPerson1 ...
2025,2025,test_339,BART,['##Person1# and #Person2# '],[],##Person1# and #Person2# are talking about wha...,1,['Intrinsic_Error'],"#Person1#: Okay, next question. If Eric asked ...",DialogueSum,Person1 is asking the next question.\nPerson1 ...
2026,2026,test_339,gpt4-32k-0613,[],[],Person1 and Person2 are asking each other hypo...,0,[],"#Person1#: Okay, next question. If Eric asked ...",DialogueSum,Person1 is asking the next question.\nPerson1 ...


In [18]:
from tqdm import tqdm
def get_fewshot_shot_scores(df, fewshot_str, source_type = 'Dialogue'):
    span_f1_lenient_scores = []
    span_f1_strict_scores = []
    gpt_responses = []
    row_idx = 0
    for idx, row in df.iterrows():
        if source_type == 'Dialogue':
            source = row['dialogue']
        else:
            source = row['dialogue_atomic_facts']

        summ = row['summary']
        
        nonfactual_spans = eval(row['nonfactual_spans'])
        
        # instruction = instruction_template.format(source_type = source_type)
        
        prompt = prompt_template.format(instruction = '',
                                        source_type = source_type,
                                        source = source,
                                        summ = summ) 
        prompt = f'{fewshot_str}\n{prompt}'
        
        span_f1_lenient, span_f1_strict, gpt_response= get_gpt_span_scores(prompt, nonfactual_spans)
        
       
        if row_idx % 100 == 0:
            # print(idx)
            print(prompt)
            print('***')
            print('PRED', gpt_response)
            print('ANN', nonfactual_spans)
            print(span_f1_lenient, span_f1_strict)
        print(row_idx)
        row_idx += 1
        span_f1_lenient_scores.append(span_f1_lenient)
        span_f1_strict_scores.append(span_f1_strict)
        gpt_responses.append(gpt_response)
    return span_f1_lenient_scores, span_f1_strict_scores, gpt_responses

In [19]:
instruction_template = f"Identify and list the inconsistent phrases or words in the summary. Note that consistency means all information in the summary is supported by the {{source_type}}"

prompt_template = f'{{instruction}}\n{{source_type}}: {{source}}\nSummary: {{summ}}\nInconsistent Spans ( List each span in a new line) :'


In [20]:
df = read_filter('/home/ramprasad.sa/factual_evaluation_source_based/annotations/xformer_llm_annotated.csv')
df = df[df['origin'] != 'FacEval']
df[df['factual_error'] == 1][:1]



Unnamed: 0.1,Unnamed: 0,docid,model,nonfactual_spans,evidence,summary,factual_error,error_type,dialogue,origin,dialogue_atomic_facts
7,7,test_133,BART,['They '],[],#Person1# and #Person2# talk about the heavy s...,1,['Intrinsic_Error'],"#Person1#: It was a heavy storm last night, wa...",DialogueSum,There was a heavy storm last night.\nThe wind ...


In [21]:
df = make_incontext_examples(df, 4)
df_fewshot = pd.read_csv('fewshot_examples.csv')

Before sampling 1050
After sampling 1045


In [22]:

span_f1_lenient_scores, span_f1_strict_scores, gpt_responses = get_zero_shot_scores(df, source_type = 'Dialogue')



0
Identify and list the inconsistent phrases or words in the summary. Note that consistency means all information in the summary is supported by the Dialogue
Dialogue: #Person1#: It was a heavy storm last night, wasn't it?
#Person2#: It certainly was. The wind broke several windows. What weather!
#Person1#: Do you know that big tree in front of my house? One of the biggest branches came down in the night.
#Person2#: Really? Did it do any damage to your home?
#Person1#: Thank goodness! It is far away from that.
#Person2#: I really hate storms. It's about time we had some nice spring weather.
#Person1#: It's April, you know. The flowers are beginning to blossom.
#Person2#: Yes, that's true. But I still think the weather is terrible.
#Person1#: I suppose we should not complain. We had a fine March after all.
Summary: #Person1# and #Person2# talk about the heavy storm last night. #Person2# thinks the weather is terrible.
Inconsistent Spans ( List each span in a new line) :
PRED ['No incons

In [25]:
df['GPTSpan-ZS_f1_len'] = span_f1_lenient_scores
df['GPTSpan-ZS_f1_exact'] = span_f1_strict_scores 
df['GPTSpan-ZS_text'] = gpt_responses
np.mean(span_f1_lenient_scores), np.mean(span_f1_strict_scores)

(0.6693916609706083, 0.47980861244019135)

In [26]:
span_f1_lenient_scores_afacts, span_f1_strict_scores_afacts, gpt_responses_afacts = get_zero_shot_scores(df, source_type = 'Source')




0
Identify and list the inconsistent phrases or words in the summary. Note that consistency means all information in the summary is supported by the Source
Source: There was a heavy storm last night.
The wind from the storm broke several windows.
There is a big tree in front of Person1's house.
One of the biggest branches from this tree came down in the night.
The fallen branch did not damage Person1's home.
Person2 hates storms.
Person2 desires nice spring weather.
It is April.
The flowers are beginning to blossom.
10.
Person2 thinks the weather is terrible.
March had fine weather.
Summary: #Person1# and #Person2# talk about the heavy storm last night. #Person2# thinks the weather is terrible.
Inconsistent Spans ( List each span in a new line) :
PRED ['#Person1# and #Person2# talk about the heavy storm last night.']
ANN []
0 0
***
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55


In [31]:
np.mean(span_f1_lenient_scores_afacts)

0.5058806106174528

In [27]:
df['GPTSpan-ZS-Afact_f1_len'] = span_f1_lenient_scores_afacts
df['GPTSpan-ZS-Afact_f1_exact'] = span_f1_strict_scores_afacts
df['GPTSpan-ZS-Afact-ZS_text'] = gpt_responses_afacts
df.head()

Unnamed: 0.1,Unnamed: 0,docid,model,nonfactual_spans,evidence,summary,factual_error,error_type,dialogue,origin,dialogue_atomic_facts,Errors,GPTSpan-ZS_f1_len,GPTSpan-ZS_f1_exact,GPTSpan-ZS_text,GPTSpan-ZS-Afact_f1_len,GPTSpan-ZS-Afact_f1_exact,GPTSpan-ZS-Afact-ZS_text
6,6,test_133,CODS,[],[],#Person1# and #Person2# talk about the heavy s...,0,[],"#Person1#: It was a heavy storm last night, wa...",DialogueSum,There was a heavy storm last night.\nThe wind ...,0,1.0,1.0,[No inconsistent spans.],0.0,0.0,[#Person1# and #Person2# talk about the heavy ...
7,7,test_133,BART,['They '],[],#Person1# and #Person2# talk about the heavy s...,1,['Intrinsic_Error'],"#Person1#: It was a heavy storm last night, wa...",DialogueSum,There was a heavy storm last night.\nThe wind ...,1,1.0,0.0,"[""They think the weather is terrible but they ...",0.666667,0.0,[#Person1# and #Person2# talk about the heavy ...
8,8,test_133,MV-BART,"[""it's April "", 'They both ']",[],#Person1# and #Person2# are talking about the ...,1,['Intrinsic_Error'],"#Person1#: It was a heavy storm last night, wa...",DialogueSum,There was a heavy storm last night.\nThe wind ...,2,0.666667,0.0,"[""They both think the weather is terrible""]",1.0,0.0,"[""They both think the weather is terrible"", ""t..."
9,9,test_133,UniLM,"['person1 ', 'but ']",[],person1 and person2 talk about the heavy storm...,1,['Intrinsic_Error'],"#Person1#: It was a heavy storm last night, wa...",DialogueSum,There was a heavy storm last night.\nThe wind ...,2,0.666667,0.0,"[""person1 thinks the weather is terrible .""]",0.666667,0.0,"[""person1 thinks the weather is terrible.""]"
10,10,test_133,gpt4-32k-0613,[],[],Person1 and Person2 discussed the heavy storm ...,0,[],"#Person1#: It was a heavy storm last night, wa...",DialogueSum,There was a heavy storm last night.\nThe wind ...,0,0.0,0.0,"[""Person1 also reminded Person2 that it was Ap...",0.0,0.0,[Person1 and Person2 discussed the heavy storm...


In [32]:
df_fewshot = pd.read_csv('fewshot_examples.csv')
print('FEWSHOT STR')
fewshot_str = make_fewshot_str(df_fewshot, instruction_template, prompt_template, source_type = 'Dialogue')
span_f1_lenient_scores_fewshot, span_f1_strict_scores_fewshot, gpt_responses_fewshot = get_fewshot_shot_scores(df, fewshot_str, source_type = 'Dialogue')
df['GPTSpan-FS_f1_len'] = span_f1_lenient_scores_fewshot
df['GPTSpan-FS_f1_exact'] = span_f1_strict_scores_fewshot
df['GPTSpan-FS_text'] = gpt_responses_fewshot

FEWSHOT STR
Identify and list the inconsistent phrases or words in the summary. Note that consistency means all information in the summary is supported by the Dialogue
Dialogue: #Person1#: Where are you going to spend your holidays this year, Harry?
#Person2#: We may go abroad. I'm not sure. My wife wants to go to Egypt. I'd like to go there, too. We can't make up our minds.
#Person1#: Will you travel by sea or by air?
#Person2#: We may travel by sea.
#Person1#: It's cheaper, isn't it?
#Person2#: It may be cheaper, but it takes a long time.
#Person1#: I'm sure you will enjoy yourselves.
#Person2#: Don't be so sure. We may not go anywhere. My wife always worries too much. Who's going to look after the dog? Who's going to look after the house? Who's going to look after the garden? We have to solve these things before we can go to travel.
Summary: Harry tells #Person1# they may travel by sea because Harry's wife worries too much.
Inconsistent Spans ( List each span in a new line) : Harry'

In [42]:
np.mean(span_f1_lenient_scores_fewshot)

0.6969707691689115

In [43]:
fewshot_str_source = make_fewshot_str(df_fewshot, instruction_template, prompt_template, source_type = 'Source')
span_f1_lenient_scores_fewshot_afacts, span_f1_strict_scores_fewshot_afacts, gpt_responses_fewshot_afacts = get_fewshot_shot_scores(df, fewshot_str_source, source_type = 'Source')
df['GPTSpan-FS-Afact_f1_len'] = span_f1_lenient_scores_fewshot_afacts
df['GPTSpan-FS-Afact_f1_exact'] = span_f1_strict_scores_fewshot_afacts
df['GPTSpan-FS-Afact_text'] = gpt_responses_fewshot_afacts

Identify and list the inconsistent phrases or words in the summary. Note that consistency means all information in the summary is supported by the Source
Source: Person1 asks Harry about his holiday plans for the year.
Person2 (Harry) states that they may go abroad.
Person2 states that he and his wife haven't decided their holiday plans.
Person2's wife wants to go to Egypt.
Person2 wants to go to Egypt as well.
Person1 asks if they will travel by sea or air.
Person2 states that they may travel by sea.
Person1 suggests that traveling by sea is cheaper.
Person2 states traveling by sea may be cheaper but it takes a long time.
10.
Person1 expresses certainty that Person2 will enjoy the holiday.
Person2 doubts the certainty expressed by Person1.
Person2 states they may not go anywhere.
Person2 says his wife worries a lot.
Person2's wife worries about who will look after the dog, the house, and the garden.
Person2 states they have to solve these concerns before they can go to travel.
Summary

In [44]:
np.mean(span_f1_lenient_scores_fewshot_afacts)

0.6436682615629984

In [45]:
df.keys()

Index(['Unnamed: 0', 'docid', 'model', 'nonfactual_spans', 'evidence',
       'summary', 'factual_error', 'error_type', 'dialogue', 'origin',
       'dialogue_atomic_facts', 'Errors', 'GPTSpan-ZS_f1_len',
       'GPTSpan-ZS_f1_exact', 'GPTSpan-ZS_text', 'GPTSpan-ZS-Afact_f1_len',
       'GPTSpan-ZS-Afact_f1_exact', 'GPTSpan-ZS-Afact-ZS_text',
       'GPTSpan-FS_f1_len', 'GPTSpan-FS_f1_exact', 'GPTSpan-FS_text',
       'GPTSpan-FS-Afact_f1_len', 'GPTSpan-FS-Afact_f1_exact',
       'GPTSpan-FS-Afact_text'],
      dtype='object')

In [46]:
len_zs = np.mean(df['GPTSpan-ZS_f1_len'].values)
len_zs_afact = np.mean(df['GPTSpan-ZS-Afact_f1_len'].values)
len_fs = np.mean(df['GPTSpan-FS_f1_len'].values)
len_fs_afact = np.mean(df['GPTSpan-FS-Afact_f1_exact'].values)
len_zs, len_zs_afact, len_fs,len_fs_afact

(0.6693916609706083,
 0.5058806106174528,
 0.6969707691689115,
 0.49547277284119384)

In [47]:
df.to_csv('/home/ramprasad.sa/factual_evaluation_source_based/datasets/scored/xformer_llm_pred_span_gpt.csv')

In [48]:
df

Unnamed: 0.1,Unnamed: 0,docid,model,nonfactual_spans,evidence,summary,factual_error,error_type,dialogue,origin,...,GPTSpan-ZS_text,GPTSpan-ZS-Afact_f1_len,GPTSpan-ZS-Afact_f1_exact,GPTSpan-ZS-Afact-ZS_text,GPTSpan-FS_f1_len,GPTSpan-FS_f1_exact,GPTSpan-FS_text,GPTSpan-FS-Afact_f1_len,GPTSpan-FS-Afact_f1_exact,GPTSpan-FS-Afact_text
6,6,test_133,CODS,[],[],#Person1# and #Person2# talk about the heavy s...,0,[],"#Person1#: It was a heavy storm last night, wa...",DialogueSum,...,[No inconsistent spans.],0.000000,0.0,[#Person1# and #Person2# talk about the heavy ...,1.000000,1.0,[None],1.000000,1.0,[None]
7,7,test_133,BART,['They '],[],#Person1# and #Person2# talk about the heavy s...,1,['Intrinsic_Error'],"#Person1#: It was a heavy storm last night, wa...",DialogueSum,...,"[""They think the weather is terrible but they ...",0.666667,0.0,[#Person1# and #Person2# talk about the heavy ...,1.000000,0.0,[They think the weather is terrible],1.000000,0.0,[#Person1# and #Person2# talk about the heavy ...
8,8,test_133,MV-BART,"[""it's April "", 'They both ']",[],#Person1# and #Person2# are talking about the ...,1,['Intrinsic_Error'],"#Person1#: It was a heavy storm last night, wa...",DialogueSum,...,"[""They both think the weather is terrible""]",1.000000,0.0,"[""They both think the weather is terrible"", ""t...",0.666667,0.0,[They both think the weather is terrible],0.500000,0.0,[#Person1# and #Person2# are talking about the...
9,9,test_133,UniLM,"['person1 ', 'but ']",[],person1 and person2 talk about the heavy storm...,1,['Intrinsic_Error'],"#Person1#: It was a heavy storm last night, wa...",DialogueSum,...,"[""person1 thinks the weather is terrible .""]",0.666667,0.0,"[""person1 thinks the weather is terrible.""]",0.666667,0.0,[person1 thinks the weather is terrible .],0.666667,0.0,[person1 thinks the weather is terrible]
10,10,test_133,gpt4-32k-0613,[],[],Person1 and Person2 discussed the heavy storm ...,0,[],"#Person1#: It was a heavy storm last night, wa...",DialogueSum,...,"[""Person1 also reminded Person2 that it was Ap...",0.000000,0.0,[Person1 and Person2 discussed the heavy storm...,1.000000,1.0,[None],0.000000,0.0,[Person1 also reminded Person2 that it was Apr...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2007,2007,13681241,BART,[],[],Callum is busy.,0,[],Jair: Still busy?\r\nCallum: Yes a little sorr...,SAMSum,...,[No inconsistent spans.],1.000000,1.0,[No inconsistent spans.],1.000000,1.0,[None],1.000000,1.0,[None]
2008,2008,13681241,MV-BART,[],[],Callum is still busy and sorry.,0,[],Jair: Still busy?\r\nCallum: Yes a little sorr...,SAMSum,...,[No inconsistent spans.],1.000000,1.0,[No inconsistent spans.],1.000000,1.0,[None],1.000000,1.0,[None]
2009,2009,13681241,UniLM,[],[],Callum is still busy .,0,[],Jair: Still busy?\r\nCallum: Yes a little sorr...,SAMSum,...,[No inconsistent spans.],1.000000,1.0,[No inconsistencies found.],1.000000,1.0,[None],1.000000,1.0,[None]
2010,2010,13681241,gpt4-32k-0613,[],[],"Jair checks if Callum is still busy, and Callu...",0,[],Jair: Still busy?\r\nCallum: Yes a little sorr...,SAMSum,...,[No inconsistent spans],1.000000,1.0,[No inconsistent spans.],1.000000,1.0,[None],1.000000,1.0,[None]
