In [34]:
import json
import string
import difflib
import pandas as pd
from tqdm import tqdm
from scripts.utils import read_jsonl
import argparse
import nltk
from nltk.corpus import stopwords

# Download stopwords list if not already downloaded
nltk.download('stopwords')
# Get the English stopwords list from NLTK
stop_words = set(stopwords.words('english'))

def postprocess(txt):
    txt = [w.strip(string.punctuation) for w in txt.split(' ')]
    txt = ' '.join(txt)
    return txt


def find_edited_spans(s1, s2):
    
    s1 = s1.strip(string.punctuation)
    s2 = s2.strip(string.punctuation)
    differ = difflib.Differ()
    diff = list(differ.compare(s1.split(), s2.split()))

    edited_spans = []
    current_span = ""
    printed = False
    for item in diff:
        code, word = item[0], item[2:]
        
        if code == ' ':
            if current_span:
                # print(current_span)
                # if not len([w for w in current_span.split() if w not in stop_words]):
                #     print(current_span)
                    # printed = True
                if len([w for w in current_span.split() if w not in stop_words]):
                    edited_spans.append(current_span.strip())
                current_span = ""
        elif code == '-':
            current_span += word + " "

    
    if current_span:
        if len([w for w in current_span.split() if w not in stop_words]):
            edited_spans.append(current_span.strip())
    
    return edited_spans



def get_nonfactual_spans(before_summary_sents, after_summary_sents):
    before_summary_sents = postprocess(before_summary_sents)
    after_summary_sents = postprocess(after_summary_sents)
    
    nonfactual_spans_processed = find_edited_spans(before_summary_sents, after_summary_sents)
    # if before_summary_sents != after_summary_sents:
    # if True:
        # print(before_summary_sents)
        # print(after_summary_sents)
        # print(nonfactual_spans_processed)
        # print('**'* 13)
    return nonfactual_spans_processed



def get_summary_sentences(cand_keys, summid):
    source = []
    summary = []
    annotated_spans = []
#     print(cand_keys, len(cand_keys))
    for sent_num in range(0, len(cand_keys)):
        sent_id = f'{summid}:{sent_num}'
        if sent_id in cand_keys:
            
            sent_ann = cand_keys[sent_id]
            if not source:
                source = sent_ann['input_lines']
        
            for summline in sent_ann['prev_summ_lines'] + [sent_ann['before_summary_sent']]:
                        if summline not in summary:
                            summary.append(summline)

            sent_annotated_spans = get_nonfactual_spans(sent_ann['before_summary_sent'],
                                                               sent_ann['after_summary_sent'])
            annotated_spans += sent_annotated_spans
                
    return source, summary, annotated_spans


    # df_dict.to_csv(args.write_path)
# return df_dict

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ramprasad.sa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [35]:
read_path = '/home/ramprasad.sa/probing_summarization_factuality/datasets/annotations/genaudit_data_final_2feb.jsonl'
data = read_jsonl(read_path)

df_dict = {
        'id': [],
        'source': [],
        'summary': [],
        'annotated_spans': [],
        'model': [],
        'origin': []
}

for dat in data:
            did = dat['id']
            # if 'Rachel Usher#XSUM-34814828:llama7b-ul2' in did:
            sentidx = did.split(':')[-1]
            summid = ':'.join(did.split(':')[:-1])
            model = summid.split(':')[-1].split('-')[0]
            origin = summid.split('#')[1].split('-')[0]
            if summid not in df_dict['id']:
                cand_keys = {dat['id'] : dat for dat in data if summid in dat['id']}
                # print(cand_keys)
                source, summary, annotated_spans = get_summary_sentences(cand_keys, summid)

                df_dict['id'].append(summid)
                df_dict['source'].append('. '.join(source))
                df_dict['summary'].append('. '.join(summary))
                df_dict['annotated_spans'].append('<sep>'.join(annotated_spans))
                df_dict['model'].append(model)
                df_dict['origin'].append(origin)
#             break
df_dict = pd.DataFrame(df_dict)

after 
was in 
after 
as 
no 
has 
and 
to 
to 
an 
the 
it 
with 
or 
while 
that 
is 
of 
the 
up 
a 
and 
some 
any 
an 
was 
own 
i 
they 
i don't 
to 
of 
with 
in the 
at 
their 
in 
from 
and 
him 
him 
and 
and 
they 
but 
it 
is 
is 
as 
during 
because of 
are 
with 
by 
but 
he 
on 
were 
in 
a 
do 
on how 
some 
her 
and he 
and 
but 
then 
for 
for 
of the 
this has been 
we 
she 
a 
and 
and 
at 
had to have 
with 


In [36]:
dat

{'id': 'Rachel Usher#XSUM-39387550:chatgpt-ul2:5',
 'input_lines': ['US President Donald Trump has withdrawn his healthcare bill after it failed to gain enough support to pass in Congress.',
  'House Speaker Paul Ryan said he and Mr Trump agreed to pull the vote, after it became apparent it would not get the minimum of 215 Republican votes needed.',
  'The last minute move was seen as a huge blow to Mr Trump.',
  'Repealing and replacing the programme known as Obamacare was one of his major election pledges.',
  'Earlier on Friday, White House press secretary Sean Spicer said that the vote would go ahead at 15:30 (19:30 GMT).',
  "Mr Trump had reportedly warned Republicans that if they did not vote for his bill then they would be stuck with Barack Obama's healthcare programme for good.",
  'However, multiple reports suggested that between 28 and 35 Republicans were opposed to his draft American Health Care Act.',
  'The vote was withdrawn shortly after 15:30, and the House is now in re

In [4]:
df_dict[df_dict['id'] == 'Rachel Usher#XSUM-34814828:llama7b-ul2']

Unnamed: 0,id,source,summary,annotated_spans,model,origin
600,Rachel Usher#XSUM-34814828:llama7b-ul2,Surely some of the first rules of wooing are: ...,The European Union is struggling to cope with ...,proposed<sep>this has been<sep>slowing,llama7b,XSUM
