In [15]:
import pandas as pd
import json
import os
import time
import requests
from tqdm import tqdm
import json
import numpy as np
from nltk.tokenize import sent_tokenize
import tiktoken
# Module-level tiktoken encoding looked up for the "gpt-4" model.
# NOTE(review): presumably this is the `encoding` argument handed to
# num_tokens_from_string -- no call site is visible here, confirm against callers.
encoding = tiktoken.encoding_for_model("gpt-4")

def num_tokens_from_string(string, encoding):
    """Count the tokens that `encoding` produces for `string`.

    Args:
      string: text to tokenize.
      encoding: object exposing an `encode(str)` method whose result has a length.

    Returns:
      The length of `encoding.encode(string)`.
    """
    return len(encoding.encode(string))

def load_jsonl(path):
    """Read a JSON Lines file into a list of decoded objects.

    Lines that are empty or contain only whitespace (for example a trailing
    newline left by an editor) are skipped instead of being handed to
    `json.loads`, which would raise on them.

    Args:
      path: location of the UTF-8 encoded .jsonl file.

    Returns:
      A list with one decoded JSON value per non-blank line, in file order.

    Raises:
      json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as reader:
        return [json.loads(line) for line in reader if line.strip()]

def save_jsonl(name, data):
    """Write `data` to `name` as JSON Lines, replacing the file atomically.

    The rows are first written to a sibling `<name>.tmp` file and then moved
    over `name` with `os.replace`. An interruption in the middle of a write
    therefore leaves the previous version of `name` intact instead of a
    truncated file.

    Args:
      name: destination path of the .jsonl file.
      data: iterable of JSON-serialisable entries, written one per line.

    Raises:
      TypeError: if an entry is not JSON-serialisable (the previous file, if
        any, is left untouched).
    """
    tmp_name = f"{name}.tmp"
    try:
        # Explicit UTF-8 mirrors the encoding load_jsonl reads with.
        with open(tmp_name, 'w', encoding='utf-8') as outfile:
            for entry in data:
                json.dump(entry, outfile)
                outfile.write('\n')
        os.replace(tmp_name, name)
    except BaseException:
        # Do not leave a half-written temp file behind on failure/interrupt.
        if os.path.exists(tmp_name):
            os.remove(tmp_name)
        raise

def get_yield(sentences):
    """Return the total number of whitespace-separated words across `sentences`.

    Args:
      sentences: iterable of strings.

    Returns:
      Sum of `len(s.split())` over all strings; 0 for an empty iterable.
    """
    return sum(len(sentence.split()) for sentence in sentences)

def get_complexity(bullet_points: list) -> float:
    '''
    Complexity: the average number of words per bullet point, rounded to 2 decimals.

    Args:
      bullet_points: list of strings, each string is a bullet point/main concept
        (e.g. the sentences of a summary).

    Returns:
      Mean whitespace-separated word count per bullet point, or 0.0 when
      `bullet_points` is empty (there is nothing to average, and dividing by
      zero would otherwise raise).
    '''
    if not bullet_points:
        return 0.0

    total_words = sum(len(sent.split()) for sent in bullet_points)  # split by whitespace
    return np.round(total_words / len(bullet_points), 2)

def get_response_llama3(prompt, temperature=0.67, max_tokens=700, timeout=300):
    """Send a single-turn chat request to Llama-3-70B on Together and return the reply text.

    Args:
      prompt: content of the single user message.
      temperature: sampling temperature forwarded to the API.
      max_tokens: generation cap forwarded to the API.
      timeout: seconds to wait for the HTTP call before giving up, so a
        stalled connection cannot hang a long batch run forever.

    Returns:
      The `content` string of the first choice in the response.

    Raises:
      RuntimeError: if the TOGETHER_API_KEY environment variable is not set.
      requests.HTTPError: if the API answers with an error status
        (e.g. authentication failure or rate limiting).
    """
    # Read the credential from the environment instead of embedding it in the notebook.
    api_key = os.environ.get('TOGETHER_API_KEY')
    if not api_key:
        raise RuntimeError('Set the TOGETHER_API_KEY environment variable before calling the Together API.')

    endpoint = 'https://api.together.xyz/v1/chat/completions'
    res = requests.post(endpoint, json={
        "model": "meta-llama/Llama-3-70b-chat-hf",
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": 0.7,
        "top_k": 50,
        "repetition_penalty": 1,
        "stop": [
            "<|eot_id|>"
        ],
        "messages": [
            {
                "content": prompt,
                "role": "user"
            }
        ]
    }, headers={
        "Authorization": f"Bearer {api_key}",
    }, timeout=timeout)

    # Surface HTTP failures directly rather than as a KeyError on 'choices' below.
    res.raise_for_status()

    return res.json()['choices'][0]['message']['content']

In [16]:
# Load every zero-shot prompt template into prompt_dict, keyed by the part of
# the file name before the first '.'.
prompt_dir = './Prompts/zero_shot/'

# Sorted so the prompts are always processed in the same order; os.listdir
# returns entries in arbitrary, filesystem-dependent order.
prompt_filenames = sorted(os.listdir(prompt_dir))
prompt_dict = {}
for p in prompt_filenames:
    # Skip notebook checkpoints and other hidden entries (e.g. OS metadata files).
    if p.startswith('.') or 'ipynb_checkpoints' in p: continue

    prompt_path = os.path.join(prompt_dir, p)
    # Sub-directories cannot be opened as prompt files.
    if not os.path.isfile(prompt_path): continue

    with open(prompt_path, 'r', encoding='utf-8') as f:
        prompt_dict[p.split('.')[0]] = f.read()

In [17]:
data = load_jsonl('../data/narrasum/narrasum_sampled_data.jsonl')

# Build a row-index -> record mapping holding the raw texts plus
# sentence-level statistics of the reference summary and the source document.
dataset = {}
for idx, record in enumerate(data):
    summary_sents = sent_tokenize(record['summary'])
    dataset[idx] = dict(
        summary=record['summary'],
        document=record['document'],
        summary_sentences=summary_sents,
        num_summary_sentences=len(summary_sents),
        complexity_summary_sentences=get_complexity(summary_sents),
        yield_summary_sentences=get_yield(summary_sents),
        yield_document_sentences=get_yield(sent_tokenize(record['document'])),
    )

In [None]:
save_dir = './Responses/narrasum/zero_shot/llama3/original_narrative_input/monte-carlo-same-temp/'
temperature = 0.67

# open(..., 'w') does not create missing parent directories, so make sure the
# output directory exists before the first checkpoint is written.
os.makedirs(save_dir, exist_ok=True)

# One output file per (prompt, sample_num) pair; every sample uses the same temperature.
for sample_num in [0,1,2,3,4]:
    for prompt_name, template in prompt_dict.items():
        save_file_name = save_dir+prompt_name+'_temp_'+str(temperature)+'_'+str(sample_num)+'.jsonl'

        # Resume support: reuse rows already on disk and continue after the
        # highest row index that was saved.
        save_data = []
        start_idx = 0

        if os.path.exists(save_file_name):
            save_data = load_jsonl(save_file_name)
            if len(save_data) == len(dataset):  # If complete, skip to next file
                print(f"Skipping completed file: {save_file_name}")
                continue

            if save_data:
                last_processed = max(d['row'] for d in save_data)
                start_idx = last_processed + 1
                print(f"Resuming from index {start_idx} for {save_file_name}")

        for idx in tqdm(range(start_idx, len(dataset)), initial=start_idx, total=len(dataset)):
            original_narrative = dataset[idx]['document']
            prompt = template.format(original_narrative)

            metadata = {
                'Prompt': prompt,
                'original_narrative': original_narrative,
                'row': idx,
            }
            metadata['response'] = get_response_llama3(prompt, temperature=temperature, max_tokens=2048)
            save_data.append(metadata)

            # Checkpoint after every response so an interrupted run can resume.
            save_jsonl(save_file_name, save_data)

            # Pause between requests, with one longer pause after row 50.
            time.sleep(2)
            if idx==50:
                time.sleep(10)

        # Also writes the file when the inner loop had nothing to do.
        save_jsonl(save_file_name, save_data)

### Decompose generated MCs (main concepts)

In [None]:
import json
import time
import os, re
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
import itertools

# Same endpoint URL and model id that get_response_llama3 hard-codes in its own body.
# NOTE(review): neither name is referenced in the cells visible here -- confirm
# they are still needed.
endpoint = 'https://api.together.xyz/v1/chat/completions'
model = "meta-llama/Llama-3-70b-chat-hf" 


def construct_clean_up_prompt(text):
    """Build the prompt that asks the model to reduce `text` to a plain list.

    The fixed instruction is followed by a blank line, the marker 'Text: ',
    another blank line, and finally `text` verbatim.

    Args:
      text: the raw model response to be cleaned up.

    Returns:
      The complete prompt string.
    """
    header = '''Please format the given text into plain text format, without any formatting, numbers or headings (e.g., 'Tragic Event', "Thumb") or introductory text (e.g., Here is the list). Do not reword the text. Only return a list and nothing else (e.g., phrases like 'Here is the list of main concepts in plain text format:').'''
    separator = '\n\nText: \n\n'
    return header + separator + text
            
            
with open('./Prompts/decomposition_decontextualization.txt', 'r', encoding='utf-8') as f:
    decomposition_prompt = f.read()


### Run decomposer for generated MCs ####
input_dir = './Responses/narrasum/zero_shot/llama3/original_narrative_input/monte-carlo-same-temp/'
# Only real .jsonl files, in a stable order (a substring test would also pick
# up names such as 'x.jsonl.bak').
response_files = sorted(r for r in os.listdir(input_dir) if r.endswith('.jsonl'))


for r in tqdm(response_files):
    responses = load_jsonl(input_dir+r)

    # Nothing to do for an empty file. Rows are processed in order, so a file
    # whose last row already carries 'decomposed_mcs' is fully processed.
    if not responses or 'decomposed_mcs' in responses[-1]:
        continue

    for d in tqdm(responses):
        if 'decomposed_mcs' in d:
            continue
        raw_response = d['response']

        # Step 1: have the model strip headings/numbering so each line is one main concept.
        mc_list = get_response_llama3(construct_clean_up_prompt(raw_response), temperature=0, max_tokens=1000).split('\n')
        time.sleep(2)

        # Step 2: decompose each main concept into atomic statements.
        decomposed_mcs = []
        for m in mc_list:
            if not m.strip(): continue
            # 512 is the generation cap. It must be passed by keyword: the second
            # positional parameter of get_response_llama3 is `temperature`.
            atomic_mcs = get_response_llama3(decomposition_prompt.format(m), max_tokens=512).split('\n')
            # Drop leading/trailing dashes and spaces, and discard blank lines
            # so they are not stored as empty concepts.
            atomic_mcs = [a.strip('-').strip(' ') for a in atomic_mcs]
            decomposed_mcs += [a for a in atomic_mcs if a]
            time.sleep(10)

        d['decomposed_mcs'] = decomposed_mcs
        # Checkpoint after every row; rows that already have 'decomposed_mcs'
        # are skipped when the cell is re-run.
        save_jsonl(input_dir+r, responses)

        time.sleep(40)

### Decomposer for gold MCs (this step is needed as the original dataset has free text summaries)

In [None]:

from nltk.tokenize import sent_tokenize

data = load_jsonl('../data/narrasum/narrasum_sampled_data.jsonl')

# NOTE(review): `get_response_gpt4` and `max_tokens` are not defined in the
# cells visible here -- confirm both exist before this cell is run.
for d in tqdm(data):
    # The gold summary is free text, so split it into sentences first and
    # decompose each sentence separately.
    sentences = sent_tokenize(d['summary'])

    decomposed_mcs = []
    for m in sentences:
        if not m.strip(): continue
        atomic_mcs = get_response_gpt4(decomposition_prompt.format(m), max_tokens).split('\n')

        # Drop leading/trailing dashes and spaces, and discard blank lines so
        # they are not stored as empty concepts.
        atomic_mcs = [a.strip('-').strip(' ') for a in atomic_mcs]
        decomposed_mcs += [a for a in atomic_mcs if a]
        time.sleep(1)

    d['decomposed_summary_sentences_gpt4'] = decomposed_mcs

# Written once, after every summary has been decomposed.
save_jsonl('../data/narrasum/narrasum_sampled_data_with_decomposed_mcs_gpt4.jsonl', data)