In [13]:
import pandas as pd
import json
import os
import time
import requests
from tqdm import tqdm
import itertools


def get_response_llama3(prompt, temperature=0.67, max_tokens=700, timeout=300):
    """Send one user message to Llama-3-70B-chat on Together and return the reply text.

    Args:
        prompt: Text sent as the single ``user`` message.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Maximum number of tokens to generate, forwarded to the API.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The ``content`` string of the first choice in the JSON response.

    Raises:
        RuntimeError: If the ``TOGETHER_API_KEY`` environment variable is not set.
        requests.HTTPError: If the API answers with a 4xx/5xx status code.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    # Read the credential from the environment so it never lives in the notebook.
    api_key = os.environ.get('TOGETHER_API_KEY')
    if not api_key:
        raise RuntimeError(
            'Set the TOGETHER_API_KEY environment variable before calling get_response_llama3.'
        )

    endpoint = 'https://api.together.xyz/v1/chat/completions'
    payload = {
        "model": "meta-llama/Llama-3-70b-chat-hf",
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": 0.7,
        "top_k": 50,
        "repetition_penalty": 1,
        "stop": [
            "<|eot_id|>"
        ],
        "messages": [
            {
                "content": prompt,
                "role": "user"
            }
        ]
    }
    res = requests.post(
        endpoint,
        json=payload,
        headers={"Authorization": 'Bearer ' + api_key},
        # Without a timeout, requests can block forever on a stalled connection.
        timeout=timeout,
    )
    # Fail with the real HTTP error (401, 429, 5xx, ...) instead of a KeyError
    # on 'choices' further down.
    res.raise_for_status()

    return res.json()['choices'][0]['message']['content']


def load_jsonl(path):
    """Read a JSON Lines file into a list of decoded objects.

    Args:
        path: Location of a UTF-8 encoded file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as reader:
        # Blank lines (e.g. a trailing newline added by an editor) carry no
        # record, so skip them instead of letting json.loads fail on them.
        return [json.loads(line) for line in reader if line.strip()]

def save_jsonl(name, data):
    """Write an iterable of JSON-serialisable entries as a JSON Lines file.

    Any existing file at ``name`` is overwritten. Missing parent directories
    are created so a long-running job does not fail on its first save.

    Args:
        name: Destination file path.
        data: Iterable of objects accepted by ``json.dumps``; one line is
            written per entry.
    """
    parent_dir = os.path.dirname(name)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit utf-8 to mirror load_jsonl, which reads with the same encoding.
    with open(name, 'w', encoding='utf-8') as outfile:
        for entry in data:
            outfile.write(json.dumps(entry) + '\n')
            

In [2]:
# Stimulus identifiers, kept as two named groups. The combined list places the
# `sd_stimuli` entries first, followed by the `vs_stimuli` entries.
sd_stimuli = ['AuntMother', 'Ferguson', 'Sept11', 'NoHandbook']
vs_stimuli = ['MarcusYam', 'SylviaEarle', 'NaomiDeLaRosa', 'RobinSteinberg']
list_of_stimuli = [*sd_stimuli, *vs_stimuli]

In [3]:
# Load every zero-shot prompt template into prompt_dict, keyed by the file name
# up to its first '.'.
# sorted() makes the iteration order (and therefore the order of prompt_dict)
# reproducible; os.listdir returns entries in arbitrary order.
prompt_filenames = sorted(os.listdir('./Prompts/zero_shot/'))
prompt_dict = {}
for p in prompt_filenames:
    # Match on the extension: a substring test would also accept names such as
    # 'foo.txt.bak' or editor swap files.
    if not p.endswith('.txt'):
        continue
    # Explicit encoding so decoding does not depend on the platform default;
    # assumes the templates are UTF-8 — TODO confirm.
    with open('./Prompts/zero_shot/'+p, 'r', encoding='utf-8') as f:
        prompt = f.read()

    prompt_dict[p.split('.')[0]] = prompt

In [6]:
# Read the original story text for each stimulus into a dict keyed by stimulus name.
original_narrative_dict = {}
for stimulus in list_of_stimuli:
    # Explicit encoding so decoding does not depend on the platform default;
    # assumes the story files are UTF-8 — TODO confirm.
    with open('../data/BATS/originalstories/'+stimulus+'.txt', 'r', encoding='utf-8') as f:
        narrative = f.read()
    original_narrative_dict[stimulus] = narrative

## Generate main concepts (MCs)

In [None]:
save_dir = './Responses/zero_shot/llama3/original_narrative_input/monte-carlo-same-temp/'
temperature = 0.67

# Create the output directory up front so the first checkpoint write cannot
# fail after an API call has already been made.
os.makedirs(save_dir, exist_ok=True)

# Five samples per prompt template, all drawn at the same temperature; each
# (prompt, sample) pair gets its own .jsonl file with one record per stimulus.
for sample_num in range(5):
    for prompt_name, template in prompt_dict.items():
        out_path = save_dir+prompt_name+'_temp_'+str(temperature)+'_'+str(sample_num)+'.jsonl'

        save_data = []
        # Wrap the list (not the enumerate iterator) so tqdm knows the total.
        for idx, stimulus in enumerate(tqdm(list_of_stimuli)):

            original_narrative = original_narrative_dict[stimulus]
            prompt = template.format(original_narrative)

            metadata = {}
            metadata['Prompt'] = prompt
            metadata['Stimuli'] = stimulus
            metadata['original_narrative'] = original_narrative
            metadata['row'] = idx

            response = get_response_llama3(prompt, temperature=temperature, max_tokens=700)
            metadata['response'] = response
            save_data.append(metadata)
            # Checkpoint after every stimulus so an interrupted run keeps what
            # it has already generated; the last iteration writes the
            # complete file, so no extra save is needed after the loop.
            save_jsonl(out_path, save_data)
            # Pause between consecutive API requests.
            time.sleep(2)

## Decompose main concepts (MCs)

In [None]:
import json
import time
import os, re
import requests
import numpy as np

# Together chat-completions model identifier and URL.
# NOTE(review): the same two values are hard-coded inside get_response_llama3,
# and nothing in the visible cells reads these module-level names.
model = "meta-llama/Llama-3-70b-chat-hf"
endpoint = 'https://api.together.xyz/v1/chat/completions'

def construct_clean_up_prompt(text):
    """Build the prompt that asks the model to strip formatting from ``text``.

    Args:
        text: Raw model response to be reformatted as a plain list.

    Returns:
        The fixed clean-up instruction, followed by a ``Text:`` separator and
        ``text`` itself.
    """
    header = '''Please format the given text into plain text format, without any formatting, numbers or headings (e.g., 'Tragic Event', "Thumb") or introductory text (e.g., Here is the list). Do not reword the text. Only return a list and nothing else (e.g., phrases like 'Here is the list of main concepts in plain text format:').'''
    return header + '\n\nText: \n\n' + text
            
            
# Template used to split one main concept into atomic statements.
# Explicit encoding so decoding does not depend on the platform default;
# assumes the template is UTF-8 — TODO confirm.
with open('./Prompts/decomposition_decontextualization.txt', 'r', encoding='utf-8') as f:
    decomposition_prompt = f.read()


input_dir = './Responses/zero_shot/llama3/original_narrative_input/monte-carlo-same-temp/'
response_files = os.listdir(input_dir)
response_files = [r for r in response_files if '.jsonl' in r]

# For every saved response: (1) ask the model to clean the raw response into a
# plain list, (2) decompose each list item, (3) store the result under
# 'decomposed_mcs' and rewrite the file so the job can be resumed.
for r in tqdm(response_files):
    data = load_jsonl(input_dir+r)

    # An empty file has nothing to decompose (and data[-1] would raise
    # IndexError); a file whose last record is already decomposed was fully
    # processed by an earlier run.
    if not data or 'decomposed_mcs' in data[-1]:
        continue

    for d in tqdm(data):
        # Resume support: skip records finished by a previous, interrupted run.
        if 'decomposed_mcs' in d:
            continue
        raw_response = d['response']

        mc_list = get_response_llama3(construct_clean_up_prompt(raw_response), temperature=0.0, max_tokens=1000).split('\n')
        time.sleep(2)

        decomposed_mcs = []
        for m in mc_list:
            # Skip blank and whitespace-only lines instead of spending an API
            # call (and a 10 s pause) on them.
            if not m.strip():
                continue
            atomic_mcs = get_response_llama3(decomposition_prompt.format(m), temperature=0.0, max_tokens=512).split('\n')
            # Trim surrounding whitespace first so an indented '- item' bullet
            # also loses its hyphen, then drop lines that end up empty: an empty
            # entry would later be counted as a zero-word concept.
            atomic_mcs = [a.strip().strip('-').strip() for a in atomic_mcs]
            atomic_mcs = [a for a in atomic_mcs if a]
            decomposed_mcs += atomic_mcs
            time.sleep(10)

        d['decomposed_mcs'] = decomposed_mcs
        # Checkpoint after each record by rewriting the whole file in place.
        save_jsonl(input_dir+r, data)

        time.sleep(40)

In [15]:
# Sanity check: list the keys of the first record in the first response file.
input_dir = './Responses/zero_shot/llama3/original_narrative_input/monte-carlo-same-temp/'
response_files = [r for r in os.listdir(input_dir) if '.jsonl' in r]

first_response_file = response_files[0]
data = load_jsonl(input_dir+first_response_file)

data[0].keys()

dict_keys(['Prompt', 'Stimuli', 'original_narrative', 'row', 'response', 'decomposed_mcs'])

### Analysis of decomposed main concepts (MCs)

In [32]:
import numpy as np

def get_complexity(bullet_points: list) -> float:
    '''
    Complexity: the average number of words per bullet point.

    Args:
      bullet_points: list of strings, each string is a bullet point/main concept.

    Returns:
      The mean whitespace-separated word count per bullet point, rounded to
      two decimals, or NaN when `bullet_points` is empty.
    '''

    # With no bullet points the average is undefined. Return NaN instead of
    # raising ZeroDivisionError so one empty record cannot abort a whole
    # analysis loop.
    if len(bullet_points) == 0:
        return float('nan')

    total_words = sum(len(sent.split()) for sent in bullet_points)  # split by whitespace

    return np.round(total_words / len(bullet_points), 2)

In [33]:
# Score every saved response: one row per (response file, record) holding the
# average words-per-concept of that record's decomposed main concepts.
complexity_rows = []
for r in tqdm(response_files):
    data = load_jsonl('./Responses/zero_shot/llama3/original_narrative_input/monte-carlo-same-temp/'+r)
    # NOTE(review): split('.')[0] cuts at the FIRST dot, so for a name such as
    # 'x_temp_0.67_0.jsonl' the label is 'x_temp_0', not the full file stem —
    # confirm this is intended.
    prompt_label = r.split('.')[0]
    for d in data:
        decomposed_mcs = d['decomposed_mcs']
        stimuli = d['Stimuli']
        complexity = get_complexity(decomposed_mcs)
        # Stored on the in-memory record only; this cell does not write it back.
        d['decomposed_mc_complexity'] = complexity
        complexity_rows.append(dict(Stimuli=stimuli, prompt=prompt_label, complexity=complexity))

complexity_data = pd.DataFrame(complexity_rows)

100%|████████████████████████████████████████████████████████████████████████████| 35/35 [00:00<00:00, 1151.20it/s]


In [34]:
# Report, per stimulus, the mean complexity over all rows of complexity_data.
print('======Complexity of generated MCs======')
for s in list_of_stimuli:
    m = complexity_data.loc[complexity_data['Stimuli'] == s, 'complexity'].mean()
    print(s, np.round(m, 2))

AuntMother 7.75
Ferguson 9.3
Sept11 8.48
NoHandbook 9.19
MarcusYam 8.82
SylviaEarle 9.03
NaomiDeLaRosa 7.77
RobinSteinberg 8.93


In [35]:
def get_main_concepts(stimuli_name: str) -> dict:
    '''
    Load the gold main concepts of one stimulus.

    Args:
      stimuli_name: stimulus identifier; the file read is
        '../data/BATS/goldMCs/<stimuli_name>.json'.

    Returns:
      The decoded JSON content of that file.
    '''
    # JSON text is UTF-8; state it explicitly so decoding does not depend on
    # the platform's default encoding.
    with open('../data/BATS/goldMCs/' + stimuli_name + '.json', 'r', encoding='utf-8') as f:
        return json.load(f)

# Report the complexity of the gold main concepts, one line per stimulus.
print('======Complexity of gold MCs======')
for s in list_of_stimuli:
    gold_concepts = get_main_concepts(s)
    print(s, get_complexity(list(gold_concepts.values())))

AuntMother 8.71
Ferguson 8.8
Sept11 7.0
NoHandbook 8.64
MarcusYam 7.18
SylviaEarle 8.75
NaomiDeLaRosa 8.12
RobinSteinberg 7.57
