In [11]:
import utils_llama as ulla
from openai import OpenAI
from tqdm import tqdm
import pandas as pd
import pickle
import random
import re

## Error analysis

I put all output sentences in a .txt file, went through all mistakes and classified them either Propbank, polarity, deictic gestures, gesture information, AMR misinterpretation, incoherent or other. First I will count the results, then I will refine the prompt to hopefully improve the output.

In [83]:
total_sentences = 0
with open("error_analysis_annotated.txt", "r", encoding="utf-8") as file:
    for line in file:
        line = line.strip().lower()
        if not line:
            continue
        total_sentences += 1
print(total_sentences)

573


In [23]:
deictic, propbank, polarity, gesture, amr, incoherent, other, no_mistake, total = ulla.count_error_types("error_analysis_annotated.txt")

print(f"Deictic: {deictic}")
print(f"Propbank: {propbank}")
print(f"Polarity: {polarity}")
print(f"Gesture Information: {gesture}")
print(f"AMR Misinterpretation: {amr}")
print(f"Incoherent: {incoherent}")
print(f"Other: {other}")
print(f"No mistake: {no_mistake}")
print(f"Total number of mistakes: {total}")

Deictic: 46
Propbank: 25
Polarity: 23
Gesture Information: 231
AMR Misinterpretation: 176
Incoherent: 44
Other: 119
No mistake: 41
Total number of mistakes: 664


## Prompt refinement
Load the results from the Llama run. From these results I picked out 12 sentences to prompt Llama, this time with a refined prompt.

In [13]:
results_llama = 'last_results_llama.pkl'
results = pickle.load(open(results_llama, 'rb'))

In [108]:
ref_transl_expl = []
for item in results:
    for key in ["llama_1", "llama_2", "llama_3"]:
        if item['sentence'] == "good enough":
            print(item[key])
        #print(item[key])
        try:
            sentences = re.findall(r'"sentence"\s*:\s*"([^"]+)"', item[key])
            explanations = re.findall(r'"explanation"\s*:\s*"([^"]+)"', item[key])
            amrs = re.findall(r'"speech AMR"\s*:\s*"([^"]+)"', item[key])
            # print(sentences)
            # print(explanations)
            for sent, expl, amr in zip(sentences, explanations, amrs):
                # if item['sentence'] == "doesn't matter":
                #     print(sent)
                #print(sent)
                ref_transl_expl.append({
                    "reference": item["sentence"],
                    "scenario": item['scenario'],
                    "llama_run": key,
                    "translation": sent.lower(),
                    "explanation": expl
                })
                
        except (json.JSONDecodeError, TypeError) as e:
            print(f"Failed to parse {key} {item[key]}\nbecause: {e}.")

[{"sentence": "It is good enough.", "explanation": "The speech AMR indicates that the predicate is 'enough' and it has an argument 'good'. This suggests a comparison or evaluation, where something is deemed sufficient or satisfactory. The sentence generated is a common idiomatic expression in English to convey this meaning."}]
[{"sentence": "It is good enough.", "explanation": "The speech AMR represents a sentence with the predicate 'enough' in the sense of 'good enough'. The ARG2 slot indicates that the object of the sentence is 'good', which is a complement to the adjective 'enough'. Therefore, the generated sentence is 'It is good enough.'"}]
[{"sentence": "It is good enough.", "explanation": "The speech AMR indicates that the predicate 'enough' is being used, which means something meets a certain standard. The ARG2 slot specifies the argument for this predicate, which in this case is 'good'. This suggests that the subject (not explicitly mentioned) has reached or surpassed an accep

In [75]:
print(len(ref_transl_expl))

198


In [30]:
## Load the test and train set I created
example_dataframe = pd.read_pickle("example_df.pkl")
test_df = pd.read_pickle('test_df.pkl')

## Create smaller dataframe of specific sentence I want to further explore
target_sentences = [
    "so put put a block on the back block good",                 ## speech amr and gesture labels (didnt perform well)
    "no that doesn't look like it’s going to work",              ## polarity (didnt perform well)
    "and the one on the very end",                               ## gesture amr isnt great (speech performed well, gesture not) 
    "push them up a little bit",                                 ## gesture amr and labels (speech performed perfect, gesture not)
    "and then you got two towers like that on the sides",        ## 
    "and then you do that same process all the way up",          ## gesture info is nice however llama doesnt seem to use the amr
    "two more blocks up",                                        ## gesture amr and labels are representative 
    "and uh one block uh ahead of it",
    "move those together a little bit closer and then put that on top yay",    ## gesture info
    "just like that but closer together",                        ## gesture info could be useful, speech and gesture dont perform well
    "go near yeah",                                              ## gesture info useful, speech performed well
    "can you just open it open it a little bit uh"               ## gesture info useful, speech performed well
]
df_subset = test_df[test_df['sentence'].isin(target_sentences)].copy()
print(df_subset)

        file                                           sentence  \
0   p15_gold          so put put a block on the back block good   
0   p15_gold          so put put a block on the back block good   
0   p15_gold          so put put a block on the back block good   
4   p15_gold       no that doesn't look like it’s going to work   
4   p15_gold       no that doesn't look like it’s going to work   
4   p15_gold       no that doesn't look like it’s going to work   
18  p18_gold                 just like that but closer together   
18  p18_gold                 just like that but closer together   
18  p18_gold                 just like that but closer together   
20  p18_gold  move those together a little bit closer and th...   
20  p18_gold  move those together a little bit closer and th...   
20  p18_gold  move those together a little bit closer and th...   
22  p19_gold                                 two more blocks up   
22  p19_gold                                 two more blocks u

In [32]:
# Turn example df into a list of dicts
examples = []

for idx, row in example_dataframe.iterrows():
    examples.append({
        "sentence": row.get("sentence"),
        "speech_amr": row.get("speech_amr"),
        "gesture_amrs": row.get("gesture_amrs"),
        "gesture_labels": row.get("gesture_labels"),
        "num_gesture_amrs": row.get("num_gesture_amrs"),
    })

for item in examples:
    print(item['sentence'])

okay
space two out a little less than a block length
four blocks in the front okay
it cannot fall down from the ground right
five blocks to start
now these are a little jiggled
you're done
put a bit a bit right uh
it starts in the top left
same direction that that's pushed off yea alright
it's gonna be a pyramid from three of the rows of two
then they connect to the two wider blocks
okay
get another one stacked on top
move that block uh
they’re in other words are not perfectly clear
them towards you
then you going to have a fourth block
just one
and go
the base is going to have four second one’s going to have three then two on top of that and then one
start off with just a block and then put a block on top of
stack stack it up no
stack three blocks on one side yup
like about a third of a block a part so more close in than that
four blocks
great yep
seven blocks in a row
and then three more coming off from the other direction touching the corner okay
and then the second block goes on to

In [22]:
# Point to the server
client = OpenAI(base_url="http://localhost:8000/v1", api_key="cltl")

In [23]:
random.seed(12)

## First error analysis run
results_ea = []
prompts = ulla.generate_prompt_error_anaysis(df_subset, examples)
total_prompts = len(prompts)
for prompt in tqdm(prompts, total=total_prompts, desc="Processing", unit="row"):
    call_llama = ulla.query_LLM_multiple(client, prompt["prompt"], temp=0.3, n=3)
    results_ea.append({
        "prompt": prompt["prompt"],
        "sentence": prompt["meta"]["sentence"],
        "scenario": prompt["meta"]["scenario"],
        "file": prompt["meta"]["file"],
        "llama_1": call_llama[0],
        "llama_2": call_llama[1],
        "llama_3": call_llama[2]
    })

Processing: 100%|██████████| 36/36 [1:20:19<00:00, 133.88s/row]


In [163]:
## Save results to file each run since I'm restarting the kernel
file_ea1 = "first_ea.pkl"
pickle.dump(results_ea, open(file_ea1, 'wb'))

In [34]:
with open("first_ea.pkl", "rb") as f:
    first_ea = pickle.load(f)

In [36]:
ref_and_transl = []

for item in first_ea:
    for key in ["llama_1", "llama_2", "llama_3"]:
        try:
            sentences = re.findall(r'"sentence"\s*:\s*"([^"]+)"', item[key])
            explanations = re.findall(r'"explanation"\s*:\s*"([^"]+)"', item[key])
            for sent, expl in zip(sentences, explanations):
                ref_and_transl.append({
                    "reference": item["sentence"],
                    "scenario": item['scenario'],
                    "llama_run": key,
                    "translation": sent.lower(),
                    "explanation": expl
                })
                
        except (json.JSONDecodeError, TypeError) as e:
            print(f"Failed to parse {key} {item[key]}\nbecause: {e}.")

for item in ref_and_transl:
    if item['reference'] == "can you just open it open it a little bit uh":
        print(item)

{'reference': 'can you just open it open it a little bit uh', 'scenario': 'speech', 'llama_run': 'llama_1', 'translation': 'just open it a bit, please.', 'explanation': "I interpreted the speech AMR by identifying the main action 'open-01' and its arguments. The mode is imperative, indicating that the speaker is giving a command, which corresponds to the polite request in the generated sentence. The modifier 'just' was used to convey the idea of doing something slightly or minimally, as indicated by the degree 'bit' with the modifier 'little'. The pronouns 'you' and 'it' were identified from the ARG0 and ARG1 slots, respectively."}
{'reference': 'can you just open it open it a little bit uh', 'scenario': 'speech', 'llama_run': 'llama_3', 'translation': 'just open it a bit.', 'explanation': "I interpreted the speech AMR by focusing on the main action 'open-01' and its arguments. The mode is imperative, indicating that the speaker is giving a command, which matches the polite tone. The m

In [56]:
prompt = """Do you know the difference between look-01 and look-02 in Propbank?"""
print(ulla.query_LLM(client, prompt, temp=0.5))

A very specific question!

PropBank is a large-scale, linguistically motivated semantic role labeling (SRL) resource that annotates predicate-argument structures for verbs in a sentence. It's a comprehensive dataset used to train and evaluate SRL systems.

After checking the documentation and resources related to PropBank, I found that "look-01" and "look-02" are two different verb sense annotations in PropBank. Specifically:

* `look-01` represents the verb sense of looking at or observing something with one's eyes (e.g., "She looked at the picture.").
* `look-02` represents the verb sense of searching for something, often implying a more active or intentional search (e.g., "He looked for his keys.").

These two senses are distinguished based on the meaning and context in which the verb is used. The PropBank annotation scheme uses this distinction to capture the nuances of verb meaning and its interactions with other elements in the sentence.

Is there anything else I can help you wit

In [59]:
prompt = """Look at the following Abstract Meaning Representation
(p/put-01
	:mode imperative
	:ARG0 (y/you)
	:ARG1 (b/block
		:quant 1)
	:ARG2 (b2/block
		:mod (b3/back)))
Do you know what ARG0, ARG1 and ARG2 indicate?
"""
print(ulla.query_LLM(client, prompt, temp=0.5))

In an Abstract Meaning Representation (AMR), the notation `:ARGn` refers to the arguments of a predicate or relation.

* `ARG0` is typically the "doer" or the subject of the action described by the sentence. In this case, it's the person performing the action.
* `ARG1` represents the direct object or the thing affected by the action. Here, it's the specific block (one instance).
* `ARG2` represents the indirect object or a secondary participant in the action. In this example, it's another block that is somehow related to the first block.

In more detail:

* You are performing an action (putting something somewhere)
* The direct object of your action is one specific block
* The secondary participant involved is another block, which is specified as being "back" (implying it's on the back side of something or in a position that's not directly related to the main action)

The AMR format allows for more nuance and detail about the relationships between entities in a sentence than traditiona

In [38]:
total_sentences = 0
with open("refined1.txt", "r", encoding="utf-8") as file:
    for line in file:
        line = line.strip().lower()
        if not line:
            continue
        total_sentences += 1
print(total_sentences)

## Count errors in the 12 selected sentences
deictic, propbank, polarity, gesture, amr, incoherent, other, no_mistake, total = ulla.count_error_types("refined1.txt")

print(f"Deictic: {deictic}")
print(f"Propbank: {propbank}")
print(f"Polarity: {polarity}")
print(f"Gesture Information: {gesture}")
print(f"AMR Misinterpretation: {amr}")
print(f"Incoherent: {incoherent}")
print(f"Other: {other}")
print(f"No mistake: {no_mistake}")
print(f"Total number of mistakes: {total}")

Deictic: 5
Propbank: 10
Polarity: 4
Gesture Information: 57
AMR Misinterpretation: 39
Incoherent: 5
Other: 14
No mistake: 9
Total number of mistakes: 134


In [24]:
## Second error analysis run --> prompt changed slightly
random.seed(12)

results_ea2 = []
prompts = ulla.generate_prompt_error_anaysis(df_subset, examples)
total_prompts = len(prompts)
for prompt in tqdm(prompts, total=total_prompts, desc="Processing", unit="row"):
    call_llama = ulla.query_LLM_multiple(client, prompt["prompt"], temp=0.3, n=3)
    results_ea2.append({
        "prompt": prompt["prompt"],
        "sentence": prompt["meta"]["sentence"],
        "scenario": prompt["meta"]["scenario"],
        "file": prompt["meta"]["file"],
        "llama_1": call_llama[0],
        "llama_2": call_llama[1],
        "llama_3": call_llama[2]
    })

Processing: 100%|██████████| 36/36 [1:19:46<00:00, 132.96s/row]


In [72]:
## Save results to file each run since I'm restarting the kernel
file_ea2 = "second_ea.pkl"
pickle.dump(results_ea2, open(file_ea2, 'wb'))

In [50]:
with open("second_ea.pkl", "rb") as f:
    second_ea = pickle.load(f)

In [7]:
ref_and_transl = []

for item in second_ea:
    for key in ["llama_1", "llama_2", "llama_3"]:
        try:
            sentences = re.findall(r'"sentence"\s*:\s*"([^"]+)"', item[key])
            explanations = re.findall(r'"explanation"\s*:\s*"([^"]+)"', item[key])
            for sent, expl in zip(sentences, explanations):
                ref_and_transl.append({
                    "reference": item["sentence"],
                    "scenario": item['scenario'],
                    "llama_run": key,
                    "translation": sent.lower(),
                    "explanation": expl
                })
                
        except (json.JSONDecodeError, TypeError) as e:
            print(f"Failed to parse {key} {item[key]}\nbecause: {e}.")

for item in ref_and_transl:
    if item['reference'] == "doesn't matter":
        print(item)

NameError: name 'second_ea' is not defined

In [55]:
## Count errors in the 12 selected sentences
deictic, propbank, polarity, gesture, amr, incoherent, other, no_mistake, total = ulla.count_error_types("refined2.txt")

print(f"Deictic: {deictic}")
print(f"Propbank: {propbank}")
print(f"Polarity: {polarity}")
print(f"Gesture Information: {gesture}")
print(f"AMR Misinterpretation: {amr}")
print(f"Incoherent: {incoherent}")
print(f"Other: {other}")
print(f"No mistake: {no_mistake}")
print(f"Total number of mistakes: {total}")

Deictic: 2
Propbank: 11
Polarity: 6
Gesture Information: 56
AMR Misinterpretation: 30
Incoherent: 7
Other: 19
No mistake: 5
Total number of mistakes: 131
