## Creating the SAGE-AMR dataset

In [20]:
from collections import defaultdict, Counter
import process_data as proda
import pandas as pd
import pickle
import json
import re
import spacy
import os

In [50]:
## If you want to check the output per individual file
file_path = "gold/p15_gold.txt"
extracted = proda.extract_data_from_txt(file_path)
merge = proda.merge_speech_and_gesture_with_amr(extracted, filename="p19_gold.txt")
for item in merge:
    if item.get("sentence"):
        item["sentence"] = item["sentence"].lower()
    print(item)

{'file': 'p19_gold', 'begin_time': 1.08, 'end_time': 2.32, 'sentence': 'all right put a block down', 'speech_amr': '(p / put-01   :mode imperative   :ARG0 (y / you)   :ARG1 (b / block    :quant 1     :direction (d / down))  :mod (a / all-right))', 'gesture_labels': ['RA: move, front left; RH: into claw, down;', 'body: still;'], 'gesture_amrs': ['(g / gesture-unit  :op1 (i / icon-GA   :ARG0 (s / signaler)       :ARG1 (p / put-01)       :ARG2 (a / actor))  :op2 (d / deixis-GA   :ARG0 s       :ARG1 (l / location)       :ARG2 a))']}
{'file': 'p19_gold', 'begin_time': 5.28, 'end_time': 7.03, 'sentence': 'um', 'speech_amr': '(u / um :mode expressive)', 'gesture_labels': ['body: still;', 'LA: move, front, into gap, front; LH: into claw, down;'], 'gesture_amrs': []}
{'file': 'p19_gold', 'begin_time': 7.07, 'end_time': 10.36, 'sentence': 'put a block one block apart behind', 'speech_amr': '(p / put-01   :mode imperative  :ARG0 (y / you)  :ARG1 (b / block   :quant 1)  :location (a / apart   :mod

In [51]:
## Specify the path where the gold data is stored
folder_path = "gold/"
dataset_path = "dataset_thesis.txt"
all_merged_data = []
gesture_corpus_statistics = []

## Manually add the two empty sentences
manual_sentences = {
    ("p18_gold", 26.46): "doesn't matter",
    ("p30_gold", 32.84): "yeah",
}

for filename in os.listdir(folder_path):
    if filename.endswith(".txt") and filename.startswith("p"):
        file_path = os.path.join(folder_path, filename)
        extracted = proda.extract_data_from_txt(file_path)
        merge = proda.merge_speech_and_gesture_with_amr(extracted, filename=filename)

        ## Store all the gesture corpus sentences
        for item in merge:
            gesture_corpus_statistics.append(item)

        ## Remove sentences without gesture
        merge = [item for item in merge if item.get('gesture_amrs')]
        ## Remove the faulty sentence
        merge = [item for item in merge if item.get('sentence', '').strip().lower() != 'to']

        for item in merge:
            key = (item["file"], round(item["begin_time"], 2))
            ## Append the two missing sentences
            if not item.get("sentence") and key in manual_sentences:
                item["sentence"] = manual_sentences[key]

            # Lowercase all sentence text
            if item.get("sentence"):
                item["sentence"] = item["sentence"].lower()

            all_merged_data.append(item)
        print(all_merged_data)
        proda.create_dataset(merge, dataset_path, source_name=filename, mode="a")

[{'file': 'p15_gold', 'begin_time': 1.08, 'end_time': 2.32, 'sentence': 'all right put a block down', 'speech_amr': '(p / put-01   :mode imperative   :ARG0 (y / you)   :ARG1 (b / block    :quant 1     :direction (d / down))  :mod (a / all-right))', 'gesture_labels': ['RA: move, front left; RH: into claw, down;', 'body: still;'], 'gesture_amrs': ['(g / gesture-unit  :op1 (i / icon-GA   :ARG0 (s / signaler)       :ARG1 (p / put-01)       :ARG2 (a / actor))  :op2 (d / deixis-GA   :ARG0 s       :ARG1 (l / location)       :ARG2 a))']}, {'file': 'p15_gold', 'begin_time': 7.07, 'end_time': 10.36, 'sentence': 'put a block one block apart behind', 'speech_amr': '(p / put-01   :mode imperative  :ARG0 (y / you)  :ARG1 (b / block   :quant 1)  :location (a / apart   :mod (b2 / block    :quant 1))  :location (b3 / behind))', 'gesture_labels': ['LA: move, front, into gap, front; LH: into claw, down;', 'LA: move, down; LH: claw, down;', 'body: still;'], 'gesture_amrs': ['(a / and  :op2 (g / gesture-un

In [221]:
## Store the list containing the data to prompt Llama
with open("all_merged_data.pkl", "wb") as file:
    pickle.dump(all_merged_data, file)

## Statistics

In [160]:
## Gesture AMR corpus
icon_count = 0
deixis_count = 0
emblem_count = 0

for filename in os.listdir(folder_path):
    if filename.endswith(".txt") and filename.startswith("p"):
        file_path = os.path.join(folder_path, filename)
        extracted = proda.extract_data_from_txt(file_path)
        for item in extracted:
            if item['gesture_amr'] != "":
                icon_count += item['gesture_amr'].count("icon")
                deixis_count += item['gesture_amr'].count("deixis")
                emblem_count += item['gesture_amr'].count("emblem")

sent_count, sent_with_gestures, avg_length, _,_,_ = proda.count_statistics(gesture_corpus_statistics)
print(f"Number of sentences: {sent_count}")
print(f"Number of sentences with gestures: {sent_with_gestures}")
print(f"Average sentence length: {avg_length}")
print(f"Iconic gestures count: {icon_count}")
print(f"Deictic gestures count: {deixis_count}")
print(f"Emblematic gestures count: {emblem_count}")

print()

gesture_count = Counter()
for item in gesture_corpus_statistics:
    for label in item['gesture_labels']:
        gesture_labels = [g.strip() for g in label.split(';') if g.strip()]
        gesture_count.update(gesture_labels)

print(f"Total number of gestures: {gesture_count.total()}")
print("Gesture labels and their counts:")
for gesture, count in gesture_count.most_common():
    print(f"{gesture}: {count}")

Number of sentences: 342
Number of sentences with gestures: 247
Average sentence length: 8.08
Iconic gestures count: 245
Deictic gestures count: 127
Emblematic gestures count: 39

Total number of gestures: 1637
Gesture labels and their counts:
body: still: 199
Unknown: 155
RA: move, up: 53
arms: move, down: 47
RA: move, down: 39
head: rotate: 37
hands: into claw, down: 35
arms: move, up: 35
RA: move, front: 35
arms: apart, left: 32
LA: move, down: 23
head: nod: 22
RH: into open, down: 22
LA: move, up: 20
RA: move, back: 18
body: move, up: 18
arms: move, front: 17
hands: rotate: 17
body: move, down: 17
hands: claw, down: 16
arms: move, back: 16
arms: together, left: 15
RH: into point, front: 14
RA: move, right: 14
LA: move, front: 14
hands: into open, down: 14
hands: facing, closed: 14
RH: claw, down: 12
LA: move, back: 12
RH: into claw, down: 11
RA: move, left: 11
RH: into point, down: 11
LA: move, left: 11
LA: move, right: 11
arms: move, down back: 11
LH: into point, front: 10
hands: 

In [154]:
## SAGE-AMR corpus
sent_count, sent_with_gestures, avg_length, icon_count, deixis_count, emblem_count = proda.count_statistics(all_merged_data)
print(f"Number of sentences: {sent_count}")
print(f"Average sentence length: {avg_length}")
print(f"Iconic gestures count: {icon_count}")
print(f"Deictic gestures count: {deixis_count}")
print(f"Emblematic gestures count: {emblem_count}")

print()

print(f"Total number of gestures: {gesture_count.total()}")
print("Gesture labels and their counts:")
gesture_count = Counter()
for item in all_merged_data:
    for label in item['gesture_labels']:
        gesture_labels = [g.strip() for g in label.split(';') if g.strip()]
        gesture_count.update(gesture_labels)

for gesture, count in gesture_count.most_common():
    print(f"{gesture}: {count}")

Number of sentences: 246
Average sentence length: 8.11
Iconic gestures count: 308
Deictic gestures count: 153
Emblematic gestures count: 48

Total number of gestures: 1415
Gesture labels and their counts:
body: still: 139
Unknown: 119
RA: move, up: 52
arms: move, down: 42
RA: move, down: 36
arms: move, up: 34
RA: move, front: 33
arms: apart, left: 31
hands: into claw, down: 30
LA: move, down: 23
head: nod: 22
head: rotate: 22
LA: move, up: 19
RA: move, back: 18
RH: into open, down: 17
hands: claw, down: 16
RA: move, right: 14
hands: facing, closed: 14
arms: move, back: 14
arms: together, left: 14
hands: rotate: 14
body: move, down: 14
RH: into point, front: 13
arms: move, front: 13
LA: move, front: 13
hands: into open, down: 13
LA: move, back: 12
RH: into claw, down: 11
RH: claw, down: 11
RA: move, left: 11
RH: into point, down: 11
LA: move, left: 11
LA: move, right: 11
body: move, up: 10
hands: into closed, back: 9
LH: into point, front: 9
arms: shake, left: 9
hands: into point, down: