# Here the manually transcribed data is formatted to the same format as the generated data

Functions which handle the formatting portion of the read.

In [71]:
import json

def retrieve_entities(sentence):
    """Return the quoted fragments of ``sentence`` that pass the entity filter.

    The text is split on double quotes. A fragment is kept only when it is
    longer than two characters and contains neither "[" nor "]". This drops
    list punctuation such as ``', '`` and the bracket fragments, but it also
    drops any quoted name of one or two characters.
    """
    fragments = sentence.split('"')
    return [
        piece
        for piece in fragments
        if len(piece) > 2 and "[" not in piece and "]" not in piece
    ]

def add_entities(ents, tag):
    """Pair every entity in ``ents`` with ``tag``.

    Returns a list of ``[entity.lower(), tag]`` two-item lists, in the same
    order as ``ents``.
    """
    return [[name.lower(), tag] for name in ents]

def finalize_dialogue(moving, ipl, gpe, topics, sentences):
    """Serialise one transcribed dialogue to a JSON string.

    ``moving``, ``ipl`` and ``gpe`` are lists of entity names, tagged "COM",
    "IPL" and "GPE" respectively. ``topics`` lists the topic names that apply
    to the dialogue and ``sentences`` is the dialogue text, which is
    lower-cased before use.

    The result is ``json.dumps`` of a dict with the keys "dialogue",
    "entities" and "cats". Relies on the module-level ``gen`` object for
    ``get_labels`` and ``topic_categories``.
    """
    sentences = sentences.lower()

    # Tag each entity group so it matches the form used by the generated data.
    entities = []
    for group, tag in ((moving, "COM"), (ipl, "IPL"), (gpe, "GPE")):
        entities += add_entities(group, tag)
    # NOTE(review): get_labels is defined elsewhere; presumably it locates the
    # entities inside the text -- confirm against the Generator class.
    entity_labels = gen.get_labels(sentences, entities)

    # Every known category starts as False; only topics that are known
    # categories get switched on, unknown topic names are ignored.
    topic_labels = dict.fromkeys(gen.topic_categories, False)
    for topic in topics:
        if topic in topic_labels:
            topic_labels[topic] = True

    cats = [[name, flag] for name, flag in topic_labels.items()]
    return json.dumps({
        "dialogue": sentences,
        "entities": entity_labels,
        "cats": cats,
    })

In [72]:
from modules.generator import Generator
gen = Generator(None, None)
res = []
with open("transcriptions.txt", "r") as f:
    # Accumulators for the dialogue that is currently being read.
    moving_entities = []
    ipl = []
    gpe = []
    topics = []
    sentences = ""

    for line in f:
        # Split on the first colon only: the text after the category label
        # may itself contain colons, and an unbounded split would then fail
        # to unpack into exactly (category, sentence).
        split = line.split(":", 1)

        if len(split) < 2:
            # A line without any colon marks the end of a dialogue.
            dialogue = finalize_dialogue(moving_entities, ipl, gpe, topics, sentences)
            res.append(dialogue)

            # Reset the accumulators for the next dialogue.
            moving_entities = []
            ipl = []
            gpe = []
            topics = []
            sentences = ""
            continue

        # Split into category and sentence.
        cat, sentence = split

        # Add the element to the matching entity list or to the dialogue text.
        if cat == "ENTITY":
            moving_entities += retrieve_entities(sentence)
        elif cat == "IPL":
            ipl += retrieve_entities(sentence)
        elif cat == "GPE":
            gpe += retrieve_entities(sentence)
        elif cat == "TOPIC":
            topics += retrieve_entities(sentence)
        else:
            # Any other label is treated as dialogue text; splitting and
            # re-joining collapses whitespace runs and drops the newline.
            sentences += " ".join(sentence.split()) + " "

    # The file may end without a closing separator line. Flush whatever is
    # still pending so the last dialogue is not silently dropped.
    if sentences or moving_entities or ipl or gpe or topics:
        dialogue = finalize_dialogue(moving_entities, ipl, gpe, topics, sentences)
        res.append(dialogue)

Saving the dialogues to a set file

In [73]:
# Write the dialogues one per line (no trailing newline after the last one).
with open("../Dataset/transcribed_dialogues.json", mode="w") as f:
    # Preview of the first dialogue. The entries of res are already JSON
    # strings, so this prints the entry re-encoded as a JSON string literal.
    preview = json.dumps(res[0])
    print(preview)

    joined = "\n".join(res)
    f.write(joined)

"{\"dialogue\": \"vts oxel\\u00f6sund svart\\u00f6. svarte vts tv\\u00e5 tusen. anchorage away close of vinterklasen outbound for sea no pilot required. svarte anchor away proceeding outbound for sea. you have outbound vessle godafors approaching vinterklasen and inbound vessle stillmover at kr\\u00e4nkan s\\u00e4nd\\u00f6hook (sandihook) in korph\\u00e5let. it's okey well understood thank you. then you have inbound vessle white force queen inbound kr\\u00e4nkan. white force queen inbound approaching kr\\u00e4nkan. \", \"entities\": [[\"godafors\", 190, 198, \"COM\"], [\"stillmover\", 243, 253, \"COM\"], [\"svarte\", 22, 28, \"COM\"], [\"svarte\", 117, 123, \"COM\"], [\"svart\\u00f6\", 14, 20, \"COM\"], [\"vinterklasen\", 68, 80, \"IPL\"], [\"vinterklasen\", 211, 223, \"IPL\"], [\"vts oxel\\u00f6sund\", 0, 13, \"COM\"], [\"vts tv\\u00e5 tusen\", 29, 42, \"COM\"], [\"white force queen\", 367, 384, \"COM\"], [\"white force queen\", 402, 419, \"COM\"]], \"cats\": [[\"traffic information\"

Try loading the transcribed dataset

In [74]:
# Read the file back, decoding each line as one JSON document.
data = []
with open("../Dataset/transcribed_dialogues.json", "r") as f:
    data.extend(json.loads(row) for row in f)

print("Transcribed sentences: ", len(data))

Transcribed sentences:  37
