# Get relgrams from openie6. (v3)
26.04.2023

In [1]:
import dill as pickle
from collections import defaultdict
from tqdm import tqdm

import spacy
from spacy import displacy
# Ask spaCy to use the GPU if one is available.
spacy.prefer_gpu()

# Base path of the OpenIE6 directory; used as the prefix when loading prediction files.
openie6_path = "/mount/studenten/arbeitsdaten-studenten1/shencg/BA/models/event_extraction/relational/openie6/"

OSError: libcudnn.so.7: cannot open shared object file: No such file or directory

In [None]:
# Load the spaCy "en_core_web_trf" pipeline once; the functions in this notebook use it as `nlp`.
nlp = spacy.load("en_core_web_trf")
nlp  # Echo the pipeline object as the cell output.

### Get relgrams for goal sentences.

In [None]:
def get_word_ids(relgram, sentence, nlp):
    """
    Map every slot of a relgram to the token indices of its words in the sentence.

    The sentence is tokenised with ``nlp`` and each whitespace-separated word of
    each relgram slot is matched against the first token with the same text that
    has not been claimed yet, so a repeated word maps to its successive occurrences.
    For example, for the sentence "People hang an ironing board" and relgram
    ["People", "hang", "an ironing board"], word_ids = [[0], [1], [2, 3, 4]].

    Args:
        relgram: list of strings, one per slot.
        sentence: the sentence the relgram was extracted from.
        nlp: callable that tokenises ``sentence``; ``str(token)`` of each item it
            yields is used as the token text.

    Returns:
        A list of index lists. Per slot:
          * the single word "[PAD]" yields ``[-1]``;
          * a single word found in the sentence yields ``[index]``;
          * a single word NOT found in the sentence yields no entry at all;
          * an empty slot yields no entry at all;
          * a multi-word slot yields the indices of the words that were found
            (possibly an empty list).
        Because of the skipped cases the result can be shorter than ``relgram``.
    """
    # Token texts still available for matching. A claimed position is blanked
    # (rather than deleted) so the remaining indices stay valid.
    unclaimed = [str(token) for token in nlp(sentence)]

    def claim(word):
        """Return the index of the first unclaimed token equal to ``word``, or None."""
        try:
            position = unclaimed.index(word)
        except ValueError:
            return None
        unclaimed[position] = ""
        return position

    word_ids = []
    for slot in relgram:
        words = slot.split()
        if not words:
            # Empty slot: contributes nothing.
            continue

        if len(words) == 1:
            only_word = words[0]
            if only_word == "[PAD]":
                word_ids.append([-1])
                continue
            position = claim(only_word)
            if position is not None:
                word_ids.append([position])
            continue

        # Multi-word slot: always contributes a list, keeping only the words found.
        found = [position for position in map(claim, words) if position is not None]
        word_ids.append(found)

    return word_ids

In [None]:
def get_relgrams(predictions):
    """
    Collect the "People ..." relgrams and their word indices for every sentence.

    Args:
        predictions: list of strings. An entry starting with "fid:" is a header
            whose id is the text after "fid:" minus its final character. The
            entry that follows it holds the sentence on its first line (its last
            two characters are dropped) and one extraction per following line.
            From each extraction line the first seven characters and the last
            character are stripped (presumably a confidence prefix and the
            surrounding parentheses -- verify against the OpenIE6 output format),
            and the remainder is split on "; ".

    Returns:
        dict mapping fid -> [sentence, relgrams, word_ids_list], where
        ``relgrams`` is the list of extractions starting with "People" (with the
        last slot shortened, see below) and ``word_ids_list`` holds the result of
        ``get_word_ids`` for each of them.
    """
    fid2predictions = {}
    for i, item in enumerate(tqdm(predictions)):
        if not item.startswith("fid:"):
            continue
        if i + 1 >= len(predictions):
            # A header without a following prediction entry has nothing to parse.
            continue

        fid = item[4:-1]
        block_lines = predictions[i + 1].split("\n")
        sentence = block_lines[0][:-2]

        relgrams = []
        for line in block_lines[1:]:
            if not line:
                continue
            extraction = line[6:][1:-1]
            if extraction.startswith("People"):
                relgrams.append(extraction.split("; "))

        word_ids_list = []
        for relgram in relgrams:
            doc = nlp(" ".join(relgram))
            # Reset per relgram so a root index from a previous relgram is never
            # reused and tokens seen before the root cannot trigger a NameError.
            i_root = None
            for token in doc:
                if token.dep_ == 'ROOT':
                    i_root = token.i
                if i_root is None:
                    continue
                if token.dep_ == 'dobj' and token.head.i == i_root:
                    # Direct object of the root: the last slot becomes the text
                    # from just after the root up to and including the object.
                    relgram[-1] = doc[i_root + 1:token.i + 1].text
                    break
                if token.dep_ == 'prep' and token.head.i == i_root:
                    # Preposition attached to the root: the last slot becomes the
                    # text between the root and the preposition, or "[PAD]" when
                    # there is none. No break, so a later match may overwrite it.
                    last_item = doc[i_root + 1:token.i].text
                    relgram[-1] = last_item if last_item != '' else "[PAD]"

            # Get word_ids in each relgram.
            word_ids_list.append(get_word_ids(relgram, sentence, nlp))

        fid2predictions[fid] = [sentence, relgrams, word_ids_list]

    print("Number of relgrams:", len(fid2predictions))

    return fid2predictions

In [None]:
# Load predictions of openIE 6.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
with open(openie6_path + "Experiment/output/goals/predictions.p", "rb") as file:
    predictions = pickle.load(file)
# Drop the bare-newline entries. A filtered copy is built because removing from
# the list while iterating over it skips the element after each removal.
predictions = [pred for pred in predictions if pred != "\n"]
print("Number of predictions:", len(predictions), "\n")

In [None]:
# Build the fid -> [sentence, relgrams, word_ids_list] mapping from the loaded predictions.
fid2predictions = get_relgrams(predictions)

In [2]:
# Post-processing.
def post_processing(fid2predictions):
    """
    Pad or drop relgrams whose third slot is empty.

    For every entry ``[sentence, relgrams, word_ids_list]`` each relgram is
    inspected together with its index list:

      * third slot non-empty: kept unchanged;
      * third slot empty and the second slot is a single word that the spaCy
        pipeline tags as VERB (parsed together with the first slot): the third
        slot is set to "[PAD]" and ``[-1]`` is appended to its index list;
      * third slot empty otherwise: the relgram and its index list are dropped.

    Every relgram with an empty third slot is printed with its indices.

    Args:
        fid2predictions: dict mapping fid -> [sentence, relgrams, word_ids_list].
            The relgram and index lists are modified in place.

    Returns:
        The same ``fid2predictions`` object.

    Raises:
        IndexError: if a relgram has fewer than three slots.
    """
    for pred in fid2predictions.values():
        kept_relgrams = []
        kept_indices = []
        # Collect the surviving pairs instead of calling remove() during the
        # iteration: removing shifts the lists and makes the loop skip pairs,
        # and remove() matches by equality, so it may drop the wrong pair.
        for relgram, indices in zip(pred[1], pred[2]):
            if relgram[2] == '':
                print(relgram, indices)
                is_bare_verb = False
                if len(relgram[1].split()) == 1:
                    doc = nlp(" ".join([relgram[0], relgram[1]]))
                    pos_list = [token.pos_ for token in doc]
                    is_bare_verb = pos_list[1] == 'VERB'
                if not is_bare_verb:
                    continue
                relgram[2] = '[PAD]'
                indices.append([-1])
            kept_relgrams.append(relgram)
            kept_indices.append(indices)

        # Slice-assign so the caller's list objects are updated in place.
        pred[1][:] = kept_relgrams
        pred[2][:] = kept_indices

    return fid2predictions

In [None]:
# Pad or drop the relgrams whose third slot is empty.
fid2predictions = post_processing(fid2predictions)

In [None]:
# # Write the post-processed file.
# with open("fid2predictions.p", "wb") as file:
#     pickle.dump(fid2predictions, file)

### Get relgrams for step headlines.