In [27]:
# Insert code here.
!pip install dill
!pip install flair
!pip install nltk
!pip install networkx
!pip install matplotlib



In [28]:
from globalfn.annotations import all_annotations, annotation
from globalfn.alignments import all_alignments, aligned_with

import dill
import joblib  # for pickling sklearn
import pickle
import collections
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from flair.data import Sentence
from flair.embeddings import TransformerWordEmbeddings

import nltk
nltk.download('framenet_v17')
from nltk.corpus import framenet as fn

[nltk_data] Downloading package framenet_v17 to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package framenet_v17 is already up-to-date!


In [29]:
# !ls /datasets/any-language-frame-semantics
# import zipfile
# with zipfile.ZipFile("/datasets/any-language-frame-semantics/extracted_Annos.zip", 'r') as zip_ref:
#     zip_ref.extractall("/datasets/any-language-frame-semantics/")

In [30]:
# Annotated training data for the frame classifier; each item exposes
# `.frame`, `.text` and `.LU_idx` (see the training cell that consumes it).
# NOTE: dill/pickle can execute arbitrary code on load -- only open trusted files.
with open("/datasets/any-language-frame-semantics/el_out_eleni.dill.pkl", 'rb') as dataset_file:
    any_language_dataset = dill.load(dataset_file)

### Basic (Based on Any-Language-Parser)

In [None]:
# Multilingual BERT word embedder (flair wrapper around the
# 'bert-base-multilingual-cased' checkpoint); `mbert.embed(sentence)` attaches
# an embedding to every token of a flair Sentence.
mbert = TransformerWordEmbeddings('bert-base-multilingual-cased')

In [None]:
# Build the frame-name -> integer-ID mapping (IDs start at 0, assigned in order
# of first appearance) together with the training matrix for the classifier.
# A plain dict is used on purpose: a defaultdict(int) would silently map every
# *unseen* frame to ID 0 (and insert it) on lookup.
frame_ID = dict()
X = list()  # one mBERT embedding per annotation: the embedding of the LU token
y = list()  # frame ID of the same annotation
for anno in any_language_dataset:
    # len(frame_ID) is evaluated before the insertion, so a new frame gets the next free ID
    frame_ID.setdefault(anno.frame, len(frame_ID))
    sent = Sentence(' '.join(anno.text))
    mbert.embed(sent)
    X.append(sent[anno.LU_idx].embedding.to('cpu').numpy())
    y.append(frame_ID[anno.frame])
X = np.array(X)
y = np.array(y)

# The classifier was trained once with the two lines below and is now only
# loaded from disk (joblib accepts a path directly and closes the file itself).
# clf = LogisticRegression(random_state=0, max_iter=1_000).fit(X, y)
# joblib.dump(clf, "saved/models/any_language_classifier_mBERT.joblib")
clf = joblib.load("saved/models/any_language_classifier_mBERT.joblib")

In [None]:
def predict_frames(embedding_pkl, classifier_joblib, lang, frame_ID):
    """
    Predict a frame for every annotated LU of `lang` and pickle the result.

    Input:
        - embedding_pkl: path to a pickle {sent_ID: [(word, embedding), ...]},
          indexed by token position within the sentence
        - classifier_joblib: path to the joblib-serialized classifier
        - lang: language code passed to `all_annotations`
        - frame_ID: {frame name: integer class ID}
    Output (written to saved/results/{lang}_ID_to_frames.pkl, nothing is returned):
        - {sent_ID: [(word, frame), ...]}
          An LU whose start offset does not coincide with the start of a token
          is reported on stdout and contributes '' instead of a tuple.
    """
    # load classifier (joblib accepts a path and closes the file itself)
    clf = joblib.load(classifier_joblib)

    # load embeddings
    # NOTE: pickle can execute arbitrary code on load -- only use trusted files.
    with open(embedding_pkl, 'rb') as embedding_file:
        sent_to_mbert_unigram = pickle.load(embedding_file)

    inv_frame_ID = {v: k for k, v in frame_ID.items()}
    ID_to_frames = dict()  # sent ID -> [predicted frame for each annotation]
    for sent_ID, annos in all_annotations(lang).items():
        preds = list()
        for anno in annos:
            sent = Sentence(anno.text, use_tokenizer=True)
            token_start_positions = [token.start_position for token in sent]
            # character offset -> index of the first token starting there
            start_to_token_idx = dict()
            for token_idx, position in enumerate(token_start_positions):
                start_to_token_idx.setdefault(position, token_idx)
            # get start and end index for the LU
            for start, end, _ in anno.lu_idx:
                if start not in start_to_token_idx:
                    # LU index is not right
                    print(sent_ID, start, end, anno.text[start:end+1], f'"{anno.text}"', token_start_positions, )
                    preds.append('')
                else:
                    idx = start_to_token_idx[start]
                    word, embedding = sent_to_mbert_unigram[sent_ID][idx]
                    X = [embedding.numpy()]
                    pred = clf.predict(X)[0]
                    preds.append((word, inv_frame_ID[pred]))
        ID_to_frames[sent_ID] = preds
    with open(f"saved/results/{lang}_ID_to_frames.pkl", 'wb') as results_file:
        pickle.dump(ID_to_frames, results_file)

# pickle.dump(de_ID_to_frames, open("saved/results/de_ID-to_frames.pkl", 'wb'))

In [None]:
# Predict frames for the 'de' annotations. predict_frames writes its result to
# saved/results/de_ID_to_frames.pkl and prints one diagnostic line for every LU
# whose start offset does not coincide with the start of a token.
predict_frames("saved/embeddings/de_sent_to_mbert_unigram.pkl", 
               "saved/models/any_language_classifier_mBERT.joblib",
               'de',
               frame_ID)

1295 1 5 ragen "Fragen Sie sie nach ihrer Schulbildung, nageln sie Sie an die Wand." [0, 7, 11, 15, 20, 26, 38, 40, 47, 51, 55, 58, 62, 66]


In [None]:
# Predict frames for the 'pt' annotations. predict_frames writes its result to
# saved/results/pt_ID_to_frames.pkl and prints one diagnostic line for every LU
# whose start offset does not coincide with the start of a token.
predict_frames("saved/embeddings/pt_sent_to_mbert_unigram.pkl", 
               "saved/models/any_language_classifier_mBERT.joblib",
               'pt',
               frame_ID)

741 1 1 a "Na verdade, estou indo embora." [0, 3, 10, 12, 18, 23, 29]
834 1 1 a "Na verdade, nós moramos numa cidade chamada Snitterfield, na periferia de Stratford, que foi onde o pai do Shakespeare nasceu." [0, 3, 10, 12, 16, 24, 29, 36, 44, 56, 58, 61, 71, 74, 83, 85, 89, 93, 98, 100, 104, 107, 119, 125]
847 46 52 Ninguém "'Larga esse lápis.E para de falar desse jeito.Ninguém entende nada.'" [0, 1, 7, 12, 20, 25, 28, 34, 40, 54, 62, 66, 67]
847 18 18 E "'Larga esse lápis.E para de falar desse jeito.Ninguém entende nada.'" [0, 1, 7, 12, 20, 25, 28, 34, 40, 54, 62, 66, 67]
975 1 1 o "No final o médico sentou ao lado da Gillian e disse: 'Gillian, eu ouvi todas as coisas que sua mãe me disse, e eu preciso conversar a sós com ela.'" [0, 3, 9, 11, 18, 25, 28, 33, 36, 44, 46, 51, 53, 54, 61, 63, 66, 71, 77, 80, 87, 91, 95, 99, 102, 107, 109, 111, 114, 122, 132, 134, 138, 142, 145, 146]
976 37 39 Não "Ele disse: 'Espere aqui, já voltamos.Não vai demorar.',e eles deixaram ela sozinha." [0, 4

In [None]:
# Predict frames for the 'en' annotations. predict_frames writes its result to
# saved/results/en_ID_to_frames.pkl and prints one diagnostic line for every LU
# whose start offset does not coincide with the start of a token.
predict_frames("saved/embeddings/en_sent_to_mbert_unigram.pkl", 
               "saved/models/any_language_classifier_mBERT.joblib",
               'en',
               frame_ID)

1028 5 11 re like "They're like, 'Oh my God,' you know, 'Why me?'" [0, 4, 8, 12, 14, 15, 18, 21, 24, 27, 31, 35, 37, 38, 42, 44, 45]
1137 1 4 here "There isn't an education system on the planet that teaches dance every day to children the way we teach them mathematics." [0, 6, 8, 12, 15, 25, 32, 35, 39, 46, 51, 59, 65, 71, 75, 78, 87, 91, 95, 98, 104, 109, 120]
1157 1 6 here's "There's something curious about professors in my experience -- not all of them, but typically, they live in their heads." [0, 5, 8, 18, 26, 32, 43, 46, 49, 60, 63, 67, 71, 74, 78, 80, 84, 93, 95, 100, 105, 108, 114, 119]
1206 1 6 here's "There's a raft of research, but I know it from my personal life." [0, 5, 8, 10, 15, 18, 26, 28, 32, 34, 39, 42, 47, 50, 59, 63]
1262 66 69 mine "Our education system has mined our minds in the way that we strip-mine the earth: for a particular commodity." [0, 4, 14, 21, 25, 31, 35, 41, 44, 48, 52, 57, 60, 71, 75, 80, 82, 86, 88, 99, 108]
1265 1 4 here "There was a wonderful quot

In [None]:
# Inspect the English predictions written by predict_frames:
# {sent_ID: [(word, predicted frame), ...]}
with open("saved/results/en_ID_to_frames.pkl", 'rb') as results_file:
    en_ID_to_frames = pickle.load(results_file)
en_ID_to_frames

{1010: [('great', 'Range')],
 1011: [('blown', 'Cardinal_numbers'),
  ('thing', 'Text'),
  ('whole', 'Cardinal_numbers')],
 1012: [('leaving', 'Locale_by_use')],
 1013: [('three', 'Cardinal_numbers'),
  ('themes', 'Text'),
  ('running', 'Time_vector'),
  ('talk', 'Information'),
  ('relevant', 'Time_vector'),
  ('want', 'Calendric_unit'),
  ('conference', 'Event')],
 1014: [('One', 'Cardinal_numbers'),
  ('extraordinary', 'Being_named'),
  ('presentations', 'Information'),
  ('evidence', 'Being_named'),
  ('people', 'People'),
  ('here', 'Time_vector'),
  ('creativity', 'Capability'),
  ('all', 'Degree'),
  ('all', 'Degree')],
 1015: [('variety', 'Kinship'), ('range', 'Kinship')],
 1016: [('put', 'Time_vector'),
  ('place', 'Calendric_unit'),
  ('idea', 'Text'),
  ('happen', 'Text'),
  ('second', 'Cardinal_numbers'),
  ('what', 'Degree'),
  ('where', 'Degree'),
  ('in', 'Cardinal_numbers'),
  ('future', 'Information'),
  ('no', 'Cardinal_numbers'),
  ('have', 'Cardinal_numbers')],
 101

### Modification 2: Reduce search space for semantic frames

In [31]:
# Directed graph over FrameNet frame names: one edge superFrame -> subFrame for
# every frame relation of every frame, with the relation type name stored in the
# edge attribute `fr`. Relations are not filtered by type. A DiGraph keeps a
# single edge per (super, sub) pair, so re-adding a pair only updates `fr`.
G = nx.DiGraph()
for frame in fn.frames():
    for fr in frame.frameRelations:
        # sanity-check the relation record *before* its fields are dereferenced
        assert "superFrame" in fr.keys()
        assert "subFrame" in fr.keys()
        assert "name" in fr.type.keys()
        G.add_edge(fr.superFrame.name, fr.subFrame.name, fr=fr.type.name)

In [32]:
def get_subframes(G, root_frame, level=0):
    """
    Collect `root_frame` and every frame reachable from it in at most `level`
    hops along the edges of `G`.

    Input:
        - G: graph exposing `neighbors(node)` (for a DiGraph: the successors)
        - root_frame: frame name to start from
        - level: maximum number of hops; 0 returns just [root_frame]
    Output:
        - list of frame names in breadth-first order, starting with
          `root_frame`. Each frame appears once, at its first position, even
          when it is reachable through several paths or through a cycle.
    """
    res = [root_frame]
    seen = {root_frame}
    frontier = [root_frame]  # frames discovered in the previous hop
    for _ in range(level):
        next_frontier = []
        for frame in frontier:
            for neighbor in G.neighbors(frame):
                if neighbor not in seen:
                    seen.add(neighbor)
                    res.append(neighbor)
                    next_frontier.append(neighbor)
        if not next_frontier:
            # nothing new was reached; further hops cannot add frames
            break
        frontier = next_frontier
    return res

# Example: 'Apply_heat' itself plus the frames one hop away from it in G.
get_subframes(G, 'Apply_heat', level=1)

['Apply_heat', 'Absorb_heat', 'Cooking_creation', 'Soaking']

In [33]:
# Frame name -> integer class ID, assigned in order of first appearance in the
# dataset (IDs start at 0). A plain dict is used on purpose: a defaultdict(int)
# would silently map every *unseen* frame to ID 0 (and insert it) on lookup.
frame_ID = dict()
for anno in any_language_dataset:
    # len(frame_ID) is evaluated before the insertion, so a new frame gets the next free ID
    frame_ID.setdefault(anno.frame, len(frame_ID))

In [34]:
def map_sent_to_LU_embeddings(lang, unigram_embedding_pkl_file):
    """
    Map every sentence of `lang` to the embeddings of its annotated LUs.

    input:
        - lang: language code passed to `all_annotations`
        - unigram_embedding_pkl_file: path to a pickle
          {sent_ID: [(word, embedding), ...]}, indexed by token position
    output: {sent_ID: [(LU1 (surface form), embedding1), ...]}

    For each LU span (start, end) given as character offsets:
        - start is not the start of a token: the LU is skipped;
        - end is not the end of a token: only the first token is used;
        - otherwise: the words of tokens start..end are joined with spaces and
          their embeddings are summed.
    The sum is computed out of place, so the cached token embeddings (and
    entries already appended to the result) are never modified.
    """
    # NOTE: pickle can execute arbitrary code on load -- only use trusted files.
    with open(unigram_embedding_pkl_file, 'rb') as embedding_file:
        sent_to_embeddings = pickle.load(embedding_file)

    sent_to_LU_embeddings = collections.defaultdict(list)
    for sent_ID, annos in all_annotations(lang).items():
        for anno in annos:
            sent = Sentence(anno.text, use_tokenizer=True)
            # character offset -> index of the first token starting / ending there
            start_to_token_idx = dict()
            end_to_token_idx = dict()
            for token_idx, token in enumerate(sent):
                start_to_token_idx.setdefault(token.start_position, token_idx)
                end_to_token_idx.setdefault(token.start_position + len(token.text) - 1, token_idx)

            # get start and end index for the LU
            for start, end, _ in anno.lu_idx:
                if start not in start_to_token_idx:
                    # LU index is not right
                    continue

                token_embeddings = sent_to_embeddings[sent_ID]
                start_idx = start_to_token_idx[start]
                word, embedding = token_embeddings[start_idx]
                if end in end_to_token_idx:
                    # range() is empty when the end token is not after the start
                    # token, which leaves the single-token LU
                    for idx in range(start_idx + 1, end_to_token_idx[end] + 1):
                        next_word, next_embedding = token_embeddings[idx]
                        # out of place: `add_` would corrupt the cached embedding
                        embedding = embedding + next_embedding
                        word += " " + next_word
                sent_to_LU_embeddings[sent_ID].append((word, embedding))
    return sent_to_LU_embeddings

In [36]:
def helper_modification_2(clf, X, potential_frames, frame_ID):
    """
    Return the class ID of the most probable frame among `potential_frames`.

    Input:
        - clf: classifier exposing `predict_proba`; if it also exposes
          `classes_`, column i of the probabilities is taken to belong to
          class `classes_[i]`, otherwise to class ID i
        - X: a batch containing one feature vector (only row 0 is used)
        - potential_frames: frame names the prediction is restricted to;
          names missing from `frame_ID` are ignored
        - frame_ID: {frame name: integer class ID}
    Output:
        - class ID of the allowed frame with the highest probability
    Raises:
        - ValueError if none of `potential_frames` is a class of `clf`
    """
    # probabilities for all frames
    frame_probabilities = np.array(clf.predict_proba(X)[0], dtype=float)

    classes = getattr(clf, "classes_", None)
    if classes is None:
        classes = np.arange(frame_probabilities.shape[0])
    else:
        classes = np.asarray(classes)

    # masking to restrict the search space
    allowed_ids = [frame_ID[frame] for frame in potential_frames if frame in frame_ID]
    mask = np.isin(classes, allowed_ids)
    if not mask.any():
        raise ValueError("none of the potential frames is known to the classifier")

    # Disallowed frames get -1 (below any probability) rather than 0, so an
    # allowed frame wins even when its own probability is exactly 0.
    restricted = np.where(mask, frame_probabilities, -1.0)
    return classes[np.argmax(restricted)]


# Classifiers already loaded by predict_frames_modification_2, keyed by path.
# Re-run this cell to drop the cache after a model file changes on disk.
_CLASSIFIER_CACHE = {}


def _load_classifier(classifier_joblib):
    """Load the joblib-serialized classifier at `classifier_joblib`, at most once per path."""
    if classifier_joblib not in _CLASSIFIER_CACHE:
        # joblib accepts a path and closes the file itself
        _CLASSIFIER_CACHE[classifier_joblib] = joblib.load(classifier_joblib)
    return _CLASSIFIER_CACHE[classifier_joblib]


def predict_frames_modification_2(LU_embedding, classifier_joblib, frame_ID, G, source_frame, level=0):
    """
    Predict the frame of one LU, restricted to `source_frame` and the frames
    reachable from it in `G` within `level` hops.

    Input:
        - LU_embedding: embedding of the LU; must expose `.numpy()`
        - classifier_joblib: path to the joblib-serialized classifier
        - frame_ID: {frame name: integer class ID}
        - G: frame-relation graph handed to `get_subframes`
        - source_frame: frame name the search space is built around
        - level: number of hops passed to `get_subframes` (0 = source frame only)
    Output:
        - name of the predicted frame
    """
    # load classifier (cached: this function is called once per LU)
    clf = _load_classifier(classifier_joblib)

    # map ID to frames
    inv_frame_ID = {v: k for k, v in frame_ID.items()}

    # get potential frames (restrict search space); frames without a class ID are dropped
    potential_frames = [frame for frame in get_subframes(G, source_frame, level=level) if frame in frame_ID]

    # predict target frame
    X = [LU_embedding.numpy()]
    target_frame_ID = helper_modification_2(clf, X, potential_frames, frame_ID)
    return inv_frame_ID[target_frame_ID]

# pickle.dump(de_ID_to_frames, open("saved/results/de_ID-to_frames.pkl", 'wb'))

In [37]:
# using gold LUs in target language

def frame_identification(source_lang, target_lang, reverse=0, level=0):
    """
    Predict frames for the LUs of aligned sentences, restricted by the frame
    annotated on the sentence they are aligned with.

    input:
        - source_lang, target_lang: language codes; the alignment set
          '{source_lang}-{target_lang}' is used
        - reverse: 0 uses entry 0 of the alignments and the LU embeddings of
          `target_lang`; a truthy value uses entry 1 and the LU embeddings of
          `source_lang`
        - level: number of hops around the source frame that are admitted as
          candidates (passed to predict_frames_modification_2). With the
          default 0 the only candidate is the source frame itself, so
          pred_frame is effectively forced to equal source_frame.
    output:
        - {(source_sent_ID, target_sent_ID): set([(target_LU, source_frame, pred_frame)])}
          The same mapping is pickled under saved/results/modification_2/;
          the file name does not encode `level`.
    """
    import warnings

    embedding_lang = source_lang if reverse else target_lang
    sent_to_LU_embeddings = map_sent_to_LU_embeddings(
        embedding_lang,
        f'saved/embeddings/{embedding_lang}_sent_to_mbert_unigram.pkl')

    ID_to_frame = collections.defaultdict(set)
    skipped = 0  # aligned sentences whose processing was cut short by an error
    for source_ID, target in all_alignments(f'{source_lang}-{target_lang}')[reverse].items():
        if type(source_ID) is not int:
            continue
        try:
            # get source frame
            for anno in annotation(source_ID):
                source_frame = anno.frameName

                # for all the aligned sentences, given the LU embedding and the source frame,
                # predict frame (there can be many LUs so multiple frames)
                for target_ID in target:
                    for LU, LU_embedding in sent_to_LU_embeddings[target_ID]:
                        pred_frame = predict_frames_modification_2(LU_embedding,
                                                                    "saved/models/any_language_classifier_mBERT.joblib",
                                                                    frame_ID,
                                                                    G,
                                                                    source_frame,
                                                                    level=level)
                        ID_to_frame[(source_ID, target_ID)].add((LU, source_frame, pred_frame))
        except Exception:
            # Best effort: keep what was collected for this sentence and move on.
            # Presumably triggered by source frames the classifier does not
            # know -- TODO confirm. KeyboardInterrupt is no longer swallowed.
            skipped += 1

    if skipped:
        warnings.warn(f"frame_identification: {skipped} aligned sentence(s) were only partially processed due to errors")

    if not reverse:
        out_path = f'saved/results/modification_2/{source_lang}_{target_lang}_ID_to_frames.pkl'
    else:
        out_path = f'saved/results/modification_2/{target_lang}_{source_lang}_ID_to_frames.pkl'
    with open(out_path, 'wb') as results_file:
        pickle.dump(ID_to_frame, results_file)
    return ID_to_frame

In [38]:
# Run frame identification in both directions for each language pair; every
# call pickles its result under saved/results/modification_2/. Only the return
# value of the last call is displayed as the cell output.
frame_identification('en', 'pt', 0)
frame_identification('en', 'pt', 1)
frame_identification('en', 'de', 0)
frame_identification('en', 'de', 1)

defaultdict(set,
            {(1277, 1010): {('great', 'Desirability', 'Desirability')},
             (1282, 1016): {('future', 'Awareness', 'Awareness'),
              ('happen', 'Awareness', 'Awareness'),
              ('have', 'Awareness', 'Awareness'),
              ('idea', 'Awareness', 'Awareness'),
              ('in terms', 'Awareness', 'Awareness'),
              ('no', 'Awareness', 'Awareness'),
              ('place', 'Awareness', 'Awareness'),
              ('put', 'Awareness', 'Awareness'),
              ('second', 'Awareness', 'Awareness'),
              ('what', 'Awareness', 'Awareness'),
              ('where', 'Awareness', 'Awareness')},
             (1283, 1017): {('No', 'Awareness', 'Awareness'),
              ('No', 'Process_end', 'Process_end'),
              ('idea', 'Awareness', 'Awareness'),
              ('idea', 'Process_end', 'Process_end'),
              ('may', 'Awareness', 'Awareness'),
              ('may', 'Process_end', 'Process_end'),
              ('p

In [None]:
# # using predicted LUs in target language
# pickle.load(open('/home/jovyan/work/saved/results/modification_1/en_de_bigram_ID_to_LUs.pkl', 'rb'))