In [None]:
import collections
import numpy as np
from scipy.spatial import distance

from globalfn.alignments import all_alignments, aligned_with
from globalfn.annotations import annotation, all_annotations

import flair
from flair.data import Sentence

### Step 1: Assume that LUs can be directly projected

Output: a dictionary of projected LUs and their respective embeddings

In [None]:
# Load the multilingual BERT word-embedding model through flair. The model
# weights for 'bert-base-multilingual-cased' are downloaded when not cached.
from flair.embeddings import TransformerWordEmbeddings
mbert = TransformerWordEmbeddings('bert-base-multilingual-cased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=714314041.0, style=ProgressStyle(descri…




In [None]:
def generate_embeddings(lang):
    """
    Embed every annotated sentence of `lang` with mBERT and collect the
    embeddings of the tokens that are marked as lexical units (LUs).

    Inputs:
    # lang - language code passed to `all_annotations` (e.g. 'en', 'pt')

    Outputs:
    # ID_to_embeddings - {sentence ID: mBERT word embeddings}
    # ID_to_LU_embeddings - {sentence ID: [(word_idx, LU, embedding, frame)]}
    """
    ID_to_embeddings = {}
    ID_to_LU_embeddings = collections.defaultdict(list)
    for sent_ID, annos in all_annotations(lang).items():
        # All annotations of one sentence share its text, so embed it once
        # using the first annotation.
        tokenized_text = Sentence(annos[0].tokenized_text)
        mbert.embed(tokenized_text)
        embeddings = [token.embedding.cpu().numpy() for token in tokenized_text]
        ID_to_embeddings[sent_ID] = embeddings
        for anno in annos:
            for i, lu in enumerate(anno.tokenized_lu_idx):
                # "-" marks a position that is not an LU. Compare by value:
                # `is not` tests object identity, which only works by accident
                # of string interning.
                if lu != "-":
                    ID_to_LU_embeddings[sent_ID].append((i, lu, embeddings[i], anno.frameName))

    return ID_to_embeddings, ID_to_LU_embeddings

# Embed the annotated sentences of both corpora ('en' and 'pt').
en_ID_to_embeddings, en_ID_to_LU_embeddings = generate_embeddings('en')
pt_ID_to_embeddings, pt_ID_to_LU_embeddings = generate_embeddings('pt')

In [None]:
def find_projected_embeddings(src_ID_to_LU_embeddings, tgt_ID_to_embeddings, tgt_lang):
    """
    Project every source-side LU onto the closest word (by cosine distance
    between embeddings) of each target sentence aligned with its sentence.

    Inputs:
    # src_ID_to_LU_embeddings - {sentence ID: [(word_idx, LU, embedding, frame)]}
    # tgt_ID_to_embeddings - {sentence ID: word embeddings of the sentence}
    # tgt_lang - language code passed to `aligned_with`

    Outputs:
    # ID_to_projected_LU_embeddings - {sentence ID: [(word_idx, projected word, embedding)]}
    """
    ID_to_projected_LU_embeddings = collections.defaultdict(list)
    # Tokens of each target sentence, looked up once instead of once per LU.
    tgt_ID_to_tokens = {}
    for src_ID, src_LU_entries in src_ID_to_LU_embeddings.items():
        _, tgt_IDs = aligned_with(src_ID, tgt_lang)
        for _, _, src_LU_embedding, _ in src_LU_entries:
            for tgt_ID in tgt_IDs:
                if tgt_ID not in tgt_ID_to_embeddings:
                    continue

                # word embeddings for the word tokens in the target sentence
                tgt_word_embeddings = tgt_ID_to_embeddings[tgt_ID]
                if len(tgt_word_embeddings) == 0:
                    # cdist/argmin cannot handle a sentence without tokens
                    continue

                # find the closest word with respect to cosine distance
                distances = distance.cdist([src_LU_embedding], tgt_word_embeddings, "cosine")[0]
                min_index = np.argmin(distances)

                if tgt_ID not in tgt_ID_to_tokens:
                    tgt_ID_to_tokens[tgt_ID] = annotation(tgt_ID)[0].tokenized_text.split(' ')
                # NOTE(review): assumes splitting on ' ' yields the same tokens
                # the embedder produced for this sentence - confirm.
                projected_word = tgt_ID_to_tokens[tgt_ID][min_index]
                ID_to_projected_LU_embeddings[tgt_ID].append(
                    (min_index, projected_word, tgt_word_embeddings[min_index]))
    return ID_to_projected_LU_embeddings

# Project in both directions: 'en' LUs onto 'pt' sentences, and 'pt' LUs onto 'en' sentences.
pt_ID_to_projected_LU_embeddings = find_projected_embeddings(en_ID_to_LU_embeddings, pt_ID_to_embeddings, 'pt')
en_ID_to_projected_LU_embeddings = find_projected_embeddings(pt_ID_to_LU_embeddings, en_ID_to_embeddings, 'en')

In [None]:
# Show the sentence IDs that have projected LUs, for each direction.
print(pt_ID_to_projected_LU_embeddings.keys())
print(en_ID_to_projected_LU_embeddings.keys())

dict_keys([739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 831, 832, 833, 834, 835, 837, 839, 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 862, 863, 864, 865, 866, 867, 868, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 883, 885, 886, 887, 888, 889, 890, 891, 892, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, 949,

### Step 2: SDEC-AD Semantic Frame Induction

In [None]:
import pickle
from SDEC.models.SDEC_AD import DeepEmbeddingClustering
import nltk
nltk.download('stopwords')
nltk.download('framenet_v17')
from nltk.corpus import framenet as fn
from nltk.corpus import stopwords

fn_lu_embedding_filename = "lus_fn1.7_definition_bert.p"
# NOTE: unpickling can execute arbitrary code - only load this file from a trusted source.
# The context manager closes the file handle once the pickle is read.
with open(fn_lu_embedding_filename, 'rb') as fn_lu_embedding_file:
    fn_L = pickle.load(fn_lu_embedding_file)

# Two-way mapping between frame names and consecutive integer indices,
# numbered in order of first appearance among the LUs in fn_L.
frames_to_int = {}
int_to_frames = {}
for lu_id in fn_L.keys():
    frame_name = fn.lu(lu_id).frame.name
    if frame_name not in frames_to_int:
        frame_index = len(frames_to_int)
        int_to_frames[frame_index] = frame_name
        frames_to_int[frame_name] = frame_index

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package framenet_v17 to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/framenet_v17.zip.


In [None]:
def SDEC_frame_induction(ID_to_projected_LU_embeddings, ID_to_LU_embeddings, SDEC_trained_weights=None):
    """
    Use SDEC to induce semantic frames from the embeddings of the projected lexical units.

    Inputs:
    # - ID_to_projected_LU_embeddings: {sentence ID: [(word_idx, projected word, embedding)]}
    # - ID_to_LU_embeddings: {sentence ID: [(word_idx, LU, embedding, frame)]}
    # - SDEC_trained_weights: forwarded to DeepEmbeddingClustering.predict

    Outputs:
    # - sent_ID_SDEC_induced_frames: a dictionary of sentence IDs to the set of predicted semantic frames
    # - sent_ID_annotated_frames: a dictionary of sentence IDs to the set of annotated semantic frames
    """
    X = []
    ids = []
    # Visit only the IDs that exist, in ascending order. Indexing every integer
    # between min and max would insert empty entries into a defaultdict (or
    # raise KeyError on a plain dict) and fail on an empty input.
    for sent_ID in sorted(ID_to_projected_LU_embeddings):
        for _, _, embedding in ID_to_projected_LU_embeddings[sent_ID]:
            X.append(embedding)
            ids.append(sent_ID)

    sent_ID_SDEC_induced_frames = collections.defaultdict(set)
    if X:  # nothing to cluster when no LU was projected
        X = np.array(X)
        c = DeepEmbeddingClustering(n_clusters=len(int_to_frames),
                                    input_dim=3072,
                                    encoders_dims=[7500, 1000])
        pred_Y = c.predict(X, SDEC_trained_weights=SDEC_trained_weights)
        # pred_Y[i] is the predicted cluster index of X[i]; map it to a frame name.
        for sent_ID, cluster in zip(ids, pred_Y):
            sent_ID_SDEC_induced_frames[sent_ID].add(int_to_frames[cluster])

    sent_ID_annotated_frames = collections.defaultdict(set)
    for sent_ID, lu_entries in ID_to_LU_embeddings.items():
        for _, _, _, frame in lu_entries:
            if frame not in frames_to_int:
                # Report annotated frames that are absent from frames_to_int.
                print(frame)
            sent_ID_annotated_frames[sent_ID].add(frame)
    return sent_ID_SDEC_induced_frames, sent_ID_annotated_frames

In [None]:
"""semantic frame induction for PT (whose LUs are projected from EN)"""
# The last argument is the path of the SDEC weights file handed to the model's predict call.
pt_pred, pt_true = SDEC_frame_induction(pt_ID_to_projected_LU_embeddings, pt_ID_to_LU_embeddings, "/home/jovyan/work/SDEC-trained/SDEC_AD_bcubed_fscore_0.61633.h5")

Obligation_scenario
Physical_entity
Asymmetric_reciprocality
Asymmetric_reciprocality
Asymmetric_reciprocality


In [None]:
"""semantic frame induction for EN (whose LUs are projected from PT)"""
# The last argument is the path of the SDEC weights file handed to the model's predict call.
en_pred, en_true = SDEC_frame_induction(en_ID_to_projected_LU_embeddings, en_ID_to_LU_embeddings, "/home/jovyan/work/SDEC-trained/SDEC_AD_bcubed_fscore_0.61633.h5")

### Step 3: Evaluation of Semantic Frame Induction

In [None]:
# Names of all frames known to the FrameNet corpus reader.
frames = {fn_frame.name for fn_frame in fn.frames()}

In [None]:
def evaluate(pred, true, frames=None, show_individual=False, print_result=True):
    """
    Evaluate the precision, recall, and F1-scores of the predicted frames (`pred`)
    and the actual frames (`true`), averaged over the sentences in `true`.

    Inputs:
    # pred - {sentence ID: collection of predicted frames}; IDs missing from
    #        `pred` count as "no frames predicted" and `pred` is never modified
    # true - {sentence ID: collection of annotated frames}
    # frames - unused; accepted only for backward compatibility
    # show_individual - print the scores of every sentence
    # print_result - print the averaged scores

    Outputs:
    # the average F1 over all sentences where both precision and recall are
    # defined (NaN if there is no such sentence)
    """
    def _mean(total, count):
        # An average over zero sentences is undefined; report NaN instead of raising.
        return total / count if count else float("nan")

    sum_prec = count_prec = 0
    sum_recall = count_recall = 0
    sum_f1 = count_f1 = 0

    for ID, true_frames in true.items():
        # .get avoids inserting missing IDs into a defaultdict and avoids
        # a KeyError on a plain dict.
        pred_frames = pred.get(ID, ())

        tp = sum(1 for frame in true_frames if frame in pred_frames)
        false_neg = len(true_frames) - tp
        fp = sum(1 for frame in pred_frames if frame not in true_frames)

        if tp + fp == 0:
            prec = "n/a"  # nothing was predicted for this sentence
        else:
            prec = tp / (tp + fp)
            sum_prec += prec
            count_prec += 1

        if tp + false_neg == 0:
            recall = "n/a"  # there were no annotated frames for this sentence
        else:
            recall = tp / (tp + false_neg)
            sum_recall += recall
            count_recall += 1

        if prec == "n/a" or recall == "n/a":
            f1 = "n/a"
        else:
            if prec == 0 and recall == 0:
                f1 = 0
            else:
                f1 = 2*(prec*recall)/(prec + recall)
            # A zero F1 is a valid score and must count towards the average;
            # leaving it out inflates the reported F1.
            sum_f1 += f1
            count_f1 += 1

        if show_individual:
            print(f"Precision:{prec}\tRecall:{recall}\tF1:{f1}")

    avg_prec = _mean(sum_prec, count_prec)
    avg_recall = _mean(sum_recall, count_recall)
    avg_f1 = _mean(sum_f1, count_f1)

    if print_result:
        print("-------------------")
        print(f"Avg Precision: {avg_prec:.3f}\nAvg Recall: {avg_recall:.3f}\nF1: {avg_f1:.3f}")
        print("-------------------")
    return avg_f1

In [None]:
# Score the induced frames for PT against the annotated frames.
evaluate(pt_pred, pt_true)

-------------------
Avg Precision: 0.323171
Avg Recall: 0.218637
F1: 0.387683
-------------------


0.38768287999515677

In [None]:
# Score the induced frames for EN against the annotated frames.
evaluate(en_pred, en_true)

-------------------
Avg Precision: 0.271881
Avg Recall: 0.355575
F1: 0.408168
-------------------


0.4081684919460366