In [None]:
!pip install dill
!pip install nltk
!pip install flair

^C
[31mERROR: Operation cancelled by user[0m


In [None]:
# Imports, plus the download of the NLTK `framenet_v17` package.
import dill
import pickle 
import collections

from globalfn.annotations import all_annotations
from globalfn.annotations import annotation
from flair.data import Sentence

import nltk
nltk.download('framenet_v17')
from nltk.corpus import framenet as fn

[nltk_data] Downloading package framenet_v17 to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package framenet_v17 is already up-to-date!


In [None]:
# Basic parser output format: {ID: {(word, lemma, start_pos, pos): {framenet_lus.ID}}}

In [None]:
def get_len_LU_tokens(anno):
    """
    Return the largest number of tokens covered by a single LU span of `anno`.

    `anno.text` is tokenized with flair's `Sentence`. A token is counted for a
    span of `anno.lu_idx` when it lies completely inside that span; both span
    boundaries are compared inclusively. Each entry of `anno.lu_idx` is
    unpacked as (start_pos, end_pos, <ignored>).

    Returns 0 when `anno.lu_idx` is empty or no token fits inside any span.
    """
    # Inclusive (first_char, last_char) character offsets of every token.
    token_spans = [
        (tok.start_pos, tok.start_pos + len(tok.text) - 1)
        for tok in Sentence(anno.text, use_tokenizer=True)
    ]

    # Number of fully contained tokens, one entry per LU span.
    tokens_per_span = [
        sum(
            1
            for tok_first, tok_last in token_spans
            if lu_first <= tok_first and tok_last <= lu_last
        )
        for lu_first, lu_last, _ in anno.lu_idx
    ]

    return max(tokens_per_span, default=0)


def get_basic_parser_pred_true(lang, results_dir="/home/jovyan/work/saved/results"):
    """
    Collect predicted and gold LU positions for the basic parser.

    lang: language code; selects both the pickled result file and the
        annotations returned by `all_annotations(lang)`.
    results_dir: directory that contains `{lang}_ID_to_LUs.pkl`. The default
        is the location this notebook has always read from.

    Returns a tuple (pred_id_start_pos, true_id_start_pos). Both are
    `defaultdict(list)` mapping ID -> [(start_pos, len_LU)]:
      * predictions always carry len_LU == 1;
      * gold entries carry the value of `get_len_LU_tokens(anno)` for the
        annotation they come from, one entry per span in `anno.lu_idx`.
    """
    path = f"{results_dir}/{lang}_ID_to_LUs.pkl"
    # NOTE: pickle.load can execute arbitrary code -- only read trusted files.
    # The context manager closes the file handle even if unpickling fails.
    with open(path, 'rb') as result_file:
        result = pickle.load(result_file)

    pred_id_start_pos = collections.defaultdict(list)
    for ID, res in result.items():
        # Each item of `res` is unpacked as a 4-tuple; only its third field
        # (the start position) is needed here.
        for _, _, start_pos, _ in res:
            pred_id_start_pos[ID].append((start_pos, 1))

    true_id_start_pos = collections.defaultdict(list)
    for ID, annos in all_annotations(lang).items():
        for anno in annos:
            max_len_LU_tokens = get_len_LU_tokens(anno)
            for start_pos, _, _ in anno.lu_idx:
                true_id_start_pos[ID].append((start_pos, max_len_LU_tokens))

    return pred_id_start_pos, true_id_start_pos


In [None]:
def eval_LU(pred, true, show_individual=False):
    """
    Print sentence-averaged precision, recall and F1 of predicted LUs.

    pred: {ID: [(LU_pos, len_LU)]} -- predicted LUs per sentence ID. An ID
        that is missing from `pred` is treated as having no predictions.
    true: {ID: [(LU_pos, len_LU)]} -- gold LUs per sentence ID. Only the IDs
        present in `true` are scored.
    show_individual: when True, additionally print the scores of every
        sentence.

    A prediction is correct only when its complete (LU_pos, len_LU) pair also
    occurs in the gold list of the same ID; duplicate pairs are collapsed
    before counting.

    Every metric is averaged over the sentences for which it is defined.
    Sentences whose F1 is 0 are part of the F1 average. An average without any
    contributing sentence is printed as "n/a" instead of raising
    ZeroDivisionError.

    Returns None; the results are only printed.
    """

    sum_prec = count_prec = 0
    sum_recall = count_recall = 0
    sum_f1 = count_f1 = 0

    for ID, true_items in true.items():
        true_LU_pos_and_len = set(true_items)
        # .get() keeps plain dicts working and avoids inserting keys into a
        # defaultdict as a side effect of the lookup.
        pred_LU_pos_and_len = set(pred.get(ID, ()))

        tp = len(pred_LU_pos_and_len & true_LU_pos_and_len)
        fp = len(pred_LU_pos_and_len - true_LU_pos_and_len)
        false_neg = len(true_LU_pos_and_len - pred_LU_pos_and_len)

        if tp + fp == 0:
            prec = "n/a"  # nothing was predicted for this sentence
        else:
            prec = tp / (tp + fp)
            sum_prec += prec
            count_prec += 1

        if tp + false_neg == 0:
            recall = "n/a"  # the gold data has no LU for this sentence
        else:
            recall = tp / (tp + false_neg)
            sum_recall += recall
            count_recall += 1

        if prec == "n/a" or recall == "n/a":
            f1 = "n/a"
        else:
            # The harmonic mean is undefined for 0/0; F1 is 0 in that case.
            f1 = 0 if prec + recall == 0 else 2*(prec*recall)/(prec + recall)
            # Zero scores must count too, otherwise the average is inflated.
            sum_f1 += f1
            count_f1 += 1

        if show_individual:
            print(f"Precision:{prec}\tRecall:{recall}\tF1:{f1}")

    def _average(total, count):
        # `3f` is a minimum width of 3 with the default six decimals.
        return f"{total/count:3f}" if count else "n/a"

    print("-------------------")
    print(
        f"Avg Precision: {_average(sum_prec, count_prec)}\n"
        f"Avg Recall: {_average(sum_recall, count_recall)}\n"
        f"F1: {_average(sum_f1, count_f1)}"
    )
    print("-------------------")

### Basic Parser

In [None]:
# lang = "pt"
# print(f"Language: {lang}")
# pred_id_start_pos, true_id_start_pos = get_basic_parser_pred_true(lang)
# eval_LU(pred_id_start_pos, true_id_start_pos)

# lang = "en"
# print(f"Language: {lang}")
# pred_id_start_pos, true_id_start_pos = get_basic_parser_pred_true(lang)
# eval_LU(pred_id_start_pos, true_id_start_pos)

# lang = "de"
# print(f"Language: {lang}")
# pred_id_start_pos, true_id_start_pos = get_basic_parser_pred_true(lang)
# eval_LU(pred_id_start_pos, true_id_start_pos)

In [None]:
def get_mod_1_pred_true(source_lang, target_lang, ngram_str,
                        results_dir="/home/jovyan/work/saved/results"):
    """
    Collect predicted and gold LU positions for modification 1.

    source_lang, target_lang, ngram_str: select the pickled result file
        `modification_1/{source_lang}_{target_lang}_{ngram_str}_ID_to_LUs.pkl`;
        `target_lang` also selects the annotations used as gold data.
    ngram_str: "unigram", "bigram" or "trigram" give a predicted LU length of
        1, 2 or 3; any other value gives a length of 0.
    results_dir: directory that contains the `modification_1` folder. The
        default is the location this notebook has always read from.

    Returns a tuple (pred_id_start_pos, true_id_start_pos). Both are
    `defaultdict(list)` mapping sentence ID -> [(start_pos, len_LU)].
    Predictions are keyed by the target sentence ID; gold entries carry the
    value of `get_len_LU_tokens(anno)`, one entry per span in `anno.lu_idx`.
    """
    # {(source_sent_ID, target_sent_ID, source_LU, source_embedding): set[(potential LU, embedding, idx), (...)]}
    path = f"{results_dir}/modification_1/{source_lang}_{target_lang}_{ngram_str}_ID_to_LUs.pkl"
    # NOTE: pickle.load can execute arbitrary code -- only read trusted files.
    # The context manager closes the file handle even if unpickling fails.
    with open(path, 'rb') as result_file:
        result = pickle.load(result_file)

    # Unknown names fall back to 0, matching the previous if/elif chain.
    ngram = {"unigram": 1, "bigram": 2, "trigram": 3}.get(ngram_str, 0)

    pred_id_start_pos = collections.defaultdict(list)
    for (_, target_sent_ID, _, _), candidates in result.items():
        for _, _, start_pos in candidates:
            pred_id_start_pos[target_sent_ID].append((start_pos, ngram))

    true_id_start_pos = collections.defaultdict(list)
    for ID, annos in all_annotations(target_lang).items():
        for anno in annos:
            max_len_LU_tokens = get_len_LU_tokens(anno)
            for start_pos, _, _ in anno.lu_idx:
                true_id_start_pos[ID].append((start_pos, max_len_LU_tokens))

    return pred_id_start_pos, true_id_start_pos

In [None]:
# print("de - unigram")
# pred_id_start_pos, true_id_start_pos = get_mod_1_pred_true("en", "de", "unigram")
# eval_LU(pred_id_start_pos, true_id_start_pos, show_individual=False)

# print("pt - unigram")
# pred_id_start_pos, true_id_start_pos = get_mod_1_pred_true("en", "pt", "unigram")
# eval_LU(pred_id_start_pos, true_id_start_pos, show_individual=False)

# print("pt - bigram")
# pred_id_start_pos, true_id_start_pos = get_mod_1_pred_true("en", "pt", "bigram")
# eval_LU(pred_id_start_pos, true_id_start_pos, show_individual=False)

# print("pt - trigram")
# pred_id_start_pos, true_id_start_pos = get_mod_1_pred_true("en", "pt", "trigram")
# eval_LU(pred_id_start_pos, true_id_start_pos, show_individual=False)

# Evaluate modification 1 with source "pt" and target "en", once per n-gram
# size; each run prints its label followed by the averaged scores.
for label, ngram_str in (("en - unigram", "unigram"), ("en - bigram", "bigram")):
    print(label)
    pred_id_start_pos, true_id_start_pos = get_mod_1_pred_true("pt", "en", ngram_str)
    eval_LU(pred_id_start_pos, true_id_start_pos, show_individual=False)

# print("en - trigram")
# pred_id_start_pos, true_id_start_pos = get_mod_1_pred_true("pt", "en", "trigram")
# eval_LU(pred_id_start_pos, true_id_start_pos, show_individual=False)  # error

en - unigram
-------------------
Avg Precision: 0.544060
Avg Recall: 0.717655
F1: 0.637062
-------------------
en - bigram
-------------------
Avg Precision: 0.001224
Avg Recall: 0.002789
F1: 0.209524
-------------------
