In [1]:
!pip install nltk

You should consider upgrading via the '/opt/venv/bin/python -m pip install --upgrade pip' command.


In [3]:
from globalfn.alignments import all_alignments
from globalfn.annotations import annotation
from globalfn.full_text import full_text

import nltk
# Fetch the "framenet_v17" package into the local nltk_data directory; when the
# package is already present NLTK only reports that it is up to date.
nltk.download("framenet_v17")
from nltk.corpus import framenet as fn

[nltk_data] Downloading package framenet_v17 to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package framenet_v17 is already up-to-date!


In [8]:
def compute_jaccard_similarity(set1, set2):
    """
    Compute Jaccard's similarity between two sets of semantic frames.

    The similarity is |set1 & set2| / |set1 | set2|.

    Args:
        set1: first set of frame names.
        set2: second set of frame names.

    Returns:
        A float in [0, 1]. Two empty sets are treated as identical and yield
        1.0 (the ratio itself would be 0/0).
    """
    intersection = len(set1.intersection(set2))
    # Inclusion-exclusion: |A ∪ B| = |A| + |B| - |A ∩ B|.
    union = (len(set1) + len(set2)) - intersection
    if union == 0:
        # Both sets are empty; avoid ZeroDivisionError.
        return 1.0
    return intersection / union

def _collect_frame_names(sentence_ids):
    """Return the set of frame names annotated on any of the given sentence ids."""
    frame_names = set()
    for sentence_id in sentence_ids:
        # Look each sentence up exactly once and reuse the result for both the
        # emptiness check and the iteration.
        annos = annotation(sentence_id)
        if annos:
            frame_names.update(anno.frameName for anno in annos)
    return frame_names


def eval_exact_projection(src_lang, tgt_lang):
    """
    Evaluate how many frames can be directly transferred across aligned sentences.

    For every alignment returned by ``all_alignments(f"{src_lang}-{tgt_lang}")``
    the frame names of the source side and of the target side are collected
    into two sets. Pairs where both sets are empty are ignored. Over the
    remaining pairs the function prints:

    * the fraction of pairs whose frame sets are exactly equal,
    * the fraction of pairs whose frame sets differ in size,
    * the mean Jaccard similarity of the two frame sets.

    Args:
        src_lang: source language code, used as the left part of the alignment key.
        tgt_lang: target language code, used as the right part of the alignment key.

    Returns:
        None. Results are printed. If no pair has any frame annotation, a
        short notice is printed instead of the metrics.
    """
    # exact same semantic frame annotations
    count_exact = 0
    count_total = 0

    # jaccard similarity
    total_jac_sim = 0

    # count len_mismatch
    count_len_mismatch = 0

    for src_ids, tgt_ids in all_alignments(f"{src_lang}-{tgt_lang}").items():
        # The source side may be a single sentence id or a group of ids.
        # NOTE(review): keys of a plain dict cannot be lists, so tuples are
        # accepted as well — confirm the key type produced by all_alignments.
        if isinstance(src_ids, int):
            src_ids = (src_ids,)
        elif not isinstance(src_ids, (list, tuple)):
            src_ids = ()

        src_frames = _collect_frame_names(src_ids)
        tgt_frames = _collect_frame_names(tgt_ids)

        # evaluation: skip pairs with no frame annotation on either side
        if not (src_frames or tgt_frames):
            continue

        count_total += 1
        if src_frames == tgt_frames:
            count_exact += 1
        total_jac_sim += compute_jaccard_similarity(src_frames, tgt_frames)

        if len(src_frames) != len(tgt_frames):
            count_len_mismatch += 1

    if count_total == 0:
        # Nothing to average over; avoid ZeroDivisionError in the ratios below.
        print(f"No annotated sentence pairs found for {src_lang}-{tgt_lang}.")
        return

    print("Exact frames:", count_exact/count_total)
    print("Count Mismatched in Annotations Length:", count_len_mismatch/count_total)
    print("Jaccard's Similarity:", total_jac_sim/count_total)

In [5]:
# Print the projection metrics for the "en-pt" alignment.
eval_exact_projection("en", "pt")

1008 (language en) is not annotated.
737 (language pt) is not annotated.
1009 (language en) is not annotated.
738 (language pt) is not annotated.
1020 (language en) is not annotated.
1048 (language en) is not annotated.
1049 (language en) is not annotated.
1097 (language en) is not annotated.
1103 (language en) is not annotated.
1105 (language en) is not annotated.
861 (language pt) is not annotated.
1138 (language en) is not annotated.
1151 (language en) is not annotated.
882 (language pt) is not annotated.
1153 (language en) is not annotated.
1161 (language en) is not annotated.
1176 (language en) is not annotated.
1221 (language en) is not annotated.
1232 (language en) is not annotated.
966 (language pt) is not annotated.
Exact frames: 0.038461538461538464
Count Mismatched in Annotations Length: 0.8615384615384616
Jaccard's Similarity: 0.2834922744590226


In [9]:
# Print the projection metrics for the "en-de" alignment.
eval_exact_projection("en", "de")

1008 (language en) is not annotated.
1009 (language en) is not annotated.
1020 (language en) is not annotated.
1048 (language en) is not annotated.
1049 (language en) is not annotated.
1312 (language de) is not annotated.
1067 (language en) is not annotated.
1338 (language de) is not annotated.
1339 (language de) is not annotated.
1340 (language de) is not annotated.
1341 (language de) is not annotated.
1342 (language de) is not annotated.
1343 (language de) is not annotated.
1344 (language de) is not annotated.
1345 (language de) is not annotated.
1346 (language de) is not annotated.
1347 (language de) is not annotated.
1348 (language de) is not annotated.
1349 (language de) is not annotated.
1350 (language de) is not annotated.
1351 (language de) is not annotated.
1352 (language de) is not annotated.
1353 (language de) is not annotated.
1354 (language de) is not annotated.
1354 (language de) is not annotated.
1355 (language de) is not annotated.
1356 (language de) is not annotated.
1