In [1]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.5.zip (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 3.4 MB/s eta 0:00:01
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25ldone
[?25h  Created wheel for nltk: filename=nltk-3.5-py3-none-any.whl size=1434674 sha256=6d484156172974daf1f76e6f23fa6a29ba62a0116f5cf2d3b5277fc5eca19bf4
  Stored in directory: /home/jovyan/.cache/pip/wheels/45/6c/46/a1865e7ba706b3817f5d1b2ff7ce8996aabdd0d03d47ba0266
Successfully built nltk
Installing collected packages: nltk
Successfully installed nltk-3.5
You should consider upgrading via the '/opt/venv/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
# Insert code here.
import collections
import numpy as np
import fnmatch
import os

import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
from pyattention.attention import attend
from tqdm import tqdm

from flair.data import Sentence
from flair.embeddings import TransformerWordEmbeddings

import nltk
nltk.download("framenet_v17")
from nltk.corpus import framenet as fn

[nltk_data] Downloading package framenet_v17 to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/framenet_v17.zip.


In [None]:
# Load mBERT embeddings (last layer only)
# `mbert` is a notebook-level name: the data-generation cell below calls
# `mbert.embed(sentence)` on flair `Sentence` objects.
mbert = TransformerWordEmbeddings('bert-base-multilingual-cased', layers='-1')

### 1. Train the Attention Layers with BFN 1.7

In [None]:
"""
Generate the training data for attention neural models for frame projection.
- lengths: store the sentence length for each sentence 
- frames: store the frame name for each sentence
- sentences: store word embeddings for each sentence 
- Y: store a sequence of labels marking the frame-evoking LU tokens for each sentence.

Use torch.save to save the results of embeddings because we would encounter MemoryError if we 
embedded every sentence from BFN 1.7
"""
# Four parallel lists, one entry per usable exemplar sentence:
# token count, frame name, stacked token embeddings, per-token 0/1 labels.
lengths, frames, sentences, Y = [], [], [], []

# Only the first 5000 LUs of BFN 1.7 are processed in this run.
for i, lu in enumerate(list(fn.lus())[:5000]):
    if i % 100 == 0:
        print(i)

    # LUs without exemplar sentences contribute nothing.
    if len(lu.exemplars) == 0:
        continue

    frame_name = lu.frame.name
    for exemplar in lu.exemplars:
        # Exemplars without a Target annotation cannot be labelled.
        if "Target" not in exemplar.keys():
            continue

        # Embed the word tokens of the exemplar text with mBERT.
        sentence = Sentence(exemplar.text)
        mbert.embed(sentence)

        frames.append(frame_name)
        sentences.append(torch.stack([token.embedding for token in sentence]))
        lengths.append(len(sentence))

        # A token is labelled 1 when its (start_pos, end_pos) pair is listed
        # in the exemplar's Target, otherwise 0.
        Y.append(torch.tensor([
            1 if (token.start_pos, token.end_pos) in exemplar.Target else 0
            for token in sentence
        ]))

# Uncomment to persist this shard to disk:
# torch.save(lengths, "lengths0.pt")
# torch.save(frames, "frames0.pt")
# torch.save(sentences, "sentences0.pt")
# torch.save(Y, "Y0.pt")

In [None]:
"""
Map each semantic frame to an integer.
"""

# The mapping is cached on disk: load it when the file exists, otherwise build
# it from FrameNet and save it, so this cell also works in a fresh environment.
if os.path.exists("frames_to_idxs.pt"):
    frames_to_idxs = torch.load("frames_to_idxs.pt")
else:
    frames_to_idxs = {}
    for fn_frame in fn.frames():
        # A name gets the next free index the first time it is seen;
        # a repeated name keeps its original index.
        frames_to_idxs.setdefault(fn_frame.name, len(frames_to_idxs))
    torch.save(frames_to_idxs, "frames_to_idxs.pt")

In [3]:
"""
Load the saved training data.
"""
# Each of the four parallel structures lives in its own file; load them in the
# same order as before (lengths, frames, sentences, Y).
lengths, frames, sentences, Y = (
    torch.load(path)
    for path in ("lengths2.pt", "frames2.pt", "sentences2.pt", "Y2.pt")
)

In [None]:
"""
Pad the sentences with 0s to ensure that all the sentences have the same length.
"""
# Pad (or truncate) every sentence and label sequence to MAX_LEN along the
# first dimension so that they can be stacked into single tensors.
MAX_LEN = 64


def _pad_first_dim(t, max_len):
    """Truncate `t` to `max_len` entries along dim 0, then zero-pad it up to `max_len`."""
    t = t[:max_len]
    padding = t.new_zeros((max_len - t.size(0),) + tuple(t.shape[1:]))
    return torch.cat([t, padding], 0)


sentences = torch.stack([_pad_first_dim(sent, MAX_LEN) for sent in sentences], 0)
print(sentences.shape)

Y = torch.stack([_pad_first_dim(y, MAX_LEN) for y in Y], 0)
# num_classes is pinned to 2 (as FrameProjectionDataset does) so the last
# dimension does not shrink to 1 when a shard happens to contain no positive label.
one_hot_Y = nn.functional.one_hot(Y, num_classes=2)
print(one_hot_Y.shape)

In [None]:
# Write every sample of the currently loaded shard to its own file.
# `idx` is the global file number of the first sample in this shard
# (earlier runs started from 0, 74275, 147215).
idx = 147215
for i, length in enumerate(lengths):
    if i % 100 == 0:
        print(i)
    frame, sentence, y = frames[i], sentences[i], Y[i]
    sample = (length, frame, sentence, y)
    torch.save(sample, f"bfn_data/{idx}.pt")
    idx += 1
# The printed value is the starting index to use for the next shard.
print("New idx:", idx)

In [None]:
class FrameProjectionDataset(Dataset):
    """Frame Projection custom PyTorch dataset.

    Every sample lives in its own ``<idx>.pt`` file inside ``root_dir`` and
    holds a ``(length, frame, sentence, y)`` tuple written with ``torch.save``.
    """

    def __init__(self, root_dir, frames_to_idxs, MAX_LEN=64):
        """
        Args:
            root_dir (string): Directory containing the ``<idx>.pt`` sample files.
            frames_to_idxs (dict): Maps frame name to an integer
            MAX_LEN (int): Length every sentence / label sequence is
                truncated or zero-padded to.
        """
        self.root_dir = root_dir
        self.frames_to_idxs = frames_to_idxs
        self.MAX_LEN = MAX_LEN

    def __len__(self):
        """Return the number of ``*.pt`` files currently present in ``root_dir``."""
        return len(fnmatch.filter(os.listdir(self.root_dir), '*.pt'))

    def __getitem__(self, idx):
        """Load sample ``idx`` and return it as a dict of padded fields.

        Returns:
            dict with keys ``length``, ``frame`` (integer frame index),
            ``sentence`` (padded embeddings), ``y`` (padded labels) and
            ``y_one_hot`` (one-hot labels with 2 classes).

        Raises:
            KeyError: if the stored frame name is not in ``frames_to_idxs``.
        """
        # Read from root_dir -- the same directory __len__ counts -- instead of
        # a hard-coded path relative to the current working directory.
        length, frame, sent, y = torch.load(os.path.join(self.root_dir, f"{idx}.pt"))

        # Pad the sentences with 0s to ensure that all the sentences have the same length.
        sent = sent[:self.MAX_LEN]
        sent = torch.cat([sent, sent.new_zeros((self.MAX_LEN - sent.size(0), sent.size(1)))], 0)
        y = y[:self.MAX_LEN]
        y = torch.cat([y, y.new_zeros((self.MAX_LEN - y.size(0),))], 0)

        # create one-hot label for each word token
        y_one_hot = nn.functional.one_hot(y, num_classes=2)

        # get the index for the semantic frame
        frame = self.frames_to_idxs[frame]
        sample = {'length': length, 'frame': frame, 'sentence': sent, 'y': y, 'y_one_hot': y_one_hot}
        return sample

In [None]:
class Attention(nn.Module):
    """Frame-conditioned attention model producing two logits per position."""

    def __init__(self, len_frames, hidden_size, MAX_LEN=64):
        """
        Args:
            len_frames: Number of rows in the frame embedding table (total number of frames).
            hidden_size: Size of each frame embedding; also the input size of the output layer.
            MAX_LEN: Number of positions the frame embedding is repeated over in `forward`.
        """
        super().__init__()
        self.MAX_LEN = MAX_LEN
        self.frame_embed = nn.Embedding(len_frames, hidden_size)
        self.linear = nn.Linear(hidden_size, 2)

    def forward(self, frame_idxs, sentences, lengths):
        """
        Embed the frame indices, repeat each embedding over MAX_LEN positions,
        pass the result with `sentences` through `attend`, and project to 2 logits.

        Output: Logits for binary sequence labels.
        """
        # (bz, hidden) -> (bz, 1, hidden) -> (bz, MAX_LEN, hidden)
        query = self.frame_embed(frame_idxs).unsqueeze(1).expand(-1, self.MAX_LEN, -1)
        # NOTE(review): every position carries the same frame vector as its query;
        # if `attend` derives each output row only from its query row and the
        # context, all positions would receive identical logits -- confirm.
        attended = attend(query, sentences, context_sizes=lengths)
        return self.linear(attended)

In [5]:
"""
Load the training dataset on FrameProjectionDataset and DataLoader.
"""
dataset = FrameProjectionDataset("/home/jovyan/work/bfn_data", frames_to_idxs)
# shuffle=True: the sample files are numbered in the order the exemplars were
# collected (LU by LU), so unshuffled batches would be dominated by a few frames.
dataloader = DataLoader(dataset, batch_size=256, shuffle=True)

# Sanity-check a single batch rather than iterating over the whole dataset,
# which would read every sample file without using the result.
sample_batched = next(iter(dataloader))
sentences = sample_batched['sentence']
frames_idxs = sample_batched['frame']
lengths = sample_batched['length']
one_hot_Y = sample_batched['y_one_hot']
print(sentences.shape, one_hot_Y.shape)

In [None]:
"""
Train the attention neural model with BFN 1.7 data
"""
model = Attention(len(frames_to_idxs), 768).to('cuda')
# reduction='none' keeps one loss value per element so padded positions can be masked out below.
criterion = torch.nn.BCEWithLogitsLoss(reduction='none')
optimizer = Adam(model.parameters(), lr=1e-4)
epoches = 20

# torch.save does not create directories; make sure the checkpoint folder exists.
os.makedirs("saved", exist_ok=True)

min_loss = float('inf')
for e in range(epoches):
    total_train_loss = 0.
    for i_batch, sample_batched in enumerate(tqdm(dataloader)):
        sentences = sample_batched['sentence']
        frames_idxs = sample_batched['frame']
        lengths = sample_batched['length']
        one_hot_Y = sample_batched['y_one_hot']
        optimizer.zero_grad()
        output = model(frames_idxs.long().to('cuda'), sentences.to('cuda'), lengths)
        losses = criterion(output.float(), one_hot_Y.to('cuda').float())

        # Sum the loss over the real (non-padded) positions only: position p of
        # sample b counts when p < lengths[b]. Boolean indexing replaces the
        # former per-sample Python loop and selects exactly the same elements.
        positions = torch.arange(losses.size(1), device=losses.device)
        token_mask = positions.unsqueeze(0) < torch.as_tensor(lengths, device=losses.device).unsqueeze(1)
        train_loss = losses[token_mask].sum()

        train_loss.backward()
        optimizer.step()
        total_train_loss += train_loss.item()

    # Keep only the checkpoint of the epoch with the lowest summed training loss so far.
    if total_train_loss < min_loss:
        min_loss = total_train_loss
        torch.save({
            'epoch': e,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': min_loss,
        }, "saved/model.pt")

    # Reported value: summed loss of the epoch divided by the number of samples.
    print(e, total_train_loss/len(dataset))

In [None]:
##### DEBUGGING
# dataset = FrameProjectionDataset("/home/jovyan/work/bfn_data", frames_to_idxs)
# print(dataset[242]['sentence'].shape, dataset[242]['y'].shape, dataset[242]['y_one_hot'].shape)
# print(dataset[243]['sentence'].shape, dataset[243]['y'].shape, dataset[243]['y_one_hot'].shape)

### 2. Frame Projection on GlobalFN

In [None]:
from globalfn.annotations import all_annotations, annotation
from globalfn.alignments import aligned_with

In [None]:
# Load M-BERT embedding model
# Same model name and `layers` setting as the loader in section 1;
# `predict_on_globalfn` below reads this object through the global name `mbert`.
mbert = TransformerWordEmbeddings('bert-base-multilingual-cased', layers='-1')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=714314041.0, style=ProgressStyle(descri…




In [None]:
"""
Load pre-trained model
"""
# Frame-name -> index mapping; its size fixes the size of the embedding table.
frames_to_idxs = torch.load("frames_to_idxs.pt")

# Read the checkpoint, rebuild the network with the same sizes used for
# training (768-dimensional frame embeddings), and restore its weights.
checkpoint = torch.load("saved/model.pt")
model = Attention(len(frames_to_idxs), 768)
model = model.to('cuda')
model.load_state_dict(checkpoint["model_state_dict"])

# Last expression of the cell: switch to evaluation mode and display the module.
model.eval()

Attention(
  (frame_embed): Embedding(1221, 768)
  (linear): Linear(in_features=768, out_features=2, bias=True)
)

In [None]:

def predict_on_globalfn(src_lang, tgt_lang, model, MAX_LEN=64):
    """
    Given the source language and target language, apply the pre-trained model
    on the aligned sentences: every frame annotated on a source sentence is
    projected onto each target sentence aligned with it, and the model labels
    the target tokens for that frame.

    Relies on the notebook-level `mbert` embedder and `frames_to_idxs` mapping.

    Args:
        src_lang: language whose annotations supply the frames (e.g. 'en').
        tgt_lang: language of the aligned sentences to label (e.g. 'pt').
        model: trained model, called as model(frame_idxs, sentences, lengths).
        MAX_LEN: length the target sentence embeddings are truncated or
            zero-padded to (previously ignored because it was overwritten with 64).

    Returns:
        {ID: {frame: sequence_labels}} -- for every aligned target ID, a mapping
        from projected frame name to a numpy array holding the argmax label of
        each target token. Targets without annotation map to an empty dict.
        The same dict is also saved to
        "saved/projected_result_{src_lang}-{tgt_lang}.pt".

    Raises:
        KeyError: if a source frame name is missing from `frames_to_idxs`.
    """
    # Run on whatever device the model lives on instead of assuming 'cuda'.
    device = next(model.parameters()).device

    res = {}
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for src_ID, annos in all_annotations(src_lang).items():
            src_frames = [anno.frameName for anno in annos]

            tgt_IDs = aligned_with(src_ID, tgt_lang)[1]
            if tgt_IDs is None:
                continue

            for tgt_ID in tgt_IDs:
                # A target can be aligned with several source sentences; keep the
                # frames already projected onto it instead of resetting its entry.
                tgt_res = res.setdefault(tgt_ID, collections.defaultdict(dict))

                tgt_annos = annotation(tgt_ID)
                if tgt_annos is None:
                    continue

                tgt_sent = Sentence(tgt_annos[0].tokenized_text)
                mbert.embed(tgt_sent)

                # NOTE(review): the length comes from the annotation's tokenisation and
                # is assumed to match the number of flair tokens in tgt_sent -- confirm.
                length = len(tgt_annos[0].tokenized_lu_idx)
                sent = torch.stack([token.embedding for token in tgt_sent])[:MAX_LEN]
                sent = torch.cat([sent, sent.new_zeros((MAX_LEN - sent.size(0), sent.size(1)))], 0)
                sent = sent.unsqueeze(0).to(device)

                for projected_frame in src_frames:
                    projected_frame_idx = torch.tensor(
                        [frames_to_idxs[projected_frame]], dtype=torch.long, device=device
                    )
                    out = model(projected_frame_idx, sent, [length])
                    tgt_res[projected_frame] = torch.argmax(out[0, :length, :], dim=1).cpu().numpy()

    # torch.save does not create directories; make sure the output folder exists.
    os.makedirs("saved", exist_ok=True)
    torch.save(res, f"saved/projected_result_{src_lang}-{tgt_lang}.pt")
    return res

### 3. Predict frame-evoking Lexical Units given a semantic frame

In [None]:
# Source language 'en', target language 'pt'; the call also writes the result
# to saved/projected_result_en-pt.pt.
res = predict_on_globalfn('en', 'pt', model)
# Bare last expression: displays the whole result dict.
res

861 (language pt) is not annotated.


{739: defaultdict(dict, {'Desirability': array([0, 0, 0, 0, 0, 0, 0])}),
 740: defaultdict(dict,
             {'Motion': array([0, 0, 0, 0, 0, 0, 0]),
              'Entity': array([0, 0, 0, 0, 0, 0, 0]),
              'Ranked_expectation': array([0, 0, 0, 0, 0, 0, 0])}),
 741: defaultdict(dict, {'Departing': array([0, 0, 0, 0, 0, 0, 0])}),
 742: defaultdict(dict,
             {'Cardinal_numbers': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
              'Topic': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
              'Fluidic_motion': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
              'Discussion': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
              'Being_relevant': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
              'Desiring': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}),
 743: defaultdict(dict,
             {'Cardinal_numbers': array([0, 0, 0, 0, 0

In [None]:
# Source language 'pt', target language 'en'; the call also writes the result
# to saved/projected_result_pt-en.pt.
res = predict_on_globalfn('pt', 'en', model)
# Bare last expression: displays the whole result dict.
res

1020 (language en) is not annotated.
1048 (language en) is not annotated.
1049 (language en) is not annotated.
1067 (language en) is not annotated.
1067 (language en) is not annotated.
1097 (language en) is not annotated.
1103 (language en) is not annotated.
1105 (language en) is not annotated.
1138 (language en) is not annotated.
1153 (language en) is not annotated.
1161 (language en) is not annotated.
1176 (language en) is not annotated.
1176 (language en) is not annotated.
1221 (language en) is not annotated.


{1010: defaultdict(dict, {'Desirability': array([0, 0, 0, 0, 0, 0, 0, 0, 0])}),
 1011: defaultdict(dict,
             {'Emotion_directed': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
              'Entity': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
              'Ranked_expectation': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}),
 1012: defaultdict(dict,
             {'Departing': array([0, 0, 0, 0, 0, 0, 0]),
              'Concessive': array([0, 0, 0, 0, 0, 0, 0])}),
 1013: defaultdict(dict,
             {'Existence': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
              'Cardinal_numbers': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
              'Topic': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
              'Temporal_collocation': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
              'Discussion': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
              'Being

In [None]:
# Source language 'en', target language 'de'; the call also writes the result
# to saved/projected_result_en-de.pt.
res = predict_on_globalfn('en', 'de', model)
# Bare last expression: displays the whole result dict.
res

1338 (language de) is not annotated.
1339 (language de) is not annotated.
1340 (language de) is not annotated.
1341 (language de) is not annotated.
1342 (language de) is not annotated.
1343 (language de) is not annotated.
1344 (language de) is not annotated.
1345 (language de) is not annotated.
1346 (language de) is not annotated.
1347 (language de) is not annotated.
1348 (language de) is not annotated.
1349 (language de) is not annotated.
1350 (language de) is not annotated.
1351 (language de) is not annotated.
1352 (language de) is not annotated.
1353 (language de) is not annotated.
1354 (language de) is not annotated.
1354 (language de) is not annotated.
1355 (language de) is not annotated.
1356 (language de) is not annotated.
1358 (language de) is not annotated.
1359 (language de) is not annotated.
1360 (language de) is not annotated.
1361 (language de) is not annotated.
1362 (language de) is not annotated.


{1277: defaultdict(dict, {'Desirability': array([0, 0, 0, 0, 0, 0, 0])}),
 1278: defaultdict(dict,
             {'Motion': array([0, 0, 0, 0, 0, 0, 0, 0]),
              'Entity': array([0, 0, 0, 0, 0, 0, 0, 0]),
              'Ranked_expectation': array([0, 0, 0, 0, 0, 0, 0, 0])}),
 1279: defaultdict(dict, {'Departing': array([0, 0, 0, 0, 0])}),
 1280: defaultdict(dict,
             {'Cardinal_numbers': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                     0]),
              'Topic': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                     0]),
              'Fluidic_motion': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                     0]),
              'Discussion': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                     0]),
              'Being_relevant': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        

In [None]:
# Source language 'de', target language 'en'; the call also writes the result
# to saved/projected_result_de-en.pt.
res = predict_on_globalfn('de', 'en', model)
# Bare last expression: displays the whole result dict.
res

1008 (language en) is not annotated.
1009 (language en) is not annotated.
1020 (language en) is not annotated.
1048 (language en) is not annotated.
1067 (language en) is not annotated.


{1008: defaultdict(dict, {}),
 1009: defaultdict(dict, {}),
 1010: defaultdict(dict,
             {'Desirability': array([0, 0, 0, 0, 0, 0, 0, 0, 0]),
              'Be_in_agreement_on_assessment': array([0, 0, 0, 0, 0, 0, 0, 0, 0])}),
 1011: defaultdict(dict,
             {'Entity': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
              'Ranked_expectation': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
              'Emotion_directed': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}),
 1012: defaultdict(dict, {'Departing': array([0, 0, 0, 0, 0, 0, 0])}),
 1013: defaultdict(dict,
             {'Being_relevant': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
              'Desiring': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
              'Text_creation': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
              'Existence': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
              'Topic': array([0, 0, 0,