In [1]:
import data
import run
import config
import os

import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from torch.optim import AdamW
from collections import defaultdict
from transformers import get_linear_schedule_with_warmup

from torch import nn
from transformers import BertModel

In [9]:
# Re-import the local `data` module so that edits made to data.py are picked up
# without restarting the kernel. Objects created from the old module before this
# call keep referring to the old definitions.
import importlib
importlib.reload(data)

<module 'data' from '/workspace/opinion/src/05-opinion-mining/role-labeler/data.py'>

## Load data

In [10]:
# Read the two document lists; `with` closes each file handle as soon as it is read.
with open(os.path.join(config.DATA_FOLDER, "database.mpqa.2.0/doclist.attitudeSubset")) as doclist_file:
    mpqa2_docids = doclist_file.read().splitlines()
with open(os.path.join(config.DATA_FOLDER, "database.mpqa.3.0/doclist")) as doclist_file:
    mpqa3_docids = doclist_file.read().splitlines()

# Train on the MPQA 2.0 documents that are not part of MPQA 3.0; evaluate on MPQA 3.0.
# A set makes the membership test constant-time instead of scanning the list per document.
mpqa3_docid_set = set(mpqa3_docids)
train_docids = [docid for docid in mpqa2_docids if docid not in mpqa3_docid_set]
dev_docids = mpqa3_docids
print(len(train_docids), len(dev_docids))

train_dataset = data.RoleLabelerDataset(train_docids, mpqa2=True, ignore_negatives=True)
dev_dataset = data.RoleLabelerDataset(dev_docids, mpqa2=False, ignore_negatives=False)

436 70


creating data tensors: 100%|██████████| 436/436 [00:11<00:00, 38.72it/s]
creating data tensors: 100%|██████████| 70/70 [00:02<00:00, 29.75it/s]

torch.Size([32, 80]) torch.Size([32, 80]) torch.Size([32, 6]) torch.int64 torch.float32 torch.int64





In [38]:
# Grab the first training batch for the smoke tests below.
# `next(iter(...))` raises StopIteration on an empty loader instead of silently leaving
# X_wordid / X_mask / Y unbound (or bound to stale values from an earlier run).
X_wordid, X_mask, Y = next(iter(train_dataset.data))

print(X_wordid.shape, X_mask.shape, Y.shape, X_wordid.dtype, X_mask.dtype, Y.dtype)

torch.Size([32, 80]) torch.Size([32, 80]) torch.Size([32, 6]) torch.int64 torch.float32 torch.int64


## Model

In [36]:
class RoleScorer(nn.Module):
    """Two-layer feed-forward scorer: Linear -> ReLU -> Linear.

    Maps a batch of `input_size`-dimensional vectors to `output_size` scores each.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, X):
        # X: batch-size x input-size
        hidden = self.fc1(X).relu()
        # hidden: batch-size x hidden-size

        # result: batch-size x output-size
        return self.fc2(hidden)

class RoleLabeler(nn.Module):
    """Span-based holder / target / opinion labeler on top of a pretrained BERT encoder.

    A span is represented by concatenating the encoder states of its first and last
    token. Holder and target spans each get a scalar score; a (holder, target) pair
    gets three opinion-label scores, to which both span scores are added.

    hparams must provide: pretrained_model_name, scorer_hidden_size,
    max_holder_length, max_target_length, n_holders, n_targets.
    """

    def __init__(self, hparams):
        super().__init__()
        self.hparams = hparams
        self.encoder = BertModel.from_pretrained(hparams.pretrained_model_name)

        enc_hidden_size = self.encoder.config.hidden_size
        self.holder_scorer = RoleScorer(2 * enc_hidden_size, hparams.scorer_hidden_size, 1)
        self.target_scorer = RoleScorer(2 * enc_hidden_size, hparams.scorer_hidden_size, 1)
        self.opinion_scorer = RoleScorer(4 * enc_hidden_size, hparams.scorer_hidden_size, 3)

    def forward(self, X_wordid, X_mask, Y=None):
        """Return the training loss when Y is given, otherwise the predicted opinion frames.

        X_wordid is the batch-size x max-seq-len matrix of WordPiece token indices.
        X_wordid[:,0] is always the [AUTHOR] token.
        X_mask is the batch-size x max-seq-len matrix of 0s and 1s
        (0 for padded tokens, otherwise 1).

        Y is a batch-size x 6 integer matrix:
          Y[i, 0]            sentence id (not used by the loss)
          (Y[i, 1], Y[i, 2]) holder span, end exclusive
          (Y[i, 3], Y[i, 4]) target span, end exclusive
          Y[i, 5]            opinion label, used as the class index over 4 score columns:
                             0 = +ve sentiment, 1 = -ve sentiment, 2 = other, and column 3
                             is the constant-zero "no opinion" column appended in the loss.

        With Y:    returns the scalar cross-entropy loss.
        Without Y: returns a list of [batch-row, holder-start, holder-end,
                   target-start, target-end, label] lists (possibly empty).
        """
        X_word = self.encoder(X_wordid, X_mask).last_hidden_state
        # X_word is of shape batch-size x max-seq-len x enc-hidden-size

        if Y is not None:
            return self._loss(X_word, Y)
        return self._predict(X_word, X_mask)

    def _loss(self, X_word, Y):
        """Cross-entropy loss of the gold (holder, target) pairs over 3 labels + no-opinion."""
        # Indices and targets must live on the same device as the encoder output.
        Y = Y.to(X_word.device)
        batch_index = torch.arange(X_word.shape[0], device=X_word.device)

        def gold_span_representations(start_column, end_column):
            # first-token state ++ last-token state; the end index in Y is exclusive
            first = X_word[batch_index, Y[:, start_column]]
            last = X_word[batch_index, Y[:, end_column] - 1]
            return torch.cat([first, last], dim=1)

        holder_representations = gold_span_representations(1, 2)
        target_representations = gold_span_representations(3, 4)
        opinion_representations = torch.cat([holder_representations, target_representations], dim=1)
        # holder-representations is of shape batch-size x 2*enc-hidden-size, same for target
        # opinion-representations is of shape batch-size x 4*enc-hidden-size

        holder_scores = self.holder_scorer(holder_representations)
        target_scores = self.target_scorer(target_representations)
        opinion_scores = self.opinion_scorer(opinion_representations)
        # holder-scores is of shape batch-size x 1, same for target-scores
        # opinion-scores is of shape batch-size x 3

        opinion_scores = opinion_scores + holder_scores + target_scores
        # The "no opinion" class has a fixed score of 0; new_zeros follows the device
        # and dtype of the computed scores.
        non_opinion_scores = opinion_scores.new_zeros((opinion_scores.shape[0], 1))
        opinion_scores = torch.cat([opinion_scores, non_opinion_scores], dim=1)
        # opinion-scores is of shape batch-size x 4

        return nn.functional.cross_entropy(opinion_scores, Y[:, -1])

    @staticmethod
    def _candidate_spans(sentence_length, max_span_length, include_author):
        """List [start, end) spans of up to max_span_length tokens, shortest first.

        Spans start at token 1. When include_author is true, the single-token span
        [0, 1] (the [AUTHOR] token) is also a candidate.
        """
        spans = []
        for span_length in range(1, max_span_length + 1):
            first_start = 0 if (include_author and span_length == 1) else 1
            for span_start in range(first_start, sentence_length - span_length + 1):
                spans.append([span_start, span_start + span_length])
        return spans

    @staticmethod
    def _span_representations(sentence_word, spans):
        """Return the len(spans) x 2*enc-hidden-size matrix of first ++ last token states."""
        # Explicit row count: reshape(-1, 2) is ambiguous for an empty span list.
        bounds = torch.tensor(spans, dtype=torch.long, device=sentence_word.device).reshape(len(spans), 2)
        return torch.cat([sentence_word[bounds[:, 0]], sentence_word[bounds[:, 1] - 1]], dim=1)

    def _predict(self, X_word, X_mask):
        """Enumerate, prune and label (holder, target) pairs for every sentence in the batch."""
        hparams = self.hparams
        sentence_lengths = [int(length) for length in X_mask.sum(dim=1).tolist()]

        # Candidate spans per sentence. Only holders may be the [AUTHOR] token.
        holder_spans = [self._candidate_spans(length, hparams.max_holder_length, True) for length in sentence_lengths]
        target_spans = [self._candidate_spans(length, hparams.max_target_length, False) for length in sentence_lengths]

        if not any(holder_spans) or not any(target_spans):
            # No sentence can form a (holder, target) pair.
            return []

        holder_counts = [len(spans) for spans in holder_spans]
        target_counts = [len(spans) for spans in target_spans]

        holder_representations = torch.cat(
            [self._span_representations(X_word[i], spans) for i, spans in enumerate(holder_spans)]
        )
        target_representations = torch.cat(
            [self._span_representations(X_word[i], spans) for i, spans in enumerate(target_spans)]
        )
        # holder-representations is of shape total-number-holders x 2*enc-hidden-size
        # target-representations is of shape total-number-targets x 2*enc-hidden-size

        # Score all spans of the batch in one call, then cut the results back into sentences.
        holder_scores = self.holder_scorer(holder_representations).flatten()
        target_scores = self.target_scorer(target_representations).flatten()

        per_sentence = zip(
            holder_spans,
            torch.split(holder_representations, holder_counts),
            torch.split(holder_scores, holder_counts),
            target_spans,
            torch.split(target_representations, target_counts),
            torch.split(target_scores, target_counts),
        )

        pair_representations = []
        pair_scores = []
        sentence_ids = []
        pos = []

        for i, (h_spans, h_reps, h_scores, t_spans, t_reps, t_scores) in enumerate(per_sentence):

            if not h_spans or not t_spans:
                # sentence lacks holder spans or target spans
                continue

            # indices of the best-scoring spans, in descending score order
            top_h = torch.argsort(h_scores, descending=True)[: hparams.n_holders]
            top_t = torch.argsort(t_scores, descending=True)[: hparams.n_targets]
            n_top_h, n_top_t = top_h.numel(), top_t.numel()

            # Cartesian product of top holders and top targets, holder-major:
            # (h0,t0), (h0,t1), ..., (h1,t0), ...
            pair_representations.append(
                torch.cat(
                    [h_reps[top_h].repeat_interleave(n_top_t, dim=0), t_reps[top_t].repeat(n_top_h, 1)],
                    dim=1,
                )
            )
            pair_scores.append((h_scores[top_h][:, None] + t_scores[top_t][None, :]).flatten())

            top_h_pos = [h_spans[k] for k in top_h.tolist()]
            top_t_pos = [t_spans[k] for k in top_t.tolist()]
            for h_pos in top_h_pos:
                for t_pos in top_t_pos:
                    pos.append(h_pos + t_pos)
                    sentence_ids.append(i)

        if not pos:
            return []

        opinion_representations = torch.cat(pair_representations)
        opinion_scores = self.opinion_scorer(opinion_representations)
        # opinion-representations is of shape num-opinions x 4*enc-hidden-size
        # opinion-scores is of shape num-opinions x 3

        # Span scores stay on the model's device; no round-trip through Python floats.
        scores = torch.cat(pair_scores).unsqueeze(1) + opinion_scores
        # scores is of shape num-opinions x 3

        best_scores = scores.max(dim=1).values.tolist()
        best_labels = scores.argmax(dim=1).tolist()

        # Keep a pair only if its best label beats the fixed score 0 of the
        # "no opinion" column used in the training loss.
        return [
            [sentence_id] + span_pos + [label]
            for sentence_id, span_pos, best_score, label in zip(sentence_ids, pos, best_scores, best_labels)
            if best_score > 0
        ]

## Run

In [37]:
# Build the labeler from the hyperparameters exposed by the `config` module and move it
# to config.device before the optimizer is created from its parameters.
model = RoleLabeler(config)
model.to(config.device)
optimizer = AdamW(model.parameters(), lr=config.lr, weight_decay=config.weight_decay)
# NOTE(review): the linear schedule reaches a learning rate of 0 after num_training_steps
# calls to scheduler.step(). This assumes len(train_dataset) equals the total number of
# optimizer steps in the run (batches x epochs) — TODO confirm against the training loop.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataset))

Some weights of BertModel were not initialized from the model checkpoint at SpanBERT/spanbert-large-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Smoke test: one optimisation step on the batch currently bound to X_wordid / X_mask / Y.
# NOTE(review): the model was moved to config.device; presumably the batch tensors are
# already on that device — confirm, otherwise the forward pass will fail.
model.train()
model.zero_grad()
# Passing Y makes forward() return the cross-entropy loss instead of predictions.
loss = model(X_wordid, X_mask, Y)
loss.backward()
# Clip the global gradient norm to config.max_grad_norm before the parameter update.
torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=config.max_grad_norm)
optimizer.step()
scheduler.step()

In [39]:
# Smoke test of the prediction path on the same batch.
# Maps dataset sentence id -> {"true": set of gold frames, "pred": set of predicted frames},
# where a frame is (holder-start, holder-end, target-start, target-end, label).
sentenceid_to_pred_and_true = defaultdict(lambda: defaultdict(set))
model.eval()

with torch.no_grad():

    # Derived values get their own names: Y, X_wordid and X_mask are left untouched,
    # so this cell can be re-run and the batch stays consistent for other cells.
    Y_true = Y.cpu().numpy()

    # A sentence appears once per gold frame; remember the first row of each sentence
    # (dicts keep insertion order) so every sentence is encoded only once.
    first_row_of_sentence = {}

    for row, y_true in enumerate(Y_true):
        dataset_sentenceid = y_true[0]

        # label -1 marks a row without a gold frame
        if y_true[5] != -1:
            sentenceid_to_pred_and_true[dataset_sentenceid]["true"].add(tuple(y_true[1:]))

        first_row_of_sentence.setdefault(dataset_sentenceid, row)

    unique_index = list(first_row_of_sentence.values())
    # position in the de-duplicated batch -> dataset sentence id
    batch_sentenceid_to_dataset_sentenceid = list(first_row_of_sentence)

    Ypred = model(X_wordid[unique_index], X_mask[unique_index])

    for y in Ypred:
        # y[0] is the row in the de-duplicated batch; translate it back to the dataset id
        dataset_sentenceid = batch_sentenceid_to_dataset_sentenceid[y[0]]
        sentenceid_to_pred_and_true[dataset_sentenceid]["pred"].add(tuple(y[1:]))

In [40]:
sentenceid_to_pred_and_true

defaultdict(<function __main__.<lambda>()>,
            {4398: defaultdict(set,
                         {'true': {(38, 42, 51, 52, 0)},
                          'pred': {(9, 15, 1, 3, 1),
                           (9, 15, 2, 3, 1),
                           (9, 15, 2, 4, 2),
                           (9, 15, 8, 10, 1),
                           (9, 15, 9, 10, 1),
                           (12, 15, 1, 3, 1),
                           (12, 15, 2, 3, 0),
                           (12, 15, 2, 4, 2),
                           (12, 15, 8, 10, 1),
                           (12, 15, 9, 10, 0),
                           (31, 36, 1, 3, 1),
                           (31, 36, 2, 3, 2),
                           (31, 36, 2, 4, 2),
                           (31, 36, 8, 10, 1),
                           (31, 36, 9, 10, 1)}}),
             6511: defaultdict(set,
                         {'true': {(12, 14, 27, 28, 0)},
                          'pred': {(1, 4, 1, 2, 2),
                

In [41]:
df = run.evaluate(sentenceid_to_pred_and_true)

In [42]:
df

Unnamed: 0,binP,binR,proP,proR,excP,excR,binF,proF,excF
holder,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
target,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
frame-without-attitude,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
frame-with-attitude,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## MPQA2 folds from SRL4ORL

In [3]:
# Document ids of the MPQA 2.0 attitude subset; `with` closes the file once it is read.
with open(os.path.join(config.DATA_FOLDER, "database.mpqa.2.0/doclist.attitudeSubset")) as doclist_file:
    mpqa2_docids = doclist_file.read().splitlines()

In [4]:
# Folder holding the SRL4ORL file lists, one document id per line.
srl4orl_split_folder = os.path.join(config.PROJECT_FOLDER, "baselines/02-naacl-mpqa-srl4orl/datasplit/new")


def read_split_docids(split_name):
    """Return the document ids listed in filelist_<split_name>, closing the file afterwards."""
    with open(os.path.join(srl4orl_split_folder, "filelist_" + split_name)) as filelist:
        return filelist.read().splitlines()


dev_docids, test0_docids, test1_docids, test2_docids, test3_docids = (
    read_split_docids(split_name) for split_name in ("dev", "test0", "test1", "test2", "test3")
)

In [5]:
len(dev_docids), len(test0_docids), len(test1_docids), len(test2_docids), len(test3_docids)

(100, 95, 95, 95, 95)

In [6]:
len(mpqa2_docids)

506

In [8]:
# Every split must be drawn from the MPQA 2.0 attitude subset: expect five `True` lines.
mpqa2_docid_set = set(mpqa2_docids)
for split_docids in (dev_docids, test0_docids, test1_docids, test2_docids, test3_docids):
    print(mpqa2_docid_set.issuperset(split_docids))

True
True
True
True
True


In [9]:
set(mpqa2_docids).difference(dev_docids + test0_docids + test1_docids + test2_docids + test3_docids)

{'20011112/20.33.31-29984',
 '20020320/21.03.16-25474',
 'ula/20000410_nyt-NEW',
 'ula/20000415_apw_eng-NEW',
 'ula/20000416_xin_eng-NEW',
 'ula/20000419_apw_eng-NEW',
 'ula/20000420_xin_eng-NEW',
 'ula/20000424_nyt-NEW',
 'ula/20000815_AFP_ARB.0084.IBM-HA-NEW',
 'ula/602CZL285-1',
 'ula/A1.E1-NEW',
 'ula/A1.E2-NEW',
 'ula/AFGP-2002-600002-Trans',
 'ula/AFGP-2002-600045-Trans',
 'ula/AFGP-2002-600175-Trans',
 'ula/AFGP-2002-602187-Trans',
 'ula/CNN_AARONBROWN_ENG_20051101_215800.partial-NEW',
 'ula/CNN_ENG_20030614_173123.4-NEW-1',
 'ula/ENRON-pearson-email-25jul02',
 'ula/IZ-060316-01-Trans-1',
 'ula/SNO-525',
 'ula/enron-thread-159550',
 'ula/im_401b_e73i32c22_031705-2',
 'ula/sw2025-ms98-a-trans.ascii-1-NEW',
 'ula/wsj_1640.mrg-NEW',
 'ula/wsj_2465'}

In [10]:
# One (docid, fold) record per document, in the order dev, test0..test3, other.
docids_by_fold = {
    "dev": dev_docids,
    "test0": test0_docids,
    "test1": test1_docids,
    "test2": test2_docids,
    "test3": test3_docids,
}

recs = [[docid, fold] for fold, docids in docids_by_fold.items() for docid in docids]

# MPQA 2.0 documents that belong to no SRL4ORL split. Sorted so that the CSV row
# order is identical on every run (set iteration order is not).
other_docids = sorted(set(mpqa2_docids).difference(*docids_by_fold.values()))

recs.extend([docid, "other"] for docid in other_docids)

fold_df = pd.DataFrame(recs, columns=["docid","fold"])

fold_csv_path = os.path.join(config.RESULTS_FOLDER, "folds/mpqa2.5fold.csv")
# to_csv does not create missing directories
os.makedirs(os.path.dirname(fold_csv_path), exist_ok=True)
fold_df.to_csv(fold_csv_path, index=False)

In [2]:
# Scratch: iterating over a 2-D tensor walks its rows.
Y = torch.randn(3, 4)

print(Y)

# print the third entry of every row next to the row number
for i in range(len(Y)):
    print(i, Y[i, 2])

tensor([[-0.0731, -1.1629, -2.1805,  0.5698],
        [-3.2882, -0.8966, -2.5285,  1.2974],
        [-0.2989,  0.3250,  0.1124, -0.7649]])
0 tensor(-2.1805)
1 tensor(-2.5285)
2 tensor(0.1124)


In [7]:
torch.log_softmax(Y, dim=1)

tensor([[-1.2119, -2.3016, -3.3193, -0.5690],
        [-4.7196, -2.3280, -3.9600, -0.1341],
        [-1.6099, -0.9860, -1.1986, -2.0759]])

In [5]:
# FIXME(review): the saved cell source was `Y[]`, which is a SyntaxError and stops
# Restart & Run All. The recorded output is a 0-dim tensor, so a single element was
# being read; the original indices are not recoverable, [0, 0] is a placeholder.
Y[0, 0]

tensor(-0.7407)