In [11]:
from collections import Counter, defaultdict

In [1]:
import json
import numpy as np
import config
import pandas as pd
import os
from tqdm import tqdm, trange
from transformers import BertTokenizer
import torch

def tokenize_and_preserve_labels(tokens, tokenizer):
    cumul = np.zeros(len(tokens) + 1, dtype=int)
    ttokens = []
    for i, token in enumerate(tokens):
        tt = tokenizer.tokenize(token)
        ttokens.extend(tt)
        cumul[i + 1] = cumul[i] + len(tt)
    return ttokens, cumul

def correct_span(span, cumul):
    return [cumul[span[0]], cumul[span[1] + 1]]

def correct_label_arr(label_arr, cumul):
    new_label_arr = np.full(cumul[-1], "O")
    for i in range(len(label_arr)):
        if label_arr[i] == "B":
            new_label_arr[cumul[i]] = "B"
            new_label_arr[cumul[i] + 1: cumul[i + 1]] = "I"
        elif label_arr[i] == "I":
            new_label_arr[cumul[i]: cumul[i + 1]] = "I"
    return new_label_arr.tolist()

def encode_attitude(attitude_type):
    if attitude_type == "sentiment-pos":
        return 1
    elif attitude_type == "sentiment-neg":
        return 2
    else:
        return 3

def create_label_dict(label_tuples, cumul):
    label_dict = dict(target_to_label={}, holder_and_target_to_label={})
    holder_sizes, target_entity_sizes, target_event_sizes = [], [], []
    n_implicit_opinion_tuples, n_explicit_opinion_tuples = 0, 0
    for ex in label_tuples:
        if ex["target-type"] in ["entity","event"]:
            target_span = correct_span(ex["target"], cumul)
            if ex["target-type"] == "entity":
                target_entity_sizes.append(target_span[1] - target_span[0])
            else:
                target_event_sizes.append(target_span[1] - target_span[0])
            attitude_label = encode_attitude(ex["attitude-type"])
            if ex["holder-type"] == "span":
                holder_span = correct_span(ex["holder"], cumul)
                holder_sizes.append(holder_span[1] - holder_span[0])
                label_dict["holder_and_target_to_label"][tuple(holder_span + target_span)] = attitude_label
                n_explicit_opinion_tuples += 1
            else:
                if ex["holder-type"] == "implicit":
                    attitude_label += 3
                label_dict["target_to_label"][tuple(target_span)] = attitude_label
                n_implicit_opinion_tuples += 1
    return label_dict, holder_sizes, target_entity_sizes, target_event_sizes, n_implicit_opinion_tuples, n_explicit_opinion_tuples

fold_file = os.path.join(config.RESULTS_FOLDER, "mpqa3/5fold.csv")
fold_df = pd.read_csv(fold_file, index_col=None)
n_folds = fold_df["fold"].max() + 1
max_seq_len = -1000
label_to_label_id = {"O":0, "B":1, "I":2}
holder_sizes, entity_sizes, event_sizes = [], [], []
n_implicit_opinion_tuples_per_sentence, n_explicit_opinion_tuples_per_sentence = [], []

fold_tokens = [[] for _ in range(n_folds)]
fold_lens = [[] for _ in range(n_folds)]
fold_opinion_labels = [[] for _ in range(n_folds)]
fold_holder_labels = [[] for _ in range(n_folds)]
fold_entity_labels = [[] for _ in range(n_folds)]
fold_event_labels = [[] for _ in range(n_folds)]
fold_label_tuples = [[] for _ in range(n_folds)]
fold_token_indices = [None for _ in range(n_folds)]
fold_masks = [None for _ in range(n_folds)]
fold_opinion_label_indices = [None for _ in range(n_folds)]
fold_holder_label_indices = [None for _ in range(n_folds)]
fold_entity_label_indices = [None for _ in range(n_folds)]
fold_event_label_indices = [None for _ in range(n_folds)]

print("loading {} Tokenizer".format(config.pretrained_model_name))
tokenizer = BertTokenizer.from_pretrained(config.pretrained_model_name)

special_token_to_indices = {}
special_token_to_indices["CLS"] = tokenizer.vocab["[CLS]"]
special_token_to_indices["SEP"] = tokenizer.vocab["[SEP]"]
special_token_to_indices["PAD"] = tokenizer.vocab["[PAD]"]
special_token_to_indices["HOLDER"] = tokenizer.vocab["[HOLDER]"] = tokenizer.vocab.pop("[unused1]")
special_token_to_indices["TARGET"] = tokenizer.vocab["[TARGET]"] = tokenizer.vocab.pop("[unused2]")
special_token_to_indices["OPINION"] = tokenizer.vocab["[OPINION]"] = tokenizer.vocab.pop("[unused3]")

for _, row in tqdm(fold_df.iterrows(), total=len(fold_df), desc="reading mpqa3"):
    doc_file = os.path.join(config.MPQA3_PROCESSED_FOLDER, row["doc_id"], "tokenized.json")
    i = row["fold"]
    doc = json.load(open(doc_file))
    for sentence in doc:
        tokens, cumul = tokenize_and_preserve_labels(sentence["tokens"], tokenizer)
        opinion_labels = correct_label_arr(sentence["dse-opinion"], cumul)
        holder_labels = correct_label_arr(sentence["dse-holder"], cumul)
        entity_labels = correct_label_arr(sentence["dse-entity"], cumul)
        event_labels = correct_label_arr(sentence["dse-event"], cumul)
        label_dict, sentence_holder_sizes, sentence_entity_sizes, sentence_event_sizes, n_implicit_opinion_tuples, n_explicit_opinion_tuples = create_label_dict(sentence["dse"], cumul)
        fold_tokens[i].append(tokens)
        fold_lens[i].append(len(tokens))
        fold_opinion_labels[i].append(opinion_labels)
        fold_holder_labels[i].append(holder_labels)
        fold_entity_labels[i].append(entity_labels)
        fold_event_labels[i].append(event_labels)
        fold_label_tuples[i].append(label_dict)
        max_seq_len = max(max_seq_len, len(tokens))
        holder_sizes.extend(sentence_holder_sizes)
        entity_sizes.extend(sentence_entity_sizes)
        event_sizes.extend(sentence_event_sizes)
        n_implicit_opinion_tuples_per_sentence.append(n_implicit_opinion_tuples)
        n_explicit_opinion_tuples_per_sentence.append(n_explicit_opinion_tuples)

print("max seq length = {}".format(max_seq_len))
print("holder span size                         : 90%tile = {:.1f}, 95%tile = {:.1f}, 99%tile = {:.1f}, max = {:.1f}".format( np.quantile(holder_sizes, 0.95), np.quantile(holder_sizes, 0.95), np.quantile(holder_sizes, 0.99), np.max(holder_sizes) ))
print("entity span size                         : 90%tile = {:.1f}, 95%tile = {:.1f}, 99%tile = {:.1f}, max = {:.1f}".format( np.quantile(entity_sizes, 0.95), np.quantile(entity_sizes, 0.95), np.quantile(entity_sizes, 0.99), np.max(entity_sizes) ))
print("event span size                          : 90%tile = {:.1f}, 95%tile = {:.1f}, 99%tile = {:.1f}, max = {:.1f}".format( np.quantile(event_sizes, 0.95), np.quantile(event_sizes, 0.95), np.quantile(event_sizes, 0.99), np.max(event_sizes) ))
print("num of implicit opinion tuples per sent  : 90%tile = {:.1f}, 95%tile = {:.1f}, 99%tile = {:.1f}, max = {:.1f}".format( np.quantile(n_implicit_opinion_tuples_per_sentence, 0.95), np.quantile(n_implicit_opinion_tuples_per_sentence, 0.95), np.quantile(n_implicit_opinion_tuples_per_sentence, 0.99), np.max(n_implicit_opinion_tuples_per_sentence) ))
print("num of explicit opinion tuples per sent  : 90%tile = {:.1f}, 95%tile = {:.1f}, 99%tile = {:.1f}, max = {:.1f}".format( np.quantile(n_explicit_opinion_tuples_per_sentence, 0.95), np.quantile(n_explicit_opinion_tuples_per_sentence, 0.95), np.quantile(n_explicit_opinion_tuples_per_sentence, 0.99), np.max(n_explicit_opinion_tuples_per_sentence) ))

for i in trange(n_folds, desc="creating tensors"):
    index = np.argsort(fold_lens[i])
    fold_tokens[i] = [fold_tokens[i][j] for j in index]
    fold_lens[i] = sorted(fold_lens[i])
    fold_opinion_labels[i] = [fold_opinion_labels[i][j] for j in index]
    fold_holder_labels[i] = [fold_holder_labels[i][j] for j in index]
    fold_entity_labels[i] = [fold_entity_labels[i][j] for j in index]
    fold_event_labels[i] = [fold_event_labels[i][j] for j in index]
    fold_label_tuples[i] = [fold_label_tuples[i][j] for j in index]
    fold_token_indices[i] = torch.zeros((len(fold_tokens[i]), max_seq_len), dtype=torch.long, device=config.device)
    fold_masks[i] = torch.zeros((len(fold_tokens[i]), max_seq_len), dtype=torch.long, device=config.device)
    fold_opinion_label_indices[i] = torch.zeros((len(fold_tokens[i]), max_seq_len), dtype=torch.long, device=config.device)
    fold_holder_label_indices[i] = torch.zeros((len(fold_tokens[i]), max_seq_len), dtype=torch.long, device=config.device)
    fold_entity_label_indices[i] = torch.zeros((len(fold_tokens[i]), max_seq_len), dtype=torch.long, device=config.device)
    fold_event_label_indices[i] = torch.zeros((len(fold_tokens[i]), max_seq_len), dtype=torch.long, device=config.device)
    for j in range(len(fold_tokens[i])):
        length = len(fold_tokens[i][j])
        fold_token_indices[i][j][:length] = torch.LongTensor(tokenizer.convert_tokens_to_ids(fold_tokens[i][j]))
        fold_masks[i][j][:length] = 1.
        fold_opinion_label_indices[i][j][:length] = torch.LongTensor([label_to_label_id[label] for label in fold_opinion_labels[i][j]])
        fold_holder_label_indices[i][j][:length] = torch.LongTensor([label_to_label_id[label] for label in fold_holder_labels[i][j]])
        fold_entity_label_indices[i][j][:length] = torch.LongTensor([label_to_label_id[label] for label in fold_entity_labels[i][j]])
        fold_event_label_indices[i][j][:length] = torch.LongTensor([label_to_label_id[label] for label in fold_event_labels[i][j]])

loading bert-base-cased Tokenizer


reading mpqa3: 100%|██████████| 70/70 [00:01<00:00, 42.39it/s]
creating tensors:   0%|          | 0/5 [00:00<?, ?it/s]

max seq length = 88
holder span size                         : 90%tile = 8.0, 95%tile = 8.0, 99%tile = 13.0, max = 31.0
entity span size                         : 90%tile = 3.0, 95%tile = 3.0, 99%tile = 4.8, max = 6.0
event span size                          : 90%tile = 3.0, 95%tile = 3.0, 99%tile = 4.0, max = 5.0
num of implicit opinion tuples per sent  : 90%tile = 4.0, 95%tile = 4.0, 99%tile = 7.0, max = 18.0
num of explicit opinion tuples per sent  : 90%tile = 6.0, 95%tile = 6.0, 99%tile = 11.0, max = 18.0


creating tensors: 100%|██████████| 5/5 [00:01<00:00,  3.19it/s]


In [22]:
n_pos_sentiment, n_neg_sentiment, n_other_attitude = 0, 0, 0

for i in range(len(fold_token_indices[0])):
    if len(fold_label_tuples[0][i]["target_to_label"]) or len(fold_label_tuples[0][i]["holder_and_target_to_label"]):
        length = fold_lens[0][i]
        token_indices = fold_token_indices[0][i][:length]
        tokens = tokenizer.convert_ids_to_tokens(token_indices)
        text = " ".join(tokens)
        print(text)

        for target_span, label in fold_label_tuples[0][i]["target_to_label"].items():
            if label < 4:
                holder_text = "WRITER"
            else:
                holder_text = "IMPLICIT"
            attitude_label = (label - 1)%3
            if attitude_label == 0:
                attitude = "pos"
                n_pos_sentiment += 1
            elif attitude_label == 1:
                attitude = "neg"
                n_neg_sentiment += 1
            else:
                attitude = "other"
                n_other_attitude += 1
            target_text = " ".join(tokens[target_span[0]: target_span[1]])
            print("{} --({})--> {}".format(holder_text, attitude, target_text))
        
        for holder_and_target_span, label in fold_label_tuples[0][i]["holder_and_target_to_label"].items():
            attitude_label = (label - 1)%3
            if attitude_label == 0:
                attitude = "pos"
                n_pos_sentiment += 1
            elif attitude_label == 1:
                attitude = "neg"
                n_neg_sentiment += 1
            else:
                attitude = "other"
                n_other_attitude += 1
            holder_text = " ".join(tokens[holder_and_target_span[0]: holder_and_target_span[1]])
            target_text = " ".join(tokens[holder_and_target_span[2]: holder_and_target_span[3]])
            print("{} --({})--> {}".format(holder_text, attitude, target_text))
        print("\n")

print("pos = {}, neg = {}, other = {}".format(n_pos_sentiment, n_neg_sentiment, n_other_attitude))

Argentina is now broke .
WRITER --(neg)--> Argentina


The opposition seeks intervention or blockade
The opposition --(pos)--> intervention
The opposition --(pos)--> blockade


Africa ' s Grand Bar ##gai ##n
WRITER --(pos)--> Bar ##gai ##n


That ' s a fair deal .
WRITER --(pos)--> That


Tokyo i ##rked by US stance on Kyoto
Tokyo --(neg)--> stance


Brazil hopes US will not interfere in greenhouse effect negotiations
Brazil --(pos)--> interfere


We demand equal treatment for our foreign policy . "
We --(pos)--> treatment


" These people are committed terrorists , " he said .
he --(neg)--> people
he --(neg)--> terrorists


Not everyone in Japan is upset with Bush ' s alternative .
Not everyone in Japan --(neg)--> alternative


While debate rage ##d elsewhere , Britain and Spain supported the United States .
Britain and Spain --(pos)--> States


The EU is concerned about the purely voluntary nature of the actions proposed .
The EU --(neg)--> nature
The EU --(neg)--> actions


K ##hat 

In [2]:
doc_ids_file = os.path.join(config.MPQA3_FOLDER, "doclist")
doc_ids = open(doc_ids_file).read().strip().split("\n")
docs = []

for doc_id in tqdm(doc_ids):
    processed_file = os.path.join(config.MPQA3_PROCESSED_FOLDER, doc_id, "tokenized.json")
    processed = json.load(open(processed_file))
    docs.append(processed)

100%|██████████| 70/70 [00:00<00:00, 2479.03it/s]


In [3]:
n_pos_sentiment, n_neg_sentiment, n_other_attitude = 0, 0, 0

for doc in docs:
    for sentence in doc:
        for ex in sentence["dse"]:
            if ex["target-type"] in ["entity", "event"]:
                if ex["attitude-type"] == "sentiment-pos":
                    n_pos_sentiment += 1
                elif ex["attitude-type"] == "sentiment-neg":
                    n_neg_sentiment += 1
                else:
                    n_other_attitude += 1

print("pos = {}, neg = {}, other = {}".format(n_pos_sentiment, n_neg_sentiment, n_other_attitude))

pos = 762, neg = 1919, other = 123


In [9]:
attitude_labels = []
attitudes = ["pos", "neg", "other"]

for i in range(len(fold_label_tuples)):
    for label_tuples in fold_label_tuples[i]:
        for attitude_label in label_tuples["target_to_label"].values():
            attitude_labels.append(attitudes[(attitude_label - 1)%3])
        for attitude_label in label_tuples["holder_and_target_to_label"].values():
            attitude_labels.append(attitudes[(attitude_label - 1)%3])

print(Counter(attitude_labels))

Counter({'neg': 1746, 'pos': 710, 'other': 60})


In [14]:
n_targets_with_more_than_one_attitude = 0
n_holder_and_target_with_more_than_one_attitude = 0

for doc in docs:
    for sentence in doc:
        target_to_n_attitudes = defaultdict(int)
        holder_and_target_to_n_attitudes = defaultdict(int)
        for ex in sentence["dse"]:
            if ex["target-type"] in ["entity","event"]:
                target_span = ex["target"]
                if ex["holder-type"] == "span":
                    holder_span = ex["holder"]
                    key = tuple(holder_span + target_span)
                    holder_and_target_to_n_attitudes[key] += 1
                else:
                    key = tuple(target_span)
                    target_to_n_attitudes[key] += 1
        n_targets_with_more_than_one_attitude += sum(n_attitude > 2 for n_attitude in target_to_n_attitudes.values())
        n_holder_and_target_with_more_than_one_attitude += sum(n_attitude > 2 for n_attitude in holder_and_target_to_n_attitudes.values())

print(n_targets_with_more_than_one_attitude)
print(n_holder_and_target_with_more_than_one_attitude)

7
13


In [1]:
set(range(4))

{0, 1, 2, 3}

In [2]:
set(range(2, 5)).intersection(range(3, 7))

{3, 4}

In [4]:
def f():
    raise ValueError("xxx")

In [6]:
f()

ValueError: xxx

In [14]:
2 == 3 == 1

False

In [15]:
int(True), int(False)

(1, 0)

In [16]:
list(range(10))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [6]:
print("{:4.1f}".format(9.1))
print("{:4.1f}".format(99.1))

 9.1
99.1
