In [38]:
!nvidia-smi

Tue Nov 17 08:10:01 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.100      Driver Version: 440.100      CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 208...  Off  | 00000000:1A:00.0 Off |                  N/A |
| 27%   37C    P8    26W / 260W |   7203MiB / 11019MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  Off  | 00000000:1B:00.0 Off |                  N/A |
| 27%   31C    P8    10W / 260W |   6215MiB / 11019MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  GeForce RTX 208...  Off  | 00000000:1C:00.0 Off |                  N/A |
| 27%   

In [39]:
import pandas as pd
import numpy as np
import pickle

from pathlib import Path

from box import Box
import pandas as pd
import collections
import os
import sys
import random

import datetime

from transformers import AutoTokenizer

from fast_bert.modeling import BertForMultiLabelSequenceClassification
from fast_bert.data_cls import BertDataBunch, InputExample, InputFeatures, MultiLabelTextProcessor, convert_examples_to_features
from fast_bert.learner_cls import BertLearner

import re
MAX_SENTENCE_LEN = 82

from utils import preprocess
from nltk.tokenize import TweetTokenizer
tweet_tokenizer = TweetTokenizer()

import torch
import torchtext
import torchtext.data as data
from torchtext.data import Field
from torchtext.data import Iterator, BucketIterator
from transformers import BertTokenizer, BertForMaskedLM, AdamW, DistilBertTokenizer


from matplotlib import pyplot as plt
%matplotlib inline

from functools import partial
import time
from IPython.core.debugger import set_trace
from tqdm import tqdm, trange
# from tqdm import tqdm, trange

seed = 678

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [40]:
torch.cuda.set_device(7)
cuda = torch.cuda.is_available()

## DistilBert

Loading data.

In [41]:
from ast import literal_eval

DATA_PATH = '../../../semeval2021_task/data/'

train = pd.read_csv(DATA_PATH + 'tsd_train.csv')
trial = pd.read_csv(DATA_PATH + 'tsd_trial.csv')

train['spans'] = train.spans.apply(literal_eval)
trial['spans'] = trial.spans.apply(literal_eval)
texts = list(train['text'])

Loading pretrained model.

In [42]:
CLASSIFICATION_MODEL_PATH = '../../models/distilbert/model_out'
CLASSIFICATION_LABELS_PATH = '../../labels'

from fast_bert.prediction import BertClassificationPredictor

predictor = BertClassificationPredictor(CLASSIFICATION_MODEL_PATH, CLASSIFICATION_LABELS_PATH, 
                                        multi_label=True, model_type='distilbert')

In [43]:
lern = predictor.get_learner()
pretrained_weights = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(pretrained_weights)
MASK_INDEX = tokenizer.convert_tokens_to_ids("[MASK]")

lern.model.eval();

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

Device: cuda


In [44]:
v = {v: k for k, v in tokenizer.vocab.items()}

def toks_to_words(token_ids):
    """ Merge subword tokens into whole words """
    indices = []
    for i, token_id in enumerate(token_ids):
        if i == 0:
            continue
        token_text = v[token_id]
        if token_text.startswith('##'):
            indices.append(i)
        else:
            if indices:
                toks = [v[token_ids[t]] for t in indices]
                word = ''.join([toks[0]] + [t[2:] for t in toks[1:]])
                new_indices = [index - 1 for index in indices]
                yield new_indices, word
            indices = [i]

In [45]:
from transformers import DistilBertConfig
config = DistilBertConfig.from_pretrained('distilbert-base-cased', output_attentions=True)
lern.model.distilbert.config = config

In [46]:
text = random.choice(texts)
print(text)

Years ago he was blaming Western values, including respect for life and democracy, for his misfortunes.  What a total clown.


In [47]:
toks = tokenizer.encode(text)

In [48]:
list(toks_to_words(toks))

[([0], 'years'),
 ([1], 'ago'),
 ([2], 'he'),
 ([3], 'was'),
 ([4], 'blaming'),
 ([5], 'western'),
 ([6], 'values'),
 ([7], ','),
 ([8], 'including'),
 ([9], 'respect'),
 ([10], 'for'),
 ([11], 'life'),
 ([12], 'and'),
 ([13], 'democracy'),
 ([14], ','),
 ([15], 'for'),
 ([16], 'his'),
 ([17, 18, 19], 'misfortunes'),
 ([20], '.'),
 ([21], 'what'),
 ([22], 'a'),
 ([23], 'total'),
 ([24], 'clown'),
 ([25], '.')]

The idea is to use the attentions from all heads and all layers as features for words, then later train some classifier on them

In [49]:
def shrink(nums_arr, att_arr):
    """
    When words are splitted by several tokens by BERT, each token has
    its own feature vector. this function takes the mean of these vectors
    and assign it to the original word.
    
    nums_arr: list, contains indices of tokens that need to be united back into
    one word
    
    att_arr: np.array of attention, shape (num_heads) x (num_words) or
    (num_layers) x (num_words)
    
    output: np.array, shape (num_words) x (num_heads) or 
    (num_words) x (num_layers)
    """
    res = []
    prev_i = 0
    for arr in nums_arr:#for each splitted word
        fig = att_arr[:, prev_i:arr[0]] #add all previous words
        nafig = att_arr[:, arr].mean(axis=1)[:, np.newaxis] #add mean of token features
        res.append(fig)
        res.append(nafig)
        prev_i = arr[-1] + 1 #make the current word "previous"
    if prev_i < att_arr.shape[1]: #just to be safe and not overstep the array
        res.append(att_arr[:, prev_i:])
    return np.hstack(res).T

In [50]:
def get_attention_for_words(text: str, model, tokenizer):
    """
    Get attention values for all heads averaged over all layers and 
    attention values for all layers averaged over all heads.
    """
    toks = tokenizer.encode(text) 
    emb_num = model.distilbert.embeddings.word_embeddings.num_embeddings
    toks = [i if i < emb_num else tokenizer.unk_token_id for i in toks]
    #these procedure is necessary because for some reason, 
    #tokenizer knows more words that BERT
    nums = [arr for (arr, string) in list(toks_to_words(toks)) if len(arr) > 1]
    sentence = torch.tensor(toks).unsqueeze(0)
    sentence = sentence.cuda()
    out = lern.model.distilbert(sentence, output_attentions=True,
                    output_hidden_states=True)
    #we cut the CLS and SEP tokens
    attentions = torch.cat(out[2], dim=0).cpu()[:, :, 1:-1, 1:-1]
    #take mean attention over all layers
    means_heads =  attentions.mean(dim=(0, 2)).detach().numpy()
    #take mean attention over all heads
    means_layers = attentions.mean(dim=(1, 2)).detach().numpy()
    means_heads = shrink(nums, means_heads)
    means_layers = shrink(nums, means_layers)

    return means_heads, means_layers

In [51]:
def getword(arr, text):
    """
    Using a span from a dataset, obtain a word
    """
    ans = ''
    for i in range(len(arr)):
        elem = arr[i]
        if  i != 0 and i != len(arr) - 1 and elem != arr[i-1] + 1:
            ans += ' '
            ans += text[elem]
        else:
            ans += text[elem]
    return ans

In [52]:
def make_target(span, text, model, tokenizer):
    target_toks = tokenizer.encode(getword(span, text))
    toks = tokenizer.encode(text)
    emb_num = model.distilbert.embeddings.word_embeddings.num_embeddings
    target_toks = [i if i < emb_num else tokenizer.unk_token_id for i in target_toks]
    toks = [i if i < emb_num else tokenizer.unk_token_id for i in toks]
    toks = [string for (arr, string) in list(toks_to_words(toks))]
    target_toks = [string for (arr, string) in list(toks_to_words(target_toks))]
    target = []
    #check if the word is in spanned words
    for tok in toks:
        if len(target_toks) > 0 and tok == target_toks[0]:
            target.append(1)
            target_toks = target_toks[1:]
        else:
            target.append(0)
    return toks, target
        

In [53]:
make_target(train.spans.iloc[0], train.text.iloc[0], lern.model, tokenizer)

(['another',
  'violent',
  'and',
  'aggressive',
  'immigrant',
  'killing',
  'a',
  'innocent',
  'and',
  'intelligent',
  'us',
  'citizen',
  '.',
  '.',
  '.',
  '.',
  'sarcasm'],
 [0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [54]:
target = []
data = np.array([])
for span, text in tqdm(zip(train['spans'], train['text'])):
    heads, layers = get_attention_for_words(text, lern.model, tokenizer)
    headlay = np.hstack((heads, layers))
    #print(headlay.shape)
    if len(data) == 0:
        data = headlay
    else:
        data = np.vstack((data, headlay))
    words, new_target = make_target(span, text, lern.model, tokenizer)
    #print(new_target)
    try:
        assert len(new_target) == headlay.shape[0]
    except:
        print(new_target)
        print(words)
        print(headlay.shape)
        print(len(words))
        print(text)
        break
    target += new_target
    

7939it [01:46, 74.59it/s] 


In [55]:
data.shape

(352307, 18)

In [56]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(data, target)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [57]:
trial_data = np.array([])
for text in tqdm(trial['text']):
    heads, layers = get_attention_for_words(text, lern.model, tokenizer)
    headlay = np.hstack((heads, layers))
    if len(trial_data) == 0:
        trial_data = headlay
    else:
        trial_data = np.vstack((trial_data, headlay))
    

100%|██████████| 690/690 [00:06<00:00, 113.46it/s]


In [58]:
train_preds = logreg.predict(data)

In [66]:
train_probas = logreg.predict_proba(data)[:, 1]

In [67]:
for i in range(10):
    train_pred = np.where(train_probas > i/10, 1, 0)
    print(train_pred.sum(), i)

352307 0
32225 1
5145 2
1979 3
1075 4
632 5
372 6
210 7
102 8
45 9


In [69]:
sum(target)

24942

In [68]:
trial_data.shape

(29652, 18)

In [64]:
trial_probas = logreg.predict_proba(trial_data)[:, 1]

In [None]:
trial_probas = logreg.predict_proba(trial_data)[:, 1]
trial_preds = np.where(trial_probas > 0.15, 1, 0)
from sklearn.metrics import accuracy_score, f1_score
print(f1_score(trial_preds, trial_target))

In [73]:
trial_target = []
for span, text in tqdm(zip(trial['spans'], trial['text'])):
    words, new_target = make_target(span, text, lern.model, tokenizer)
    trial_target += new_target
    

690it [00:00, 996.27it/s]


In [77]:
trial_probas = logreg.predict_proba(trial_data)[:, 1]
trial_preds = np.where(trial_probas > 0.22, 1, 0)
from sklearn.metrics import accuracy_score, f1_score
print(f1_score(trial_preds, trial_target))

0.11862068965517242
