# Prepare imports

In [1]:
import numpy as np
import pandas as pd
import os

import torch
from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss
from torch.utils.data import Dataset, TensorDataset

from bertviz import head_view
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, PretrainedConfig, BertPreTrainedModel

I0114 11:09:33.442543 140734975698368 file_utils.py:35] PyTorch version 1.3.1 available.
I0114 11:09:36.255414 140734975698368 file_utils.py:48] TensorFlow version 2.0.0 available.


In [99]:
class BertMultiHeadModel(BertPreTrainedModel): # FAKE SEQUENCE MODEL
    def __init__(self, config):
        super(BertMultiHeadModel, self).__init__(config)
        self.num_labels = [2, 4] # ignore config.num_labels # should be a list!
        self.num_tasks = 2 # CUSTOM EDIT: MANUALLY SPECIFIED NUM_TASKS
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = [nn.Linear(config.hidden_size, self.num_labels[i]) for i in range(self.num_tasks)]
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        task = 0 # 0 for wiki, 1 for fake news
        if type(task) != int:
            raise Exception("BertMulti model first input must be task index (int)!")
        
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier[task](pooled_output) # CUSTOM EDIT: specify which linear layer with task index
        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels[task]), labels.view(-1))
            outputs = (loss,) + outputs
        return outputs  # (loss), logits, (hidden_states), (attentions)

In [3]:
%%javascript
require.config({
  paths: {
      d3: '//cdnjs.cloudflare.com/ajax/libs/d3/3.4.8/d3.min',
      jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',
  }
});

<IPython.core.display.Javascript object>

# Load data

In [4]:
# fake news
body_df = pd.read_csv("../nlp-final-project/data/fake_news_bodies.csv")
stance_df = pd.read_csv("../nlp-final-project/data/fake_news_stances.csv")
stance_to_idx = {}
stances = stance_df["Stance"].drop_duplicates().values
for i, stance in enumerate(stances):
    stance_to_idx[stance] = i
num_stances = len(stance_to_idx)

x_list = []
y_list = []
idx_to_id = {body_id:i for (i, body_id) in enumerate(body_df['Body ID'])}

for body_id, headline, stance in zip(stance_df["Body ID"], stance_df["Headline"], stance_df["Stance"]):
    body = body_df.iloc[idx_to_id[body_id]]["articleBody"]
    x_list.append(headline + " [SEP] " + body)
    y_list.append(stance_to_idx[stance])

In [5]:
print(stance_df['Stance'].unique())

['unrelated' 'agree' 'disagree' 'discuss']


In [61]:
# wiki
cutoff = 0.3
comment_df = pd.read_csv("../nlp-final-project/data/attack_annotated_comments.tsv", sep ='\t')
comment_df = comment_df.drop(columns=['logged_in', 'ns', 'sample'])
comment_df["comment"] = comment_df["comment"].apply(lambda x: x.replace("NEWLINE_TOKEN", " "))
comment_df["comment"] = comment_df["comment"].apply(lambda x: x.replace("TAB_TOKEN", " "))

annotation_df = pd.read_csv("../nlp-final-project/data/attack_annotations.tsv",  sep='\t')
annotation_df = (annotation_df.groupby("rev_id")["attack"].mean() > cutoff)
annotation_df = annotation_df.to_frame().reset_index()
final_df = pd.merge(comment_df, annotation_df, how='inner', on=['rev_id'])

In [62]:
final_df.head()

Unnamed: 0,rev_id,comment,year,split,attack
0,37675,`- This is not ``creative``. Those are the di...,2002,train,False
1,44816,` :: the term ``standard model`` is itself le...,2002,train,False
2,49851,"True or false, the situation as of March 200...",2002,train,False
3,89320,"Next, maybe you could work on being less cond...",2002,dev,True
4,93890,This page will need disambiguation.,2002,train,False


# Load multi-head model

In [6]:
do_lower_case = True
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=do_lower_case)

I0114 11:09:48.550842 140734975698368 tokenization_utils.py:398] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /Users/johnhallman/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [100]:
model_version = "../nlp-final-project/multi-epoch-11/pytorch_model.bin"
config_name = PretrainedConfig().from_json_file("../nlp-final-project/multi-epoch-11/config.json")
multi_model = BertMultiHeadModel.from_pretrained(model_version, config=config_name)

I0114 14:47:44.779740 140734975698368 modeling_utils.py:403] loading weights file ../nlp-final-project/multi-epoch-11/pytorch_model.bin


# Load single-head model (wiki and fake news)

In [97]:
untrained_model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', config=config_name)

I0114 14:47:18.205194 140734975698368 modeling_utils.py:406] loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin from cache at /Users/johnhallman/.cache/torch/transformers/aa1ef1aede4482d0dbcd4d52baad8ae300e60902e88fcb0bebdec09afd232066.36ca03ab34a1a5d5fa7bc3d03d55c4fa650fed07220e2eeebc06ce58d0e9a157
I0114 14:47:21.815044 140734975698368 modeling_utils.py:480] Weights of BertForSequenceClassification not initialized from pretrained model: ['classifier.weight', 'classifier.bias']
I0114 14:47:21.816293 140734975698368 modeling_utils.py:483] Weights from pretrained model not used in BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']


In [63]:
model_version = "../nlp-final-project/fake-epoch-10/pytorch_model.bin"
config_name = PretrainedConfig().from_json_file("../nlp-final-project/fake-epoch-10/config.json")
fake_model = BertForSequenceClassification.from_pretrained(model_version, config=config_name)

I0114 13:00:18.270956 140734975698368 modeling_utils.py:403] loading weights file ../nlp-final-project/fake-epoch-10/pytorch_model.bin


In [78]:
model_version = "../nlp-final-project/wiki-epoch-10/pytorch_model.bin"
config_name = PretrainedConfig().from_json_file("../nlp-final-project/wiki-epoch-10/config.json")
wiki_model = BertForSequenceClassification.from_pretrained(model_version, config=config_name)

I0114 13:46:40.481781 140734975698368 modeling_utils.py:403] loading weights file ../nlp-final-project/wiki-epoch-10/pytorch_model.bin


# Test classification

In [66]:
def classify(model, sentence, verbose=True):
    output = model(torch.tensor(tokenizer.encode(sentence, pad_to_max_length=True, max_length=512)).unsqueeze(0)) # head = 1
    if verbose:
        print(int(torch.argmax(output[0], axis=1)[0]), " - ", output[0])
    return torch.argmax(output[0], axis=1)

In [67]:
print("single")
for i in range(2):
    classify(fake_model, x_list[i])
    print(y_list[i])
print("multi")
for i in range(2):
    classify(multi_model, x_list[i])
    print(y_list[i])

single
0  -  tensor([[ 3.9179, -0.7485, -2.7601, -0.2044]], grad_fn=<AddmmBackward>)
0
1  -  tensor([[-3.0004,  1.4779,  0.1758,  0.9192]], grad_fn=<AddmmBackward>)
1
multi
2  -  tensor([[-0.1482, -0.5492,  0.2845, -0.2834]], grad_fn=<AddmmBackward>)
0
1  -  tensor([[ 0.0770,  0.1975,  0.0125, -0.1444]], grad_fn=<AddmmBackward>)
1


# Visualization code

In [12]:
def show_head_view(model, tokenizer, sentence_a, sentence_b=None):
    inputs = tokenizer.encode_plus(sentence_a, sentence_b, return_tensors='pt', add_special_tokens=True)
    token_type_ids = inputs['token_type_ids']
    input_ids = inputs['input_ids']
    attention = model(input_ids, token_type_ids=token_type_ids)[-1]
    input_id_list = input_ids[0].tolist() # Batch index 0
    tokens = tokenizer.convert_ids_to_tokens(input_id_list)
    if sentence_b:
        sentence_b_start = token_type_ids[0].tolist().index(1)
    else:
        sentence_b_start = None
    head_view(attention, tokens, sentence_b_start)

In [13]:
print(stance_df['Stance'].unique())

['unrelated' 'agree' 'disagree' 'discuss']


In [48]:
def concat(s_list):
    s = ""
    for i in s_list: s += i + " "
    return s[:-1]

def get_sentences(i, max_length=48):
    s = concat(x_list[i].split()[:max_length])
    try:
        s_a, s_b = s.split('[SEP]')[0], s.split('[SEP]')[1]
    except Exception as e:
        #print(s)
        s, s_a, s_b = None, None, None
    return s, s_a, s_b

In [49]:
s, _, _ = get_sentences(0)
s

"Police find mass graves with at least '15 bodies' near Mexico town where 43 students disappeared after police clash [SEP] Danny Boyle is directing the untitled film Seth Rogen is being eyed to play Apple co-founder Steve Wozniak in Sony’s Steve Jobs biopic. Danny Boyle is directing the"

In [112]:
count = 0
for i in range(100000):
    s = final_df['comment'].iloc[i]
    if len(s.split()) < 50 and final_df['attack'].iloc[i]:
        count += 1
        print("\ni: ", i)
        print(s)
        if count == 20:
            break


i:  52
`  :Click on my ``Annoying Users`` link! I guess Massachusetts is a republic, then?  Defending facts against the lies of determined zealots seems to be a particular sticking-place for Wikipedia (   02:58 Feb 21, 2003 (UTC)`

i:  60
  Iraq is not good  ===  ===  USA is bad   

i:  102
Anon  :What the heck are you talking about? This is an encyclopedia, not a book store. 

i:  121
`  Buddha - ``Some suggest that victims should be referred to additional resources and made aware of their rights under policy and law.``  That's an incredibly silly way to phrase that.  `

i:  351
i have a dick, its bigger than yours! hahaha

i:  418
`  Robert, Personal attacks are not acceptable on Wikipedia, e.g. ``You must be braindead``. See Wikiquette for more details. Thanks.  `

i:  463
  :get the damn chronology correct.  I don't think your tone is appropriate, and I do not appreciate it.  

i:  492
  == renault ==  you sad little bpy for driving a renault clio which has no vaa voom so there an

In [113]:
#model = untrained_model
#model = fake_model
model = wiki_model
#model = multi_model

"""
s_a = "this guy is clearly a moron"
s_b = "you are an idiot"
#s_a = "yes this is true"
#s_b = "no this is false"
s = s_a + " [SEP] " + s_b
classify(fake_model, s)
classify(wiki_model, s)
classify(multi_model, s)
"""

"""
count = 0
i_hold = 0
for i in range(320, len(x_list)):
    print("i: ", i)
    if y_list[i] not in [0]: continue
    s, s_a, s_b = get_sentences(i, max_length=40)
    if s == None: continue
    pred = classify(model, x_list[i], verbose=False)
    #if y_list[i] != pred:
    if len(s) < 512:
        count += 1
        if count == 1: # iterate through dataset with this index
            i_hold = i
            break

print("ihold: ", i_hold)
print(s_a)
print("\n---------------\n")
print(s_b)
print(y_list[i_hold])
classify(model, x_list[i_hold])
"""

s = final_df['comment'].iloc[802]

if len(s) < 512:
    show_head_view(model, tokenizer, s)
    #show_head_view(model, tokenizer, s_a, s_b)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>