In [1]:
import numpy as np
import pandas as pd
import os
import sys
import time

In [61]:
import torch
from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss
from torch.utils.data import Dataset, TensorDataset
import transformers # huggingface
# pretrained weights = 'bert-base-uncased'
from transformers import BertPreTrainedModel, BertModel, BertTokenizer, AdamW, BertForSequenceClassification

In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load Wiki dataset

In [4]:
# Tokenizer that goes with the pretrained 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

I0111 12:45:41.105871 140734958245312 tokenization_utils.py:398] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /Users/johnhallman/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [5]:
# Tab-separated inputs: one file of comments and one file of annotations.
# Later cells group the annotations by rev_id and merge them onto the comments on that key.
comment_df = pd.read_csv("../data/attack_annotated_comments.tsv", sep ='\t')
annotation_df = pd.read_csv("../data/attack_annotations.tsv",  sep='\t')

In [6]:
# cutoff: a comment is labelled an attack when the mean of its 'attack' annotations exceeds this.
# max_length: number of token ids every encoded text is padded to. bert-base-uncased has
# only 512 position embeddings, so anything longer cannot be fed to the model.
cutoff, max_length = 0.2, 512

In [7]:
# Trim unused columns and replace the literal NEWLINE_TOKEN / TAB_TOKEN markers with spaces.
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
comment_df = (
    comment_df
    .drop(columns=['logged_in', 'ns', 'sample'], errors='ignore')
    .assign(
        comment=lambda df: (
            df["comment"]
            .str.replace("NEWLINE_TOKEN", " ", regex=False)
            .str.replace("TAB_TOKEN", " ", regex=False)
        )
    )
)

# One boolean label per rev_id: True when the mean 'attack' annotation is above cutoff.
annotation_df = (
    annotation_df.groupby("rev_id")["attack"].mean()
    .gt(cutoff)
    .to_frame()
    .reset_index()
)

# Keep only the comments that have a label.
final_df = pd.merge(comment_df, annotation_df, how='inner', on=['rev_id'])


def f(s):
    """Encode one string as a numpy array of token ids, padded to ``max_length``."""
    return np.array(tokenizer.encode(s, pad_to_max_length=True, max_length=max_length))

In [8]:
# Build one dataset per distinct value of the 'split' column.
# NOTE(review): CustomDataset is neither defined nor imported in any cell visible above —
# confirm where it comes from; on a fresh kernel this cell would raise NameError.
datasets = {}
for split in final_df['split'].unique(): # train, dev, test
    final_split = final_df[final_df['split'] == split]
    x = final_split['comment'].apply(f).values # tokenize!
    y = final_split['attack'].astype(int).values
    datasets[split] = CustomDataset(x, y)

KeyboardInterrupt: 

In [None]:
# Share of comments whose raw text is longer than 512 characters
# (len(s) counts characters of the string, not tokens).
print(np.mean([len(s) > 512 for s in comment_df['comment']]))

# Load fake news dataset

In [None]:
# Fake news dataset: article bodies and the headline/stance rows.
# A later cell links the two through their 'Body ID' columns.
body_df = pd.read_csv("../data/fake_news_bodies.csv")
stance_df = pd.read_csv("../data/fake_news_stances.csv")

In [None]:
# Peek at the first two stance rows.
stance_df.head(2)

In [None]:
# Peek at the first two article-body rows.
body_df.head(2)

In [None]:
%%time
# Despite its name, idx_to_id maps a 'Body ID' value to that body's row position in body_df.
idx_to_id = {body_id:i for (i, body_id) in enumerate(body_df['Body ID'])}
# Integer class index for every distinct stance string, in order of first appearance.
stance_to_idx = {stance: i for i, stance in enumerate(stance_df["Stance"].unique())}
separator = ' ' + tokenizer.sep_token + ' '


def f(s):
    """Encode one string as a numpy array of token ids, padded to ``max_length``."""
    # Uses the notebook-wide max_length constant instead of a second hard-coded copy.
    return np.array(tokenizer.encode(s, pad_to_max_length=True, max_length=max_length))


# Pull the text column out once; body_df.iloc[i]['articleBody'] inside the loop
# would build a full row object on every iteration just to read one field.
article_bodies = body_df['articleBody'].values

x_list = []
y_list = []
for body_id, headline, stance in zip(stance_df["Body ID"], stance_df["Headline"], stance_df["Stance"]):
    body = article_bodies[idx_to_id[body_id]]
    # Headline and body are joined into a single string around the tokenizer's separator token.
    text = headline + separator + body
    x_list.append(f(text))
    y_list.append(stance_to_idx[stance])

x = np.array(x_list)
y = np.array(y_list)

In [None]:
# Wrap the encoded texts and stance indices in a dataset object.
# NOTE(review): CustomDataset is neither defined nor imported in any cell visible above — confirm its source.
dataset = CustomDataset(x, y)

In [None]:
# Character length of each combined "headline + separator + body" string,
# one entry per row of stance_df.
lengths = [
    len(headline + separator + body_df.iloc[idx_to_id[body_id]]['articleBody'])
    for body_id, headline, _stance in zip(
        stance_df["Body ID"], stance_df["Headline"], stance_df["Stance"]
    )
]

In [None]:
# Average of the character lengths collected in `lengths`.
print(np.mean(np.array(lengths)))

# Great! Let's prepare some models

In [None]:
# Single-head sequence classifier with 4 output labels, loaded from 'bert-base-uncased'.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)

In [26]:
# Toy batch: three short strings, each encoded and padded to 512 token ids.
x = torch.tensor([
    tokenizer.encode(sentence, pad_to_max_length=True, max_length=512)
    for sentence in ("testing", "success!", "hello world!")
])

In [None]:
# Run the toy batch through the model together with one label per row, then display the result.
pred = model(x, labels=torch.tensor([[1], [2], [3]]))
pred

In [None]:
# Index of the largest value along dimension 1 of the second element of `pred`
# (presumably the logits, since labels were passed — confirm against the model's return order).
pred[1].argmax(dim=1)

# Let's try a multi-headed model

In [80]:
class BertMultiHeadModel(BertPreTrainedModel): # FAKE SEQUENCE MODEL
    """One BERT encoder feeding several task-specific linear heads.

    ``config.num_labels`` must be a list with one entry per task; entry ``i``
    is the output width of head ``i``. A head of width 1 is trained as a
    regressor (MSE loss), any other width as a classifier (cross-entropy).
    """

    def __init__(self, config):
        super(BertMultiHeadModel, self).__init__(config)
        self.num_labels = config.num_labels # should be a list!
        # One head per entry of num_labels (2 for num_labels=[2, 4]) instead of a hard-coded count.
        self.num_tasks = len(self.num_labels)

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # nn.ModuleList rather than a plain Python list: only registered submodules
        # show up in parameters() and state_dict() and are moved by .to(device).
        # With a plain list the heads would never be trained or saved.
        self.classifier = nn.ModuleList(
            [nn.Linear(config.hidden_size, n_out) for n_out in self.num_labels]
        )

        self.init_weights()

    def forward(
        self,
        task,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        """Encode the inputs with BERT and apply the head selected by ``task``.

        Args:
            task: int index of the head to use.
            input_ids, attention_mask, token_type_ids, position_ids, head_mask,
                inputs_embeds: forwarded unchanged to the underlying ``BertModel``.
            labels: optional targets; when given, a loss is computed and
                prepended to the returned tuple.

        Returns:
            Tuple ``(loss), logits, (hidden_states), (attentions)`` where the
            parenthesised entries are present only when available.

        Raises:
            TypeError: if ``task`` is not a plain int.
        """
        # bool is a subclass of int; reject it explicitly, as the original exact-type check did.
        if isinstance(task, bool) or not isinstance(task, int):
            raise TypeError("BertMulti model first input must be task index (int)!")

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier[task](pooled_output) # CUSTOM EDIT: specify which linear layer with task index

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

        if labels is not None:
            # Look at the width of the selected head: comparing the whole
            # num_labels list to 1 is always False and would make the
            # regression branch unreachable.
            task_num_labels = self.num_labels[task]
            if task_num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, task_num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)

In [81]:
# Two heads on top of 'bert-base-uncased': task 0 has 2 outputs, task 1 has 4.
multi_model = BertMultiHeadModel.from_pretrained('bert-base-uncased', num_labels=[2,4])

I0111 13:26:08.531738 140734958245312 configuration_utils.py:185] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /Users/johnhallman/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c
I0111 13:26:08.534708 140734958245312 configuration_utils.py:199] Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": [
    2,
    4
  ],
  "output_attentions": false,
  "output_hidden_states": false,


In [82]:
# Toy batch of three short strings, each encoded and padded to 512 token ids,
# with one integer label per string stored as a (3, 1) column.
x = torch.tensor([
    tokenizer.encode(sentence, pad_to_max_length=True, max_length=512)
    for sentence in ("testing", "success!", "hello world!")
])
y = torch.tensor([1, 2, 3]).unsqueeze(1)

In [83]:
# Show the encoded batch and its labels.
print(x)
print(y)

tensor([[ 101, 5604,  102,  ...,    0,    0,    0],
        [ 101, 3112,  999,  ...,    0,    0,    0],
        [ 101, 7592, 2088,  ...,    0,    0,    0]])
tensor([[1],
        [2],
        [3]])


In [84]:
# Forward pass through head 1 without labels: the result is a 1-tuple holding the logits.
multi_model(1, x)

(tensor([[-0.6370, -0.4950, -0.1461,  0.1204],
         [-0.5994, -0.5887, -0.1453,  0.1337],
         [-0.5696, -0.6273, -0.1401,  0.1375]], grad_fn=<AddmmBackward>),)

In [85]:
# Same call with labels: the returned tuple now starts with the loss, followed by the logits.
output = multi_model(1, x, labels=y)
output

(tensor(1.3053, grad_fn=<NllLossBackward>),
 tensor([[-0.6370, -0.4950, -0.1461,  0.1204],
         [-0.5994, -0.5887, -0.1453,  0.1337],
         [-0.5696, -0.6273, -0.1401,  0.1375]], grad_fn=<AddmmBackward>))

tensor([1, 1, 1])