In [29]:
import numpy as np
import pandas as pd
import os
import sys
import time

In [427]:
import torch
from torch.utils.data import Dataset, TensorDataset
import transformers # huggingface
# pretrained weights = 'bert-base-uncased'
from transformers import BertForSequenceClassification, BertTokenizer, AdamW

In [49]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [333]:
class CustomDataset(Dataset):
    def __init__(self, x, y):
        super().__init__()
        self.x = x
        self.y = y
        self._num_labels = np.max(y) + 1

    def num_labels(self):
        return self._num_labels

    def __getitem__(self, index):
        return (self.x[index], self.y[index])

    def __len__(self):
        return len(self.x)

# Load Wiki dataset

In [237]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

I0110 17:14:37.334326 140734822974912 tokenization_utils.py:398] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /Users/johnhallman/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [283]:
comment_df = pd.read_csv("../data/attack_annotated_comments.tsv", sep ='\t')
annotation_df = pd.read_csv("../data/attack_annotations.tsv",  sep='\t')

In [284]:
cutoff, max_length = 0.2, 1000

In [285]:
comment_df = comment_df.drop(columns=['logged_in', 'ns', 'sample'])
comment_df["comment"] = comment_df["comment"].apply(lambda x: x.replace("NEWLINE_TOKEN", " "))
comment_df["comment"] = comment_df["comment"].apply(lambda x: x.replace("TAB_TOKEN", " "))

annotation_df = (annotation_df.groupby("rev_id")["attack"].mean() > cutoff)
annotation_df = annotation_df.to_frame().reset_index()

final_df = pd.merge(comment_df, annotation_df, how='inner', on=['rev_id'])
f = lambda s: np.array(tokenizer.encode(s, pad_to_max_length=True, max_length=max_length))

In [286]:
datasets = {}
for split in final_df['split'].unique(): # train, dev, test
    final_split = final_df[final_df['split'] == split]
    x = final_split['comment'].apply(f).values # tokenize!
    y = final_split['attack'].astype(int).values
    datasets[split] = CustomDataset(x, y)

NameError: name 'CustomDataset' is not defined

In [393]:
print(np.mean([len(s) > 512 for s in comment_df['comment']]))

0.20610370779534626


# Load fake news dataset

In [318]:
# fake news dataset
body_df = pd.read_csv("../data/fake_news_bodies.csv")
stance_df = pd.read_csv("../data/fake_news_stances.csv")

CPU times: user 85.4 ms, sys: 15 ms, total: 100 ms
Wall time: 103 ms


In [310]:
stance_df.head(2)

Unnamed: 0,Headline,Body ID,Stance
0,Police find mass graves with at least '15 bodi...,712,unrelated
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree


In [311]:
body_df.head(2)

Unnamed: 0,Body ID,articleBody
0,0,A small meteorite crashed into a wooded area i...
1,4,Last week we hinted at what was to come as Ebo...


In [319]:
%%time
idx_to_id = {body_id:i for (i, body_id) in enumerate(body_df['Body ID'])}
stance_to_idx = {stance: i for i, stance in enumerate(stance_df["Stance"].unique())}
separator = ' ' + tokenizer.sep_token + ' '

x_list = []
y_list = []
f = lambda s: np.array(tokenizer.encode(s, pad_to_max_length=True, max_length=1000))
for body_id, headline, stance in zip(stance_df["Body ID"], stance_df["Headline"], stance_df["Stance"]):
    body = body_df.iloc[idx_to_id[body_id]]['articleBody']
    text = headline + separator + body
    x_list.append(f(text))
    y_list.append(stance_to_idx[stance])

x = np.array(x_list)
y = np.array(y_list)

CPU times: user 9min 33s, sys: 865 ms, total: 9min 34s
Wall time: 9min 34s


In [334]:
dataset = CustomDataset(x, y)

In [396]:
lengths = []
for body_id, headline, stance in zip(stance_df["Body ID"], stance_df["Headline"], stance_df["Stance"]):
    body = body_df.iloc[idx_to_id[body_id]]['articleBody']
    text = headline + separator + body
    lengths.append(len(text))

In [422]:
print(np.mean(np.array(lengths)))

2283.9041263107342


In [423]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)

I0111 10:40:35.677147 140734822974912 configuration_utils.py:185] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /Users/johnhallman/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c
I0111 10:40:35.681282 140734822974912 configuration_utils.py:199] Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 4,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": 

In [424]:
x = torch.tensor([tokenizer.encode("testing", pad_to_max_length=True, max_length=512), 
                  tokenizer.encode("success!", pad_to_max_length=True, max_length=512),
                  tokenizer.encode("hello world!", pad_to_max_length=True, max_length=512)])

In [426]:
model(x, labels=torch.tensor([[1], [2], [3]]))

(tensor(1.4630, grad_fn=<NllLossBackward>),
 tensor([[-0.0240,  0.1454, -0.5355,  0.0729],
         [-0.0184,  0.1016, -0.5964, -0.0032],
         [-0.0064,  0.0452, -0.6172, -0.0529]], grad_fn=<AddmmBackward>))

In [429]:
optimizer = AdamW(model.parameters())

In [430]:
model.train()
for i in range(20):
    model.zero_grad()
    output = model(x, labels=torch.tensor([[1], [2], [3]]))
    loss = output[0]
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    loss.backward()
    optimizer.step()

In [431]:
model(x, labels=torch.tensor([[1], [2], [3]]))

(tensor(1.2090, grad_fn=<NllLossBackward>),
 tensor([[-5.0128,  2.1330,  1.6461,  1.4967],
         [-5.0695,  2.2992,  1.5283,  1.9602],
         [-5.5645,  2.3028,  1.8427,  1.7887]], grad_fn=<AddmmBackward>))

In [433]:
model(torch.tensor([1,2,3]).unsqueeze(0))

(tensor([[-5.0477,  1.7459,  2.0362,  1.6291]], grad_fn=<AddmmBackward>),)

In [434]:
model(np.array([[1],[2]]))

TypeError: 'int' object is not callable

In [436]:
int(torch.max(torch.tensor([1,2,3])))

3