In [18]:
import numpy as np
import pandas as pd
import time
import sys
import os
import argparse
import shutil

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, WeightedRandomSampler, SequentialSampler

from bert_dataloader import get_wiki_data, get_fake_data
from bert_models import BertMultiHeadModel # Custom model 

from transformers import BertModel, BertTokenizer, AdamW, BertPreTrainedModel, get_linear_schedule_with_warmup

In [3]:
ls

Debug Models.ipynb        bert_dataloader.py        models.py
Huggingface Bert.ipynb    bert_models.py            [1m[34mprecomputed[m[m/
Multi Head model.ipynb    bert_train.py             train.py
Test dataloader.ipynb     bert_train_multi_head.py  [1m[34mwiki[m[m/
[1m[34m__pycache__[m[m/              dataloader.py


In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
wiki_data = get_wiki_data(tokenizer)
fake_data = get_fake_data(tokenizer)

I0112 00:35:30.179898 140734747264448 tokenization_utils.py:398] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /Users/johnhallman/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


wiki data files exist already! loading precomputed values


  self.x = torch.tensor(x)
  self.y = torch.tensor(y)


fake news data files exist already! loading precomputed values


In [8]:
wiki_train, wiki_dev, wiki_test = wiki_data['train'], wiki_data['dev'], wiki_data['test']
fake_train, fake_dev, fake_test = fake_data['train'], fake_data['dev'], fake_data['test']
wiki_num_labels, fake_num_labels = wiki_train.num_labels(), fake_train.num_labels()
wiki_n, fake_n = len(wiki_train), len(fake_train)

In [10]:
model = BertMultiHeadModel.from_pretrained('bert-base-uncased', num_labels=[2, 4])

I0112 00:39:45.302918 140734747264448 configuration_utils.py:185] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /Users/johnhallman/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c
I0112 00:39:45.304229 140734747264448 configuration_utils.py:199] Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": [
    2,
    4
  ],
  "output_attentions": false,
  "output_hidden_states": false,


In [11]:
# evaluate model accuracy and loss on given dataset
def evaluate(model, wiki_data, fake_data, batch_size=32, debugging=False):
    t_start = time.time()
    model.eval()
    wiki_loader = DataLoader(wiki_data, batch_size=batch_size)
    fake_loader = DataLoader(fake_data, batch_size=batch_size)
    for name, loader, n in zip(['wiki', 'fake'], [wiki_loader, fake_loader], [len(wiki_data), len(fake_data)]):
        predictions = np.zeros(n) # used for confusion matrix
        truth = np.zeros(n)
        total_loss = 0
        curr = 0
        with torch.no_grad():
            for (x, y) in dataloader:
                pred = model(x, labels=y)
                predictions[curr:min(n,curr+batch_size)] = torch.argmax(pred[1], axis=1)
                truth[curr:min(n,curr+batch_size)] = y
                total_loss += pred[0].item()
                curr += batch_size
                if debugging: break # one batch if debugging
        mean_loss = total_loss / n
        mean_accuracy = np.mean(predictions == truth)
        time_taken = time.time() - t_start
        print("evaluation for " + name)
        print("time {}: loss {}, accuracy {}, mean prediction {}".format(
            time_taken, mean_loss, mean_accuracy, np.mean(predictions)))

def create_sampler(train):
    frequencies = {}
    for pair in train: # pair = (x, y)
        if pair[1].item() not in frequencies:
            frequencies[pair[1].item()] = 0
        frequencies[pair[1].item()] += 1
    weights = []
    for pair in train:
        weights.append(1/frequencies[pair[1].item()])
    sampler = WeightedRandomSampler(weights=weights, num_samples=len(train))
    return sampler

In [19]:
balance=True

# create dataloaders
wiki_sampler = create_sampler(wiki_train) if balance else SequentialSampler(wiki_train)
wiki_dataloader = DataLoader(wiki_train, sampler=wiki_sampler, batch_size=32)
fake_sampler = create_sampler(fake_train) if balance else SequentialSampler(fake_train)
fake_dataloader = DataLoader(fake_train, sampler=fake_sampler, batch_size=32)

model = BertMultiHeadModel.from_pretrained('bert-base-uncased', num_labels=[2, 4])
optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = min(len(wiki_dataloader), len(fake_dataloader)) * 2 # number of batches * number of epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)

I0112 10:37:03.231394 140734747264448 configuration_utils.py:185] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /Users/johnhallman/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c
I0112 10:37:03.232532 140734747264448 configuration_utils.py:199] Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": [
    2,
    4
  ],
  "output_attentions": false,
  "output_hidden_states": false,


In [21]:
debugging=False
model.train()
for epoch in range(1 if debugging else args['epochs']):
    for (x_wiki, y_wiki), (x_fake, y_fake) in zip(wiki_dataloader, fake_dataloader): # different shuffle each time
        optimizer.zero_grad()
        # train on wiki
        output = model(0, x_wiki, labels=y_wiki) # 0 => wiki head
        loss, preds = output[0], output[1]
        total_loss += loss.item()
        curr += args['batch_size']
        loss.backward() # loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args['clip_grad_norm'])
        optimizer.step()
        scheduler.step()
        # train on fake news
        output = model(1, x_fake, labels=y_fake) # 1 => fake news head
        loss, preds = output[0], output[1]
        total_loss += loss.item()
        curr += args['batch_size']
        loss.backward() # loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args['clip_grad_norm'])
        optimizer.step()
        scheduler.step()
        if debugging: break # only 1 batch when debugging

NameError: name 'args' is not defined

In [None]:
model.train()
for epoch in range(1 if debugging else args['epochs']):
    for (x_batch, y_batch) in train_dataloader: # different shuffle each time
        optimizer.zero_grad()
        output = model(x_batch, labels=y_batch)
        loss, preds = output[0], output[1]
        predictions[curr:min(n,curr+args['batch_size'])] = torch.argmax(preds, axis=1)
        truth[curr:min(n,curr+args['batch_size'])] = y_batch
        total_loss += loss.item()
        curr += args['batch_size']
        loss.backward() # loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args['clip_grad_norm'])
        optimizer.step()
        scheduler.step()