In [1]:
import os
# Point the Hugging Face cache at scratch storage. This is set before the
# later cell that imports `transformers`.
os.environ.update(TRANSFORMERS_CACHE='/scratch/sagarsj42/transformers')

In [2]:
import datetime
import random
import time

import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

from data_io import *

In [3]:
# The training data sits in a sibling `data` directory, resolved relative to
# the current working directory.
data_dir = os.path.join(
    os.getcwd(),
    os.pardir,
    'data',
    'pan21-author-profiling-training-2021-03-14',
)
data_dir

'/scratch/sagarsj42/pan2021-profiling-hate-speech-spreaders/code/../data/pan21-author-profiling-training-2021-03-14'

In [4]:
# Checkpoints are written to a sibling `save` directory, resolved relative to
# the current working directory.
# NOTE(review): the checkpoint name says "uncased" while the tokenizer/model
# cells load 'bert-large-cased' — confirm the intended name before renaming.
save_dir = os.path.join(os.getcwd(), os.pardir, 'save')
save_path = os.path.join(save_dir, 'bert_large_uncased_best_dev_acc.pth')
save_dir

'/scratch/sagarsj42/pan2021-profiling-hate-speech-spreaders/code/../save'

In [5]:
# Choose the compute device once; later cells move every batch with
# `.to(device)`, so `device` must be bound on both branches.
if torch.cuda.is_available():
    device = torch.device('cuda')
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use GPU:', torch.cuda.get_device_name(0))
else:
    # Previously `device` was left undefined here, which made every later
    # `b.to(device)` raise NameError on a CPU-only machine.
    device = torch.device('cpu')
    print('No GPU available, using the CPU instead.')

There are 4 GPU(s) available.
We will use GPU: GeForce GTX 1080 Ti


In [6]:
# Load the English portion of the corpus through `get_single_split`
# (presumably provided by the `from data_io import *` star-import — verify).
# NOTE(review): assumed to return a (train, dev) pair of DataFrames — confirm
# against data_io.
en_train, en_dev = get_single_split(data_dir, lang='en')
# Show both shapes as a quick sanity check on the split sizes.
en_train.shape, en_dev.shape

((32000, 3), (8000, 3))

In [7]:
# Eyeball ten randomly drawn training rows.
en_train.sample(10)

Unnamed: 0,author_id,tweet,label
23187,a37a032f4a0c42bc3ba3663db6907bfb,I’m mad patient when it come to my Lil boy eve...,1
19521,88d8ac4ae2d91e681ee8a8d3b60fd911,#USER# Could try #USER# or #USER#,0
6309,3770a07b212c1096c26e5a1f1556fbd1,Each day I realize I’m not any different than ...,0
1121,0e86e9b6ba971cbc5a117c4af6fad9a2,NY Democrats Disappointed in Sen. Gillibrand's...,0
20264,8e5a604d6328d4b15d119b9601f5d3c2,RT #USER#: The fight will be over when you get...,0
10287,4cbee4f8451fb69d309b7a6ccf7709f9,"RT #USER#: #USER# That includes you, John Kerr...",0
12633,5b50fcecc61c850e961ff307bdce67a0,"#USER# Thought your tits were better natural ,...",0
19897,8b525999b04b19255b32365e49b281ac,You may need to step away for a break from you...,0
26185,b496caf332cb0ba97d2acefc44f153ac,As someone else said “Content of their Charact...,0
10979,5002d161db83e49e179af61703bccfc8,RT #USER#: Let the music take over the night,1


In [8]:
# Extract the text and label columns of the training split as arrays.
tweets = en_train['tweet'].values
labels = en_train['label'].values

# Display both shapes; they should agree.
tweets.shape, labels.shape

((32000,), (32000,))

In [9]:
# Load the tokenizer that belongs to the 'bert-large-cased' checkpoint, the
# same checkpoint name the model cells use.
tokenizer = BertTokenizer.from_pretrained('bert-large-cased')
# Display the tokenizer's repr (vocabulary size, special tokens, ...).
tokenizer

PreTrainedTokenizer(name_or_path='bert-large-cased', vocab_size=28996, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [10]:
# Length, in tokens, of the longest training tweet once the special tokens are
# added; 0 when there are no tweets.
max_length = max(
    (len(tokenizer.encode(text, add_special_tokens=True)) for text in tweets),
    default=0,
)

print('Max length:', max_length)

Max length: 81


In [11]:
def prepare_dataset(tokenizer, index, tweets, labels, max_length=128):
    """Tokenise tweets and bundle them with their row ids and labels.

    Args:
        tokenizer: Object exposing ``encode_plus`` (a ``BertTokenizer`` in
            this notebook).
        index: One integer id per tweet; stored so predictions can be traced
            back to their source rows.
        tweets: Iterable of tweet strings.
        labels: One integer label per tweet.
        max_length: Fixed sequence length every tweet is padded or truncated
            to. Defaults to 128, the value that used to be hard-coded.

    Returns:
        A ``TensorDataset`` whose items are
        ``(index, input_ids, attention_mask, label)``.
    """
    input_ids = list()
    attention_masks = list()

    for tweet in tweets:
        encoded_dict = tokenizer.encode_plus(
            tweet,
            add_special_tokens = True,
            max_length = max_length,
            # `padding='max_length'` replaces the deprecated
            # `pad_to_max_length=True` flag.
            padding = 'max_length',
            truncation = True,
            return_attention_mask = True,
            return_tensors = 'pt'
        )

        # Each entry has a leading batch dimension of 1 (return_tensors='pt').
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    # Concatenate the per-tweet tensors along the batch dimension.
    input_ids = torch.cat(input_ids)
    attention_masks = torch.cat(attention_masks)
    index = torch.tensor(index)
    labels = torch.tensor(labels)

    dataset = TensorDataset(index, input_ids, attention_masks, labels)

    return dataset

In [12]:
# Tokenise both splits, keeping the DataFrame index so each prediction can be
# mapped back to its row.
en_train_ds = prepare_dataset(
    tokenizer, en_train.index.values, en_train.tweet.values, en_train.label.values
)
en_dev_ds = prepare_dataset(
    tokenizer, en_dev.index.values, en_dev.tweet.values, en_dev.label.values
)

batch_size = 16

# shuffle=True makes DataLoader build a RandomSampler over the dataset;
# shuffle=False yields a SequentialSampler, so the dev order stays fixed.
train_dataloader = DataLoader(en_train_ds, batch_size=batch_size, shuffle=True)
dev_dataloader = DataLoader(en_dev_ds, batch_size=batch_size, shuffle=False)

# Number of batches per split.
len(train_dataloader), len(dev_dataloader)



(2000, 500)

In [13]:
# Binary sequence classifier built on the 'bert-large-cased' checkpoint; only
# loss and logits are needed, so attentions and hidden states are switched off.
# The model is placed on the GPU immediately.
model = BertForSequenceClassification.from_pretrained(
    'bert-large-cased',
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
).cuda()
model

Some weights of the model checkpoint at bert-large-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at 

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1

In [15]:
epochs = 5
# The scheduler is stepped once per training batch, so the schedule spans
# every batch of every epoch.
total_steps = epochs * len(train_dataloader)

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Linear decay with no warm-up phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)


NameError: name 'model' is not defined

In [16]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    `preds` holds per-class scores with classes along axis 1; `labels` holds
    the matching class ids.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    hits = np.sum(predicted == expected)

    return hits / len(expected)

In [17]:
def format_time(elapsed):
    """Render a duration given in seconds as a timedelta string (h:mm:ss).

    The duration is rounded to the nearest whole second first.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)

    return str(duration)

In [17]:
# Seed every RNG in play so a run can be repeated.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed(seed_val)

training_stats = list()
total_t0 = time.time()
# Best dev accuracy seen across ALL epochs. This must live outside the epoch
# loop: resetting it every epoch made each epoch overwrite the checkpoint, so
# the saved file was the last epoch rather than the best one.
best_dev_accuracy = 0.0

for epoch in range(epochs):
    print('\n==== Epoch {:} / {:} ===='.format(epoch + 1, epochs))
    print('Training ....')

    # ---- Training pass ----
    t0 = time.time()
    total_train_loss = 0.0
    total_train_accuracy = 0.0
    model.train()

    for step, batch in enumerate(train_dataloader):
        # Progress report every 100 batches (skipping the very first one).
        if step % 100 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('\t\tBatch {:>5,} of {:>5,}. Elapsed {:}'.format(step, len(train_dataloader), elapsed))

        b_index, b_input_ids, b_attention_masks, b_labels = tuple(b.to(device) for b in batch)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        # Passing `labels` makes the model return the loss alongside the logits.
        outputs = model(
            b_input_ids,
            token_type_ids = None,
            attention_mask = b_attention_masks,
            labels = b_labels
        )
        loss = outputs['loss']
        logits = outputs['logits']

        loss.backward()

        total_train_loss += loss.item()
        total_train_accuracy += flat_accuracy(logits.detach().cpu().numpy(),
                                              b_labels.detach().cpu().numpy())

        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    avg_train_accuracy = total_train_accuracy / len(train_dataloader)
    training_time = format_time(time.time() - t0)

    print('\tAverage training loss: {0:.2f}'.format(avg_train_loss))
    print('\tAverage training accuracy: {0:.2f}'.format(avg_train_accuracy * 100))
    print('\tTraining took: {:}'.format(training_time))
    print('\nRunning validation ....')

    # ---- Validation pass ----
    t0 = time.time()
    total_dev_loss = 0.0
    total_dev_accuracy = 0.0
    model.eval()

    for batch in dev_dataloader:
        b_index, b_input_ids, b_attention_masks, b_labels = tuple(b.to(device) for b in batch)

        # No gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(
                b_input_ids,
                token_type_ids = None,
                attention_mask = b_attention_masks,
                labels = b_labels
            )
            loss = outputs['loss']
            logits = outputs['logits']

        total_dev_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        labels = b_labels.to('cpu').numpy()
        total_dev_accuracy += flat_accuracy(logits, labels)

    avg_dev_loss = total_dev_loss / len(dev_dataloader)
    avg_dev_accuracy = total_dev_accuracy / len(dev_dataloader)
    dev_time = format_time(time.time() - t0)

    # Checkpoint only when this epoch beats every earlier epoch on dev accuracy.
    if avg_dev_accuracy > best_dev_accuracy:
        best_dev_accuracy = avg_dev_accuracy
        torch.save(model.state_dict(), save_path)

    print('\tAverage dev loss: {0:.2f}'.format(avg_dev_loss))
    print('\tAverage dev accuracy: {0:.2f}%'.format(avg_dev_accuracy * 100))
    print('\tValidation took: {:}'.format(dev_time))

    training_stats.append(
        {
            'epoch': epoch + 1,
            'train_loss': avg_train_loss,
            'train_accuracy': avg_train_accuracy,
            'dev_loss': avg_dev_loss,
            'dev_accuracy': avg_dev_accuracy,
            'train_time': training_time,
            'dev_time': dev_time
        }
    )

print('Training complete!')
print('Total training took {:} (h:mm:ss)'.format(format_time(time.time() - total_t0)))


==== Epoch 1 / 5 ====
Training ....
		Batch   100 of 2,000. Elapsed 0:01:02
		Batch   200 of 2,000. Elapsed 0:02:06
		Batch   300 of 2,000. Elapsed 0:03:11
		Batch   400 of 2,000. Elapsed 0:04:15
		Batch   500 of 2,000. Elapsed 0:05:20
		Batch   600 of 2,000. Elapsed 0:06:25
		Batch   700 of 2,000. Elapsed 0:07:29
		Batch   800 of 2,000. Elapsed 0:08:34
		Batch   900 of 2,000. Elapsed 0:09:40
		Batch 1,000 of 2,000. Elapsed 0:10:45
		Batch 1,100 of 2,000. Elapsed 0:11:49
		Batch 1,200 of 2,000. Elapsed 0:12:54
		Batch 1,300 of 2,000. Elapsed 0:13:59
		Batch 1,400 of 2,000. Elapsed 0:15:04
		Batch 1,500 of 2,000. Elapsed 0:16:09
		Batch 1,600 of 2,000. Elapsed 0:17:13
		Batch 1,700 of 2,000. Elapsed 0:18:18
		Batch 1,800 of 2,000. Elapsed 0:19:23
		Batch 1,900 of 2,000. Elapsed 0:20:28
	Average training loss: 0.70
	Average training accuracy: 53.36
	Training took: 0:21:33

Running validation ....
	Average dev loss: 0.72
	Average dev accuracy: 47.76%
	Validation took: 0:01:42

==== Epoch

KeyboardInterrupt: 

In [13]:
# Load the fine-tuned checkpoint. torch.load unpickles arbitrary objects, so
# only point it at files you trust.
checkpoint = torch.load(os.path.join(save_dir, 'best_bert_large_cased_dev.pth'))

if isinstance(checkpoint, torch.nn.Module):
    # The file holds a fully pickled model object.
    model_loaded = checkpoint
else:
    # The file holds a state_dict, which is the format the training cell
    # writes with torch.save(model.state_dict(), ...). Rebuild the
    # architecture and load the weights into it; calling .cuda() on the raw
    # dict would fail.
    model_loaded = BertForSequenceClassification.from_pretrained(
        'bert-large-cased',
        num_labels = 2,
        output_attentions = False,
        output_hidden_states = False
    )
    model_loaded.load_state_dict(checkpoint)

# Drop the extra reference so a loaded state_dict can be garbage-collected.
del checkpoint

model_loaded = model_loaded.cuda()
model_loaded

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1

In [22]:
# Score the reloaded checkpoint on the dev split.
model_loaded.eval()
t0 = time.time()
total_dev_loss = 0
total_dev_accuracy = 0

# Evaluation only, so the whole pass runs with gradient tracking disabled.
with torch.no_grad():
    for batch in dev_dataloader:
        b_index, b_input_ids, b_attention_masks, b_labels = (b.to(device) for b in batch)

        outputs = model_loaded(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_attention_masks,
            labels=b_labels,
        )
        loss = outputs['loss']
        total_dev_loss += loss.item()

        # Move scores and labels to host memory for the NumPy accuracy helper.
        logits = outputs['logits'].detach().cpu().numpy()
        labels = b_labels.to('cpu').numpy()
        total_dev_accuracy += flat_accuracy(logits, labels)

# Per-batch averages over the dev loader.
num_dev_batches = len(dev_dataloader)
avg_dev_loss = total_dev_loss / num_dev_batches
avg_dev_accuracy = total_dev_accuracy / num_dev_batches
dev_time = format_time(time.time() - t0)

print('\tAverage dev loss: {0:.2f}'.format(avg_dev_loss))
print('\tAverage dev accuracy: {0:.2f}%'.format(avg_dev_accuracy * 100))
print('\tValidation took: {:}'.format(dev_time))

	Average dev loss: 0.69
	Average dev accuracy: 54.73%
	Validation took: 0:01:40


In [40]:
# Inspect the batch left over from the preceding evaluation loop: its row ids
# next to the class with the highest logit for each row.
b_index.to('cpu').numpy(), logits.argmax(axis=1).flatten()

(array([39584, 39585, 39586, 39587, 39588, 39589, 39590, 39591, 39592,
        39593, 39594, 39595, 39596, 39597, 39598, 39599]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]))

In [19]:
# Classifier built straight from the 'bert-large-cased' checkpoint with no
# fine-tuning, configured like the trained model and placed on the GPU.
model_pretrained = BertForSequenceClassification.from_pretrained(
    'bert-large-cased',
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
).cuda()
model_pretrained

Some weights of the model checkpoint at bert-large-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at 

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1

In [21]:
# Score the not-yet-fine-tuned model on the dev split.
model_pretrained.eval()
t0 = time.time()
total_dev_loss = 0
total_dev_accuracy = 0

# Evaluation only, so the whole pass runs with gradient tracking disabled.
with torch.no_grad():
    for batch in dev_dataloader:
        b_index, b_input_ids, b_attention_masks, b_labels = (b.to(device) for b in batch)

        outputs = model_pretrained(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_attention_masks,
            labels=b_labels,
        )
        loss = outputs['loss']
        total_dev_loss += loss.item()

        # Move scores and labels to host memory for the NumPy accuracy helper.
        logits = outputs['logits'].detach().cpu().numpy()
        labels = b_labels.to('cpu').numpy()
        total_dev_accuracy += flat_accuracy(logits, labels)

# Per-batch averages over the dev loader.
num_dev_batches = len(dev_dataloader)
avg_dev_loss = total_dev_loss / num_dev_batches
avg_dev_accuracy = total_dev_accuracy / num_dev_batches
dev_time = format_time(time.time() - t0)

print('\tAverage dev loss: {0:.2f}'.format(avg_dev_loss))
print('\tAverage dev accuracy: {0:.2f}%'.format(avg_dev_accuracy * 100))
print('\tValidation took: {:}'.format(dev_time))

	Average dev loss: 0.80
	Average dev accuracy: 50.00%
	Validation took: 0:01:36


In [42]:
# Row ids of the last evaluated batch, used to pull the matching dev rows.
index = b_index.cpu().numpy()
en_dev.loc[index]

Unnamed: 0,author_id,tweet,label
39584,fdb9f16899e3097e6db1f6a13d3572f8,U.S. expands 'Remain in Mexico' policy at dang...,1
39585,fdb9f16899e3097e6db1f6a13d3572f8,Trump moves to end asylum protections for Cent...,1
39586,fdb9f16899e3097e6db1f6a13d3572f8,Dozens block Phoenix street in anti-migrant de...,1
39587,fdb9f16899e3097e6db1f6a13d3572f8,"Anti-ICE protesters pull down American flag, r...",1
39588,fdb9f16899e3097e6db1f6a13d3572f8,Let's hope it happens this time. #URL#,1
39589,fdb9f16899e3097e6db1f6a13d3572f8,RT #USER#: #HASHTAG# U.S. Border Patrol agents...,1
39590,fdb9f16899e3097e6db1f6a13d3572f8,"Marietta teen arrested for raping, molesting c...",1
39591,fdb9f16899e3097e6db1f6a13d3572f8,RT #USER#: Great job by #HASHTAG# Border Patro...,1
39592,fdb9f16899e3097e6db1f6a13d3572f8,Hopefully this lawsuit is successful in n shut...,1
39593,fdb9f16899e3097e6db1f6a13d3572f8,RT #USER#: This is the kind of chicken shit li...,1
