In [2]:
from transformers import BertModel, BertTokenizer, BertConfig, BertForSequenceClassification
import os
import pandas as pd
import torch
import transformers
import time
import datetime
import numpy as np
import random
import os
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
from IPython.core.display import display, HTML

In [3]:
# Seed every random number generator this notebook touches so that a
# Restart & Run All reproduces the same shuffling and weight initialisation.
seed_val = 42

# Order matters only for readability: Python's `random`, NumPy, then torch (CPU and all CUDA devices).
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(seed_val)


In [8]:
# Use the first CUDA device when torch reports one, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Descriptor of the dataset used by the cells below:
#   name / prefix  - labels for the dataset
#   *_path         - CSV files for the train / dev / test splits
#   classes        - class names; a label's position in this list is its integer id
dataset = dict(
    name="Stanford treebank",
    prefix="stanford_treebank",
    train_path="data/stanford_treebank/sst_train.csv",
    dev_path="data/stanford_treebank/sst_dev.csv",
    test_path="data/stanford_treebank/sst_test.csv",
    classes=['neg', 'pos'],
)

In [9]:
def read_data(config=None):
    """Load the train, dev and test splits as DataFrames.

    Args:
        config: mapping with the keys 'train_path', 'dev_path' and 'test_path'.
            When omitted, the notebook-level `dataset` dict is used, which keeps
            the original zero-argument call working.

    Returns:
        A `(train, dev, test)` tuple of DataFrames. Each CSV is read with its
        first column as the index.
    """
    if config is None:
        config = dataset
    return tuple(
        pd.read_csv(config[key], index_col=0)
        for key in ('train_path', 'dev_path', 'test_path')
    )
# Load the three splits once at notebook scope; later cells build their batches from `train`, `dev` and `test`.
train, dev, test = read_data()

In [10]:
def encode(data, tokenizer, max_length=128):
    """Tokenize texts into fixed-length id and attention-mask tensors.

    Args:
        data: iterable of texts; each item is passed to `tokenizer.encode_plus`.
        tokenizer: object exposing `encode_plus(...)` that returns a mapping with
            'input_ids' and 'attention_mask' entries.
        max_length: length every sequence is padded to (default 128, the value
            that used to be hard-coded).

    Returns:
        `(input_ids, attention_mask)`, two `torch.long` tensors with one row per text.
    """
    input_ids = []
    attention_mask = []
    for text in data:
        # NOTE(review): `pad_to_max_length` is the older padding switch; newer
        # transformers releases spell this `padding='max_length', truncation=True`.
        # Left as-is to stay compatible with the version this notebook ran on.
        tokenized_text = tokenizer.encode_plus(text,
                                               max_length=max_length,
                                               add_special_tokens=True,
                                               pad_to_max_length=True,
                                               padding_side='right',
                                               return_attention_mask=True)
        input_ids.append(tokenized_text['input_ids'])
        attention_mask.append(tokenized_text['attention_mask'])
    return torch.tensor(input_ids, dtype=torch.long), torch.tensor(attention_mask, dtype=torch.long)

In [11]:
def get_batches(df, tokenizer, batch_size=4):
    """Build a randomly-sampled DataLoader from a labelled DataFrame.

    The 'text' column is tokenized with `encode`; the 'classification' column is
    mapped to the position of each class name in `dataset['classes']`.
    Every batch yielded is an `(input_ids, attention_mask, labels)` triple.
    """
    texts = list(df['text'].values)
    label_ids = [dataset['classes'].index(name) for name in df['classification']]

    token_ids, masks = encode(texts, tokenizer)
    labels = torch.tensor(label_ids)

    examples = torch.utils.data.TensorDataset(token_ids, masks, labels)
    # Random order is used for every split, not only for training.
    return torch.utils.data.DataLoader(
        examples,
        sampler=torch.utils.data.RandomSampler(examples),
        batch_size=batch_size,
    )

In [12]:
def train_model(batch, model, optimizer, scheduler, epochs, max_grad_norm=None):
    """Fine-tune `model` for `epochs` passes over `batch`.

    Args:
        batch: iterable (e.g. a DataLoader) yielding
            `(input_ids, attention_mask, labels)` tensor triples.
        model: module called with `input_ids`, `attention_mask` and `labels`;
            the first element of its output must be the loss.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        epochs: number of passes over `batch`.
        max_grad_norm: gradient-norm clipping threshold. When omitted, the
            notebook-level `parameters['max_grad_norm']` is used, as before.

    The loss is printed every 100th batch of each epoch.
    """
    if max_grad_norm is None:
        # Backward-compatible default: read the notebook's hyper-parameter dict.
        max_grad_norm = parameters['max_grad_norm']

    model.train()  # Set the mode to training
    for _ in range(epochs):
        for step, batch_tuple in enumerate(batch):
            input_ids, attention_mask, labels = (t.to(device) for t in batch_tuple)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            # Read the loss by position instead of unpacking a fixed number of
            # outputs: the output length depends on the model's output flags, and
            # mapping-style outputs yield their keys (not values) when unpacked.
            loss = outputs[0]
            if step % 100 == 0:
                print("loss - {0}".format(loss))
            # model.zero_grad() also clears parameters the optimizer may not hold,
            # so stale gradients never leak into the norm computed below.
            model.zero_grad()
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()

In [33]:
#batch_dev = get_batches(dev, batch_size=1, tokenizer=tokenizer)
#batch_train = get_batches(train_data, batch_size=8, tokenizer=tokenizer)
#batch_test = get_batches(test, batch_size=1, tokenizer=tokenizer)

In [13]:
# Tokenizer plus one DataLoader per split. Dev and test are batched one example
# at a time; training uses batches of 8.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
batch_dev = get_batches(dev, batch_size=1, tokenizer=tokenizer)
batch_train = get_batches(train, batch_size=8, tokenizer=tokenizer)
batch_test = get_batches(test, batch_size=1, tokenizer=tokenizer)

# Optimisation hyper-parameters, collected in one dict that the cells below read.
epochs=1
parameters = {
    'learning_rate': 2e-5,
    # NOTE(review): warmup is a fixed 1000 steps, while the total below is
    # len(batch_train) * epochs. The recorded training log prints 9 losses at
    # 100-batch intervals, i.e. fewer than 1000 steps, so that run would never
    # have left warmup — confirm this is intended.
    'num_warmup_steps': 1000,
    'num_training_steps': len(batch_train) * epochs,
    'max_grad_norm': 1
}
# Both output flags add extra entries (hidden states, attentions) to the model
# output; the training and evaluation cells depend on the shape of that output,
# so change the flags and those cells together.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2, output_hidden_states=True, output_attentions=True)
model.to(device)
# NOTE(review): correct_bias=False presumably mirrors the original BERT optimizer — TODO confirm.
optimizer = transformers.AdamW(model.parameters(), lr=parameters['learning_rate'], correct_bias=False)
# Linear warmup followed by linear decay, stepped once per batch by train_model.
scheduler = transformers.get_linear_schedule_with_warmup(optimizer,
                                                         num_warmup_steps=parameters['num_warmup_steps'],
                                                         num_training_steps=parameters['num_training_steps'])


In [24]:
def evaluate(batch):
    """Run the notebook-level `model` over `batch` without gradient tracking.

    Args:
        batch: iterable (e.g. a DataLoader) yielding
            `(input_ids, attention_mask, labels)` tensor triples.

    Returns:
        Four parallel lists with one entry per example:
        `input_ids` (token ids), `predictions` (argmax over the logits),
        `true_labels`, and `attentions` (one attention row per example, taken
        from the last attention entry, last head, query position 0).

    The model output must carry at least four positional entries with logits at
    index 1 and attentions at index 3, i.e. the model has to be configured to
    return both hidden states and attentions.
    """
    input_ids, predictions, true_labels, attentions = [], [], [], []
    model.eval()
    for batch_cpu in batch:
        input_ids_gpu, attention_mask, labels = (t.to(device) for t in batch_cpu)
        with torch.no_grad():
            outputs = model(input_ids=input_ids_gpu, attention_mask=attention_mask, labels=labels)
            # Index by position rather than unpacking: mapping-style outputs
            # yield their keys (not values) when unpacked.
            logits = outputs[1].cpu()
            layer_attentions = outputs[3]
            prediction = torch.argmax(logits, dim=1).tolist()
            true_label = labels.cpu().tolist()
            input_ids_cpu = input_ids_gpu.cpu().tolist()
            attention_last_layer = layer_attentions[-1].cpu()  # last attention layer
            # Last head, first query position (the CLS token, per the original note);
            # assumes each entry is (batch, heads, seq, seq) — TODO confirm.
            attention_softmax = attention_last_layer[:, -1, 0].tolist()
            input_ids += input_ids_cpu
            predictions += prediction
            true_labels += true_label
            attentions += attention_softmax
    return input_ids, predictions, true_labels, attentions

def get_length_without_special_tokens(sentence):
    """Count the entries of `sentence` that precede the first 0.

    Despite the name, nothing here strips special tokens: the count stops only
    at the value 0 (presumably the padding id of the encodings used in this
    notebook). If there is no 0, the count covers the whole sequence.
    """
    counted = 0
    for position, token_id in enumerate(sentence):
        if token_id == 0:
            return position
        counted = position + 1
    return counted

def print_attention(input_ids_all, attentions_all, tokenizer, scale=10):
    """Render each sentence as HTML with tokens highlighted by attention weight.

    Args:
        input_ids_all: list of token-id lists, one per sentence.
        attentions_all: list of attention rows, parallel to `input_ids_all`.
        tokenizer: object exposing `convert_ids_to_tokens(id)`.
        scale: factor applied to each attention value to get the highlight
            opacity (default 10, the value that used to be hard-coded).

    Entries from the first 0 id onwards are dropped (see
    `get_length_without_special_tokens`). One HTML line is displayed per sentence.
    """
    from html import escape  # stdlib; local import keeps this cell self-contained

    for input_ids, attention in zip(input_ids_all, attentions_all):
        visible = get_length_without_special_tokens(input_ids)
        spans = []
        for input_id, attention_value in zip(input_ids[:visible], attention[:visible]):
            # Vocabulary entries such as '<' or '&' would otherwise corrupt the markup.
            token = escape(str(tokenizer.convert_ids_to_tokens(input_id)))
            # CSS alpha is only meaningful within [0, 1].
            alpha = min(1.0, max(0.0, scale * attention_value))
            spans.append('<span style="background-color: rgba(255,255,0,{0})">{1}</span>'.format(alpha, token))
        display(HTML(" ".join(spans)))

In [14]:
# Fine-tune for `epochs` passes over `batch_train`; the loss is printed every 100th batch.
train_model(batch_train, model, optimizer, scheduler, epochs)

loss - 0.7048430442810059
loss - 0.6011022329330444
loss - 0.3259471654891968
loss - 0.09624850004911423
loss - 0.33943769335746765
loss - 0.1105036810040474
loss - 0.013818517327308655
loss - 0.45614898204803467
loss - 0.2383934110403061


In [26]:
def print_results(batch_dev):
    """Evaluate the model on `batch_dev` and print a per-class classification report.

    Only the predictions and true labels returned by `evaluate` are used; the
    token ids and attention rows are discarded.
    """
    _, predictions, true_labels, _ = evaluate(batch_dev)
    report = classification_report(true_labels, predictions)
    print(report)

## Evaluation on the Stanford Sentiment Treebank (SST) dev set

In [27]:
# Classification report on the dev-set batches.
# NOTE(review): the `ValueError: too many values to unpack (expected 6)` at the end of the
# recorded output does not correspond to any unpacking in the current cells — it looks like
# stale output; clear it and re-run to confirm.
print_results(batch_dev)

              precision    recall  f1-score   support

           0       0.94      0.85      0.89       428
           1       0.87      0.95      0.91       444

    accuracy                           0.90       872
   macro avg       0.90      0.90      0.90       872
weighted avg       0.90      0.90      0.90       872



ValueError: too many values to unpack (expected 6)

In [31]:
def save(output_dir='./output'):
    """Write the notebook-level `model` and `tokenizer` to `output_dir`.

    Args:
        output_dir: target directory (default './output', the path that used to
            be hard-coded). It is created, including parents, when missing.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
    os.makedirs(output_dir, exist_ok=True)
    print("Saving model to {}".format(output_dir))
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

In [108]:
# Persist the fine-tuned model and tokenizer.
# NOTE(review): the recorded output below shows a different message format and directory
# than save() as defined in this notebook — the output looks stale; re-run after
# Restart & Run All.
save()

Saving model to {0} ./outputs_reddit_lr=2e-05_epochs=2
