In [1]:
import html
import os

import pandas as pd
import sklearn
import torch
import transformers
from IPython.core.display import display, HTML
from transformers import BertTokenizer,BertForSequenceClassification

In [2]:
# Dataset configuration: locations of the tab-separated split files and the
# label vocabulary. A label's position in 'classes' is its integer class index.
dataset = dict(
    name="Stanford treebank",
    train_path="data/train.csv",
    dev_path="data/dev.csv",
    test_path="data/test.csv",
    classes=["neg", "pos"],
)
def read_data(config=None):
    """Load the train, dev and test splits as DataFrames.

    Args:
        config: Optional mapping with 'train_path', 'dev_path' and 'test_path'
            keys pointing at tab-separated files. Defaults to the module-level
            ``dataset`` configuration.

    Returns:
        tuple: ``(train, dev, test)`` DataFrames, in that order.
    """
    if config is None:
        config = dataset
    # ``sep`` must be passed by keyword for every split: pandas 2.0+ accepts
    # only the path positionally and raises TypeError otherwise.
    train, dev, test = (
        pd.read_csv(config[key], sep='\t')
        for key in ('train_path', 'dev_path', 'test_path')
    )
    return train, dev, test
# Load all three splits once; later cells read the module-level train/dev/test frames.
train, dev, test = read_data()

In [3]:
# Tokenizer for the same 'bert-base-uncased' checkpoint name that the classifier is loaded from later.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [4]:
def encode(data, tokenizer, max_length=128):
    """Tokenize texts into fixed-length id and attention-mask tensors.

    Args:
        data: Iterable of raw text strings.
        tokenizer: Tokenizer exposing ``encode_plus`` (e.g. ``BertTokenizer``).
        max_length: Length every sequence is padded or truncated to.

    Returns:
        tuple: ``(input_ids, attention_mask)`` as ``torch.long`` tensors with
        one row per input text.

    Note:
        The ``padding`` / ``truncation`` keywords assume a transformers release
        that supports them (3.0 or newer) -- confirm against the installed version.
    """
    input_ids = []
    attention_mask = []
    for text in data:
        encoded = tokenizer.encode_plus(
            text,
            max_length=max_length,
            add_special_tokens=True,
            # Explicit strategies instead of the deprecated ``pad_to_max_length``
            # flag: every row must come back exactly ``max_length`` long, or the
            # tensor construction below fails on ragged lists.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
        )
        # The padding side is left to the tokenizer's own ``padding_side`` setting.
        input_ids.append(encoded['input_ids'])
        attention_mask.append(encoded['attention_mask'])

    return (
        torch.tensor(input_ids, dtype=torch.long),
        torch.tensor(attention_mask, dtype=torch.long),
    )

In [5]:
def get_batches(df, tokenizer, batch_size=2):
    """Turn a labelled DataFrame into a randomly-sampled DataLoader.

    Args:
        df: DataFrame with a 'text' column and a 'classification' column whose
            values appear in ``dataset['classes']``.
        tokenizer: Tokenizer forwarded to ``encode``.
        batch_size: Number of examples per batch.

    Returns:
        DataLoader yielding ``(input_ids, attention_mask, label)`` tensor batches.
    """
    data_utils = torch.utils.data

    texts = list(df['text'].values)
    # A label's class index is its position in the configured class list.
    label_ids = [dataset['classes'].index(label) for label in df['classification']]
    labels = torch.tensor(label_ids, dtype=torch.long)

    token_ids, masks = encode(texts, tokenizer)
    examples = data_utils.TensorDataset(token_ids, masks, labels)
    return data_utils.DataLoader(
        examples,
        sampler=data_utils.RandomSampler(examples),
        batch_size=batch_size,
    )

In [6]:
# Wrap each split in a DataLoader; all three use the same mini-batch size of 2.
batch_train, batch_dev, batch_test = [
    get_batches(split, tokenizer, batch_size=2) for split in (train, dev, test)
]

In [7]:
def train_model(batch, model, optimizer, scheduler, epochs, device, max_grad_norm=None):
    """Fine-tune ``model`` for ``epochs`` passes over ``batch``.

    Args:
        batch: Iterable of ``(input_ids, attention_mask, labels)`` tensor tuples.
        model: Module whose forward accepts ``input_ids``, ``attention_mask`` and
            ``labels`` keywords and returns the loss as its first output.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        epochs: Number of passes over ``batch``.
        device: Device every batch tensor is moved to.
        max_grad_norm: Gradient-norm clipping threshold. Defaults to the
            module-level ``parameters['max_grad_norm']``.
    """
    if max_grad_norm is None:
        # Resolved at call time so the notebook's config cell may be defined after this one.
        max_grad_norm = parameters['max_grad_norm']

    model.train()  # Set the mode to training
    for e in range(epochs):
        for i, batch_tuple in enumerate(batch):
            input_ids, attention_mask, labels = (t.to(device) for t in batch_tuple)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            # Take the loss by index rather than unpacking four values: this works
            # for plain tuples of any length and for dict-style model outputs,
            # whose iteration yields keys instead of tensors.
            loss = outputs[0]
            if i % 100 == 0:
                print("loss - {0}, iteration - {1}/{2}".format(loss.item(), e + 1, i))
            # One reset is enough; it covers the same parameters that are clipped below.
            model.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()

In [8]:
def _logits_and_attentions(outputs):
    """Extract the logits and the per-layer attentions from a model output."""
    if isinstance(outputs, (tuple, list)):
        # Tuple layout when labels are passed and both extra outputs are enabled:
        # (loss, logits, hidden_states, attentions).
        if len(outputs) != 4:
            raise ValueError(
                "expected (loss, logits, hidden_states, attentions) from the model, "
                "got {} outputs".format(len(outputs)))
        return outputs[1], outputs[3]
    # Attribute-style outputs: iterating them yields keys, so read the fields by name.
    logits, layer_attentions = outputs.logits, outputs.attentions
    if layer_attentions is None:
        raise ValueError("the model did not return attentions; enable output_attentions")
    return logits, layer_attentions


def evaluate(batch, model, device):
    """Run ``model`` over ``batch`` without gradients and collect per-example results.

    Args:
        batch: Iterable of ``(input_ids, attention_mask, labels)`` tensor tuples.
        model: Module returning loss, logits, hidden states and attentions.
        device: Device every batch tensor is moved to.

    Returns:
        tuple: ``(input_ids, predictions, true_labels, attentions)`` as plain
        Python lists with one entry per example. Each ``attentions`` entry is one
        row taken from the last layer's last head at position 0 (the first token).

    Raises:
        ValueError: If the model output does not include attentions.
    """
    input_ids, predictions, true_labels, attentions = [], [], [], []
    model.eval()
    for batch_cpu in batch:
        input_ids_dev, attention_mask, labels = (t.to(device) for t in batch_cpu)
        with torch.no_grad():
            outputs = model(input_ids=input_ids_dev, attention_mask=attention_mask, labels=labels)
        logits, layer_attentions = _logits_and_attentions(outputs)

        predictions += torch.argmax(logits.cpu(), dim=1).tolist()
        true_labels += labels.cpu().tolist()
        input_ids += input_ids_dev.cpu().tolist()
        # Last layer, last head, position 0 (CLS token).
        # Assumes each layer's attention is (batch, heads, query, key) -- TODO confirm.
        attentions += layer_attentions[-1].cpu()[:, -1, 0].tolist()
    return input_ids, predictions, true_labels, attentions

In [9]:
# Training hyper-parameters; the schedule length is one step per training batch per epoch.
epochs = 2
parameters = dict(
    learning_rate=2e-5,
    num_warmup_steps=1000,
    num_training_steps=len(batch_train) * epochs,
    max_grad_norm=1,
)

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Two-label classifier that also returns hidden states and attentions;
# evaluate() reads the attention output.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    output_hidden_states=True,
    output_attentions=True,
)
model.to(device)

# NOTE(review): transformers.AdamW is deprecated in later transformers releases --
# confirm the installed version still provides it.
optimizer = transformers.AdamW(
    model.parameters(),
    lr=parameters['learning_rate'],
    correct_bias=False,
)
scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=parameters['num_warmup_steps'],
    num_training_steps=parameters['num_training_steps'],
)

In [10]:
# Fine-tune on the training batches; the loss is printed every 100 batches.
train_model(batch_train, model, optimizer, scheduler, epochs, device)

loss - 0.7885009050369263, iteration - 1/0
loss - 0.6828967332839966, iteration - 1/100
loss - 0.47977882623672485, iteration - 1/200
loss - 0.30044764280319214, iteration - 1/300
loss - 1.9336867332458496, iteration - 1/400
loss - 0.7013939619064331, iteration - 1/500
loss - 2.3338422775268555, iteration - 1/600
loss - 0.020357787609100342, iteration - 1/700
loss - 0.0033212900161743164, iteration - 1/800
loss - 0.008709549903869629, iteration - 1/900
loss - 0.029690444469451904, iteration - 1/1000
loss - 0.009381532669067383, iteration - 1/1100
loss - 2.4084439277648926, iteration - 1/1200
loss - 2.4525928497314453, iteration - 1/1300
loss - 0.0031393766403198242, iteration - 1/1400
loss - 0.006591796875, iteration - 1/1500
loss - 0.0012960433959960938, iteration - 1/1600
loss - 0.003993034362792969, iteration - 1/1700
loss - 0.001434922218322754, iteration - 1/1800
loss - 0.003718137741088867, iteration - 1/1900
loss - 0.5532255172729492, iteration - 1/2000
loss - 0.0035979747772216

In [11]:
# Score the dev split, keeping token ids, predictions, gold labels and attention rows for the cells below.
input_ids, predictions, true_labels, attentions = evaluate(batch_dev, model, device)

In [12]:
# Per-class metrics on the dev split; class indices follow dataset['classes'] order (0 = 'neg', 1 = 'pos').
print(sklearn.metrics.classification_report(true_labels, predictions))

              precision    recall  f1-score   support

           0       0.92      0.93      0.92       428
           1       0.93      0.92      0.93       444

    accuracy                           0.92       872
   macro avg       0.92      0.92      0.92       872
weighted avg       0.92      0.92      0.92       872



In [13]:
def get_length_without_special_tokens(sentence, pad_token_id=0):
    """Count the token ids that come before the first padding id.

    Despite the name, every id other than ``pad_token_id`` is counted, so any
    non-padding special tokens in the sequence are included in the length.

    Args:
        sentence: Iterable of token ids, padded at the end with ``pad_token_id``.
        pad_token_id: Id that marks padding. Defaults to 0.

    Returns:
        int: Number of ids before the first ``pad_token_id``; the full length
        when no padding id is present, and 0 for an empty sequence.
    """
    length = 0
    for token_id in sentence:
        if token_id == pad_token_id:
            break
        length += 1
    return length
def print_attention(input_ids_all, attentions_all, tokenizer, scale=10):
    """Display each sentence as HTML with tokens highlighted by attention weight.

    Args:
        input_ids_all: Per-sentence lists of padded token ids.
        attentions_all: Per-sentence lists of attention weights, aligned with
            ``input_ids_all`` position by position.
        tokenizer: Tokenizer providing ``convert_ids_to_tokens``.
        scale: Multiplier applied to each weight to obtain the highlight opacity.
    """
    for input_ids, attention in zip(input_ids_all, attentions_all):
        # Drop the padded tail so only real tokens are rendered.
        n_tokens = get_length_without_special_tokens(input_ids)
        tokens = tokenizer.convert_ids_to_tokens(input_ids[:n_tokens])
        spans = [
            # Token text comes from the data: escape it so characters such as
            # '<' or '&' are shown literally instead of being parsed as markup.
            '<span style="background-color: rgb(255,255,0,{0})">{1}</span>'.format(
                scale * weight, html.escape(token))
            for token, weight in zip(tokens, attention[:n_tokens])
        ]
        display(HTML(" ".join(spans)))
# Render every evaluated sentence with its attention highlighting.
print_attention(input_ids, attentions, tokenizer)

In [14]:
def save(model, tokenizer, output_dir='./output'):
    """Write the model and tokenizer to ``output_dir``.

    Args:
        model: Object exposing ``save_pretrained(path)``.
        tokenizer: Object exposing ``save_pretrained(path)``.
        output_dir: Target directory; created (including parents) if missing.
    """
    # exist_ok makes this a no-op when the directory is already there and
    # avoids a separate existence check before creating it.
    os.makedirs(output_dir, exist_ok=True)
    print("Saving model to {}".format(output_dir))
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
# Persist the fine-tuned model and its tokenizer to the default ./output directory.
save(model, tokenizer)

Saving model to ./output
