In [46]:
from transformers import BertModel, BertTokenizer, BertConfig, BertForSequenceClassification
import os
import pandas as pd
import torch
import transformers
import time
import datetime
import numpy as np
import random
import os
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
from IPython.core.display import display, HTML

In [11]:
# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)


In [12]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
dataset = { "name": "Stanford treebank",
            "prefix": "stanford_treebank",
            "train_path": "/data/sam/stanford_treebank/sst_train.csv",
            "dev_path": "/data/sam/stanford_treebank/sst_dev.csv",
            "test_path": "/data/sam/stanford_treebank/sst_test.csv",
            'classes': ['neg', 'pos']
          }

In [29]:
def read_data():
    train = pd.read_csv(dataset['train_path'],index_col=0)
    dev = pd.read_csv(dataset['dev_path'],index_col=0)
    test = pd.read_csv(dataset['test_path'],index_col=0)
    return train, dev, test
train, dev, test = read_data()

In [30]:
def encode(data, tokenizer):
    input_ids = []
    attention_mask = []
    for text in data:
        tokenized_text = tokenizer.encode_plus(text,
                                            max_length=128,
                                            add_special_tokens = True,
                                            pad_to_max_length=True,
                                            padding_side='right',
                                            return_attention_mask=True)
        input_ids.append(tokenized_text['input_ids'])
        attention_mask.append(tokenized_text['attention_mask'])
    return torch.tensor(input_ids, dtype=torch.long), torch.tensor(attention_mask, dtype=torch.long)

In [31]:
def get_batches(df, tokenizer, batch_size=4):
    x, y = list(df['text'].values), list(df['classification'].apply(lambda y: dataset['classes'].index(y)))
    input_ids, attention_mask = encode(x, tokenizer)
    y = torch.tensor(y)
    tensor_dataset = torch.utils.data.TensorDataset(input_ids, attention_mask, y)
    tensor_randomsampler = torch.utils.data.RandomSampler(tensor_dataset)
    tensor_dataloader = torch.utils.data.DataLoader(tensor_dataset, sampler=tensor_randomsampler, batch_size=batch_size)
    return tensor_dataloader

In [32]:
def train_model(batch, model, optimizer, scheduler, epochs):
    model.train()  # Set the mode to training
    for e in range(epochs):
        for i, batch_tuple in enumerate(batch):
            batch_tuple = (t.to(device) for t in batch_tuple)
            input_ids, attention_mask, labels = batch_tuple
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss, logits, hidden_states_output, attention_mask_output = outputs
            if i % 100 == 0:
                print("loss - {0}".format(loss))
            model.zero_grad()        
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), parameters['max_grad_norm'])
            optimizer.step()
            scheduler.step()

In [33]:
#batch_dev = get_batches(dev, batch_size=1, tokenizer=tokenizer)
#batch_train = get_batches(train_data, batch_size=8, tokenizer=tokenizer)
#batch_test = get_batches(test, batch_size=1, tokenizer=tokenizer)

In [34]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
batch_dev = get_batches(dev, batch_size=1, tokenizer=tokenizer)
batch_train = get_batches(train, batch_size=8, tokenizer=tokenizer)
batch_test = get_batches(test, batch_size=1, tokenizer=tokenizer)

epochs=1
parameters = {
    'learning_rate': 2e-5,
    'num_warmup_steps': 1000,
    'num_training_steps': len(batch_train) * epochs,
    'max_grad_norm': 1
}
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2, output_hidden_states=True, output_attentions=True)
model.to(device)
optimizer = transformers.AdamW(model.parameters(), lr=parameters['learning_rate'], correct_bias=False)
scheduler = transformers.get_linear_schedule_with_warmup(optimizer,
                                                         num_warmup_steps=parameters['num_warmup_steps'],
                                                         num_training_steps=parameters['num_training_steps'])


In [40]:
def evaluate(batch):
    input_ids, predictions, true_labels, attentions = [], [], [], []
    model.eval()
    for i, batch_cpu in enumerate(batch):
        batch_gpu = (t.to(device) for t in batch_cpu)
        input_ids_gpu, attention_mask, labels = batch_gpu
        with torch.no_grad():
            loss, logits, hidden_states_output, attention_mask_output = model(input_ids=input_ids_gpu, attention_mask=attention_mask, labels=labels)
            logits =  logits.cpu()
            prediction = torch.argmax(logits, dim=1).tolist()
            true_label = labels.cpu().tolist()
            input_ids_cpu = input_ids_gpu.cpu().tolist()
            attention_last_layer = attention_mask_output[-1].cpu() # selection the last attention layer
            attention_softmax = attention_last_layer[:,-1, 0].tolist()  # selection the last head attention of CLS token
            input_ids += input_ids_cpu
            predictions += prediction
            true_labels += true_label
            attentions += attention_softmax
    return input_ids, predictions, true_labels, attentions

In [37]:
train_model(batch_train, model, optimizer, scheduler, epochs)

loss - 0.777716338634491
loss - 0.7429631948471069
loss - 0.4218112528324127
loss - 0.24879083037376404
loss - 0.18006247282028198
loss - 1.496911883354187
loss - 0.9453505277633667
loss - 0.8254987001419067
loss - 0.01626703143119812


In [38]:
def get_prediction(results):
    predictions = []
    true_labels = []
    count = 0
    for result in results:
        loss, logits, labels, input_ids, hidden_states_output, attention_mask_output = result
        prediction = torch.argmax(logits, dim=1).tolist()
        if prediction[0] != labels[0]:
            count += 1
        predictions += prediction
        true_labels += labels.tolist()
    print("wrong: {}".format(count))
    return true_labels, predictions

## Evaluation for Reddit data

In [41]:
results = evaluate(batch_dev)


In [43]:
input_ids, predictions, true_labels, attentions = results

In [45]:
print(classification_report(true_labels, predictions))


              precision    recall  f1-score   support

           0       0.95      0.82      0.88       428
           1       0.85      0.95      0.90       444

    accuracy                           0.89       872
   macro avg       0.90      0.89      0.89       872
weighted avg       0.90      0.89      0.89       872



In [58]:
def get_length_without_special_tokens(sentence):
    length = 0
    for i in sentence:
        if i == 0:
            break
        else:
            length += 1
    return length

def print_attention(input_ids_all, attentions_all, tokenizer):
    for input_ids, attention in zip(input_ids_all, attentions_all):
        html = []
        len_input_ids = get_length_without_special_tokens(input_ids)
        input_ids = input_ids[:len_input_ids]
        attention = attention[:len_input_ids]
        for input_id, attention_value in zip(input_ids, attention):
            token = tokenizer.convert_ids_to_tokens(input_id)
            attention_value = attention_value
            html.append('<span style="background-color: rgb(255,255,0,{0})">{1}</span>'.format(10 * attention_value, token))
        html_string = " ".join(html)
        display(HTML(html_string))

In [59]:
print_attention(input_ids, attentions, tokenizer)

In [102]:
results = evaluate(batch_test)
y_true, y_pred = get_prediction(results)
target_names = ["negative", "positive"]
classification = classification_report(y_true, y_pred, target_names=target_names, digits=4)
print(classification)

[1] [0] [CLS] not to sound racist but seeing black people portrayed this was makes me so happy than how the news portrays black people [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
[1] [0] [CLS] lol what a shit [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [

In [31]:
def save():
    output_dir = './output'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    print("Saving model to {}".format(output_dir))
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

In [108]:
save()

Saving model to {0} ./outputs_reddit_lr=2e-05_epochs=2


## For Stanford Treebank

In [25]:
results = evaluate(batch_dev)
y_true, y_pred = get_prediction(results)
target_names = ["negative", "positive"]
classification = classification_report(y_true, y_pred, target_names=target_names, digits=4)
print(classification)

wrong: 79
              precision    recall  f1-score   support

    negative     0.9051    0.9136    0.9093       428
    positive     0.9159    0.9077    0.9118       444

    accuracy                         0.9106       872
   macro avg     0.9105    0.9106    0.9105       872
weighted avg     0.9106    0.9106    0.9106       872



In [26]:
results = evaluate(batch_test)
y_true, y_pred = get_prediction(results)
target_names = ["negative", "positive"]
classification = classification_report(y_true, y_pred, target_names=target_names, digits=4)
print(classification)

wrong: 152
              precision    recall  f1-score   support

    negative     0.9242    0.9090    0.9165       912
    positive     0.9102    0.9252    0.9176       909

    accuracy                         0.9171      1821
   macro avg     0.9172    0.9171    0.9171      1821
weighted avg     0.9172    0.9171    0.9171      1821



In [32]:
save()

Saving model to ./outputs_stanford_treebank_lr=2e-05_epochs=2


In [37]:
train_data

Unnamed: 0,id,original_text,text,classification,rationale
0,0,The Rock is destined to be the 21st Century 's...,the rock is destined to be the 21st century 's...,pos,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1,The gorgeously elaborate continuation of `` Th...,the gorgeously elaborate continuation of `` th...,pos,"[0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,2,Singer/composer Bryan Adams contributes a slew...,singer/composer bryan adams contributes a slew...,pos,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,4,Yet the act is still charming here .,yet the act is still charming here .,pos,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]"
4,5,Whether or not you 're enlightened by any of D...,whether or not you 're enlightened by any of d...,pos,"[0.0, 0.0, 0.5, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...
6915,8539,A real snooze .,a real snooze .,neg,"[0.0, 0.5, 1.0, 0.0]"
6916,8540,No surprises .,no surprises .,neg,"[0.5, 0.5, 0.0]"
6917,8541,We 've seen the hippie-turned-yuppie plot befo...,we 've seen the hippie-turned-yuppie plot befo...,pos,"[0.0, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.0, 0.0, ..."
6918,8542,Her fans walked out muttering words like `` ho...,her fans walked out muttering words like `` ho...,neg,"[0.0, 0.5, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."


In [38]:
dev

Unnamed: 0,id,original_text,text,classification,rationale
0,0,It 's a lovely film with lovely performances b...,it 's a lovely film with lovely performances b...,pos,"[0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, ..."
1,2,And if you 're not nearly moved to tears by a ...,and if you 're not nearly moved to tears by a ...,pos,"[0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 1.0, 1.0, 1.0, ..."
2,3,"A warm , funny , engaging film .","a warm , funny , engaging film .",pos,"[0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0]"
3,4,Uses sharp humor and insight into human nature...,uses sharp humor and insight into human nature...,pos,"[0.0, 0.5, 1.0, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, ..."
4,6,"Entertains by providing good , lively company .","entertains by providing good , lively company .",pos,"[0.5, 0.0, 0.0, 0.5, 0.0, 0.5, 0.0, 0.0]"
...,...,...,...,...,...
867,1095,... Designed to provide a mix of smiles and te...,... designed to provide a mix of smiles and te...,neg,"[0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
868,1096,it seems to me the film is about the art of ri...,it seems to me the film is about the art of ri...,neg,"[0.0, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
869,1097,It 's just disappointingly superficial -- a mo...,it 's just disappointingly superficial -- a mo...,neg,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
870,1098,The title not only describes its main characte...,the title not only describes its main characte...,neg,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."


In [39]:
test

Unnamed: 0,id,original_text,text,classification,rationale
0,1,If you sometimes like to go to the movies to h...,if you sometimes like to go to the movies to h...,pos,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 0.5, 0.5, ..."
1,2,"Emerges as something rare , an issue movie tha...","emerges as something rare , an issue movie tha...",pos,"[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
2,4,Offers that rare combination of entertainment ...,offers that rare combination of entertainment ...,pos,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0]"
3,5,Perhaps no picture ever made has more literall...,perhaps no picture ever made has more literall...,pos,"[0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, ..."
4,6,Steers turns in a snappy screenplay that curls...,steers turns in a snappy screenplay that curls...,pos,"[0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...
1816,2205,An imaginative comedy/thriller .,an imaginative comedy/thriller .,pos,"[0.0, 1.0, 0.5, 0.5]"
1817,2206,"( A ) rare , beautiful film .","( a ) rare , beautiful film .",pos,"[0.5, 0.0, 0.5, 0.5, 0.0, 1.0, 0.0, 0.0]"
1818,2207,( An ) hilarious romantic comedy .,( an ) hilarious romantic comedy .,pos,"[0.5, 0.0, 0.5, 1.0, 0.0, 0.5, 0.0]"
1819,2208,Never ( sinks ) into exploitation .,never ( sinks ) into exploitation .,pos,"[0.5, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0]"


In [None]:
a = torch.tensor([[1,2,4], [3,4,5]])