In [1]:
!pip install captum

Collecting captum
[?25l  Downloading https://files.pythonhosted.org/packages/13/b0/8fa3ab89e2e37c960cdd09595fa911fbb8d6da216c8bc98e18c858a0128d/captum-0.3.1-py3-none-any.whl (4.4MB)
[K     |████████████████████████████████| 4.4MB 10.6MB/s 
Installing collected packages: captum
Successfully installed captum-0.3.1


In [48]:
import captum

import spacy

import torch
import torchtext
import torchtext.data
import torch.nn as nn
import torch.nn.functional as F
from torchtext.legacy.data import BucketIterator
from torchtext.vocab import GloVe
from torchtext.legacy.data import Field, LabelField
from torchtext.legacy.datasets import SST
import torch.optim as optim


from torchtext.vocab import Vocab

from captum.attr import LayerIntegratedGradients, TokenReferenceBase, visualization

nlp = spacy.load('en')

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [56]:
def preprocess_labels(score):
  print(score)
  if score >= 0 and score < 0.2:
      label = 0
  elif score >= 0.2 and score < 0.4:
      label = 1
  elif score >= 0.4 and score < 0.6:
      label = 2
  elif score >=0.6 and score < 0.8:
      label = 3
  else:
      label = 4
  return label 

In [106]:
def get_sst_data(device):
    # set up fields
    #TEXT = Field(lower=True, include_lengths=True, batch_first=True)
    #LABEL =Field(sequential=False)
    TEXT = Field(lower=True, tokenize='spacy', batch_first=True)
    Label = LabelField(dtype = torch.float, use_vocab=True, preprocessing=float)


    # make splits for data
    train, val, test = SST.splits(TEXT, Label)

    # build the vocabulary
   
    TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=100), max_size=len(loaded_vectors.stoi))
    TEXT.vocab.set_vectors(stoi=loaded_vectors.stoi, vectors=loaded_vectors.vectors, dim=loaded_vectors.dim)

    Label.build_vocab(train)
    text_vocab = TEXT.vocab
    x = torch.tensor(1)

    # make iterator for splits
    train_iter, val_iter, test_iter = BucketIterator.splits(
        (train, val, test), batch_size=32, device=torch.device('cuda:0')) # for CPU, device = -1

    return train_iter, val_iter, test_iter, text_vocab

In [107]:
train_iter, val_iter, test_iter, vocab = get_sst_data(device)

In [152]:
class TextCNN(nn.Module):
    def __init__(self, vocab,  embedding_dim, n_filters, num_classes, dropout, pad_idx = 0, window_sizes=(1,2,3,5)):
        super(TextCNN, self).__init__()
        # load pretrained embedding in embedding layer.
        self.embedding = nn.Embedding(len(vocab), embedding_dim, padding_idx = pad_idx)
        #self.embedding.weight.data.copy_(vocab.vectors)

        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(window_size, embedding_dim), padding=(window_size - 1, 0))
            for window_size in window_sizes
        ])

        self.fc = nn.Linear(n_filters * len(window_sizes), num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, max_sent_len=49):
        #text_emb = text_emb.unsqueeze(1).unsqueeze(2)
        #text_emb = text_emb.expand(-1,1,max_sent_len,-1).to(device)

        #text = [batch size, sent len]
        
        text_emb = self.embedding(text)

        # text_emb = [batch size, sent len, emb dim]

        text_emb = text_emb.unsqueeze(1)

        # text_emb = [batch size, 1, sent len, emb dim]

        conved = [F.relu(conv(text_emb)).squeeze(3) for conv in self.convs]
        # conv_n = [batch size, n_filters, sent len - filter_sizes[n]]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        # pooled_n = [batch size, n_filters]
        cat = self.dropout(torch.cat(pooled, dim=1))
        # cat = [batch size, n_filters * len(filter_sizes)]
        return self.fc(cat)


In [127]:
model = TextCNN(vocab, embedding_dim=100, n_filters=100, num_classes=5, dropout=0.5, window_sizes=(2, 3, 4), pad_idx=0)

In [115]:
def forward_with_sigmoid(input):
    return torch.sigmoid(model(input))

In [102]:

#train, val, test = SST.splits(text_field=TEXT, label_field=Label, fine_grained=True)
#test, _ = test_iter.split(split_ratio = 0.04)

In [61]:
# from torchtext import vocab

# #loaded_vectors = vocab.GloVe(name='6B', dim=100)

# # If you prefer to use pre-downloaded glove vectors, you can load them with the following two command line
# loaded_vectors = torchtext.vocab.Vectors('glove.6B.100d.txt')
# TEXT.build_vocab(train, vectors=loaded_vectors, max_size=len(loaded_vectors.stoi))
    
# TEXT.vocab.set_vectors(stoi=loaded_vectors.stoi, vectors=loaded_vectors.vectors, dim=loaded_vectors.dim)
# Label.build_vocab(train)

In [110]:
for key in Label.vocab.freqs:
  print(key, Label.vocab.stoi[key])

positive 0
very positive 3
neutral 2
negative 1
very negative 4


In [150]:
def train(model, iterator, optimizer, criterion, device):
    epoch_loss = 0
    epoch_acc = 0

    model.train() # sets the training mode

    for batch in iterator:
        optimizer.zero_grad()

        predictions = model(batch.text)
        true_labels = batch.label.long()
      
        #true_labels = torch.sub(batch.label, torch.tensor(1))
        #true_labels = true_labels.to(device)

        # Calc loss
        loss = criterion(predictions, true_labels)
        acc = categorical_accuracy(predictions, true_labels)

        # Calcualte train and validation losses after 10 episodes
        # Backprop step
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc/len(iterator), model


def evaluate(model, iterator, criterion, device, split='val'):
    epoch_loss = 0
    epoch_acc = 0

    model.eval()
    #num_iterations = 0
    with torch.no_grad():
        #for emb, labels in get_batched_emb_labels(args, split=split):
        for batch in iterator:
            predictions = model(batch.text)
            true_labels = batch.label.long()
            
            #true_labels = torch.sub(batch.label, torch.tensor(1))
            
            #true_labels = true_labels.to(device)

            loss = criterion(predictions, true_labels)
            acc = categorical_accuracy(predictions, true_labels)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
            #num_iterations += 1

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [91]:
print('Vocabulary Size: ', len(TEXT.vocab))


Vocabulary Size:  15480


In [92]:
PAD_IND = TEXT.vocab.stoi['pad']


In [93]:
PAD_IND

12760

In [94]:
def categorical_accuracy(preds, y):
    """
    We calculate the accuracy by performing an argmax to get the index of the maximum value in the prediction for
    each element in the batch, and then counting how many times this equals the actual label.
    We then average this across the batch.
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    top_pred = preds.argmax(1, keepdim = True)
    correct = top_pred.eq(y.view_as(top_pred)).sum()
    acc = correct.float() / y.shape[0]
    return acc

In [151]:
model = model.to("cuda:0")
optimizer = optim.Adam(model.parameters(), lr=float(1e-4))
loss_fn = nn.CrossEntropyLoss()  # nn.NLLLoss()
# CrossEntropyLoss expects the input to be [batch size, n classes] and the label to be [batch size].
loss_fn.to(device)

#SST.iters(batch_size=32, device=None) # change device to -1 if using cpu

for epoch in range(3):

    train_loss, train_acc, model = train(model, train_iter, optimizer, loss_fn, device)
    # calculate validation loss and accuracy
    valid_loss, valid_acc = evaluate(model, val_iter, loss_fn, device, split='val')

    print(f'Epoch: {epoch + 1} ')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc * 100:.2f}%')
torch.save(model.state_dict(), f"cnn_model_3.pt")

test_loss, test_acc = evaluate(model, test_iter, loss_fn, device, split='test')
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')


torch.Size([32, 35])
torch.Size([32, 1, 35, 100])
torch.Size([32, 42])
torch.Size([32, 1, 42, 100])
torch.Size([32, 45])
torch.Size([32, 1, 45, 100])
torch.Size([32, 36])
torch.Size([32, 1, 36, 100])
torch.Size([32, 45])
torch.Size([32, 1, 45, 100])
torch.Size([32, 35])
torch.Size([32, 1, 35, 100])
torch.Size([32, 47])
torch.Size([32, 1, 47, 100])
torch.Size([32, 38])
torch.Size([32, 1, 38, 100])
torch.Size([32, 41])
torch.Size([32, 1, 41, 100])
torch.Size([32, 47])
torch.Size([32, 1, 47, 100])
torch.Size([32, 38])
torch.Size([32, 1, 38, 100])
torch.Size([32, 46])
torch.Size([32, 1, 46, 100])
torch.Size([32, 37])
torch.Size([32, 1, 37, 100])
torch.Size([32, 49])
torch.Size([32, 1, 49, 100])
torch.Size([32, 42])
torch.Size([32, 1, 42, 100])
torch.Size([32, 48])
torch.Size([32, 1, 48, 100])
torch.Size([32, 33])
torch.Size([32, 1, 33, 100])
torch.Size([32, 34])
torch.Size([32, 1, 34, 100])
torch.Size([32, 46])
torch.Size([32, 1, 46, 100])
torch.Size([32, 44])
torch.Size([32, 1, 44, 100])


In [153]:
torch.load('cnn_model_3.pt')
model.eval()

TextCNN(
  (embedding): Embedding(15480, 100, padding_idx=0)
  (convs): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(2, 100), stride=(1, 1), padding=(1, 0))
    (1): Conv2d(1, 100, kernel_size=(3, 100), stride=(1, 1), padding=(2, 0))
    (2): Conv2d(1, 100, kernel_size=(4, 100), stride=(1, 1), padding=(3, 0))
  )
  (fc): Linear(in_features=300, out_features=5, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [154]:
token_reference = TokenReferenceBase(reference_token_idx=PAD_IND)


In [155]:
lig = LayerIntegratedGradients(model, model.embedding)
#lig = LayerIntegratedGradients(model, model.convs)

In [173]:
# accumalate couple samples in this array for visualization purposes
vis_data_records_ig = []

def interpret_sentence(model, sentence, min_len = 30, true_label = 0):
    text = [tok.text for tok in nlp.tokenizer(sentence.lower())]
    if len(text) < min_len:
        text += ['pad'] * (min_len - len(text))
    indexed = [TEXT.vocab.stoi[t] for t in text]

    model.zero_grad()

    input_indices = torch.tensor(indexed, device=device)
    input_indices = input_indices.unsqueeze(0)

    #print(input_indices.shape)
    
    # input_indices dim: [sequence_length]
    seq_length = min_len

    # predict
    pred = forward_with_sigmoid(input_indices) #.item()
    print(pred)
    score = torch.max(pred).item()
    print(score)
    if score >= 0 and score < 0.2:
      label = Label.vocab.stoi["very negative"]
    elif score >= 0.2 and score < 0.4:
      label = Label.vocab.stoi["negative"]
    elif score >= 0.4 and score < 0.6:
      label = Label.vocab.stoi["neutral"]
    elif score >=0.6 and score < 0.8:
      label = Label.vocab.stoi["positive"]
    else:
      label = Label.vocab.stoi["very positive"]
    pred_ind = label #round(pred)
    #print(label)

    # generate reference indices for each sample
    reference_indices = token_reference.generate_reference(seq_length, device=device).unsqueeze(0)
    #print(reference_indices)
    #print(input_indices)

    # compute attributions and approximation delta using layer integrated gradients
    attributions_ig, delta = lig.attribute(input_indices, reference_indices, target=pred_ind, \
                                           n_steps=500, return_convergence_delta=True)
    
    #print(Label.vocab.itos[pred_ind])

    print(f"pred: {Label.vocab.itos[pred_ind]}, {label}, delta: {abs(delta)}")

    add_attributions_to_visualizer(attributions_ig, text, label, pred_ind, true_label, delta, vis_data_records_ig)
    
def add_attributions_to_visualizer(attributions, text, pred, pred_ind, label, delta, vis_data_records):
    attributions = attributions.sum(dim=2).squeeze(0)
    attributions = attributions / torch.norm(attributions)
    attributions = attributions.cpu().detach().numpy()

    # storing couple samples in an array for visualization purposes
    vis_data_records.append(visualization.VisualizationDataRecord(
                            attributions,
                            pred,
                            Label.vocab.itos[pred_ind],
                            Label.vocab.itos[label],
                            Label.vocab.itos[1],
                            attributions.sum(),       
                            text,
                            delta))

In [168]:
for key in Label.vocab.freqs:
  print(key, Label.vocab.stoi[key])

positive 0
very positive 3
neutral 2
negative 1
very negative 4


In [174]:
interpret_sentence(model, 'It was a fantastic performance !', true_label=3)
interpret_sentence(model, "Renner 's performance as Dahmer is unforgettable , deeply absorbing .", true_label=3)
interpret_sentence(model, 'Best film ever', true_label=3)
interpret_sentence(model, 'Too leisurely paced and visually drab for its own good , it succeeds in being only sporadically amusing .', true_label=2)
interpret_sentence(model, 'It was a horrible movie', true_label=1)
interpret_sentence(model, 'I\'ve never watched something as bad', true_label=4)
interpret_sentence(model, 'It is a disgusting movie!', true_label=4)

torch.Size([1, 30])
torch.Size([1, 1, 30, 100])
tensor([[0.8090, 0.7518, 0.6020, 0.0027, 0.0011]], device='cuda:0',
       grad_fn=<SigmoidBackward>)
0.808981716632843
torch.Size([1, 30])
torch.Size([1, 1, 30, 100])
torch.Size([1, 30])
torch.Size([1, 1, 30, 100])
torch.Size([500, 30])
torch.Size([500, 1, 30, 100])
torch.Size([1, 30])
torch.Size([1, 1, 30, 100])
torch.Size([1, 30])
torch.Size([1, 1, 30, 100])
pred: very positive, 3, delta: tensor([0.0002], device='cuda:0', dtype=torch.float64)
torch.Size([1, 30])
torch.Size([1, 1, 30, 100])
tensor([[8.2724e-01, 8.1589e-01, 6.8803e-01, 8.5741e-04, 3.3134e-04]],
       device='cuda:0', grad_fn=<SigmoidBackward>)
0.8272417187690735
torch.Size([1, 30])
torch.Size([1, 1, 30, 100])
torch.Size([1, 30])
torch.Size([1, 1, 30, 100])
torch.Size([500, 30])
torch.Size([500, 1, 30, 100])
torch.Size([1, 30])
torch.Size([1, 1, 30, 100])
torch.Size([1, 30])
torch.Size([1, 1, 30, 100])
pred: very positive, 3, delta: tensor([0.0003], device='cuda:0', dtyp

In [175]:
print('Visualize attributions based on Integrated Gradients')
_ = visualization.visualize_text(vis_data_records_ig)

Visualize attributions based on Integrated Gradients


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
very positive,very positive (3.00),negative,-2.44,it was a fantastic performance ! pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad
,,,,
very positive,very positive (3.00),negative,-3.14,"renner 's performance as dahmer is unforgettable , deeply absorbing . pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad"
,,,,
very positive,positive (0.00),negative,1.5,best film ever pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad
,,,,
neutral,very positive (3.00),negative,-3.66,"too leisurely paced and visually drab for its own good , it succeeds in being only sporadically amusing . pad pad pad pad pad pad pad pad pad pad pad"
,,,,
negative,very positive (3.00),negative,-2.21,it was a horrible movie pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad
,,,,
