In [1]:
from collections import defaultdict
import time
import random
import torch
import numpy as np

In [23]:
class CNNClass(torch.nn.Module):
    def __init__(self, nwords, emb_size, num_filters, window_size, ntags):
        super(CNNClass, self).__init__()
        
        self.embedding = torch.nn.Embedding(nwords, emb_size)
        torch.nn.init.uniform_(self.embedding.weight, -0.25, 0.25)
        
        self.conv_1d = torch.nn.Conv1d(
            in_channels=emb_size,
            out_channels=num_filters,
            kernel_size=window_size,
            stride=1,
            padding=0,
            dilation=1,
            groups=1,
            bias=True)
        
        self.relu = torch.nn.ReLU()
        
        self.projection_layer = torch.nn.Linear(
            in_features=num_filters,
            out_features=ntags,
            bias=True)
        
        torch.nn.init.xavier_uniform_(self.projection_layer.weight)
    
    def forward(self, words, return_activations=False):
        emb = self.embedding(words) # nwords x emb_size
        
        # Use `permute` to switch channel positions
        emb = emb.unsqueeze(0).permute(0, 2, 1) # 1 x emb_size x n_words
        
        h = self.conv_1d(emb) # 1 x num_filters x nwords
        
        # Argmax along length of the sentence
        activations = h.squeeze(0).max(dim=1)[1]
        
        # Do max pooling
        h = h.max(dim=2)[0]
        h = self.relu(h)
        features = h.squeeze(0)
        out = self.projection_layer(h) # 1 x ntags
        
        if return_activations:
            return out, activations.data.cpu().numpy(), features.data.cpu().numpy()
        return out 

In [24]:
import sys 
np.set_printoptions(linewidth=sys.maxsize, threshold=sys.maxsize)

# Functions to read in the corpus
w2i = defaultdict(lambda: len(w2i))
UNK = w2i["<unk>"]
def read_dataset(filename):
    with open(filename, "r") as f:
        for line in f:
            tag, words = line.lower().strip().split(" ||| ")
            words = words.split(" ")
            yield (words, [w2i[x] for x in words], int(tag))


# Read in the data
train = list(read_dataset("../data/classes/train.txt"))
w2i = defaultdict(lambda: UNK, w2i)
dev = list(read_dataset("../data/classes/test.txt"))
nwords = len(w2i)
ntags = 5

In [25]:
# Define the model
EMB_SIZE = 10
WIN_SIZE = 3
FILTER_SIZE = 8

# initialize the model
model = CNNClass(nwords, EMB_SIZE, FILTER_SIZE, WIN_SIZE, ntags)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

In [26]:
data_type = torch.LongTensor
use_cuda = torch.cuda.is_available()

if use_cuda:
    data_type = torch.cuda.LongTensor
    model.cuda()

In [27]:
def calc_predict_and_activations(wids, tag, words):
    
    
    def display_activations(words, activations):
        pad_begin = (WIN_SIZE - 1) / 2
        pad_end = WIN_SIZE - 1 - pad_begin 
        words_padded = ["pad"] * int(pad_begin) + words + ["pad"] * int(pad_end)
        
        ngrams = []
        for act in activations:
            act = int(act)
            ngrams.append(
                "[" + 
                ", ".join(words_padded[act:act+WIN_SIZE]) 
                + "]") 
        return ngrams
            
    
    # Padding
    if len(wids) < WIN_SIZE:
        wids += [0] * (WIN_SIZE-len(wids))
        
    words_tensor = torch.tensor(wids).type(data_type)
    scores, activations, features = model(
        words_tensor, return_activations=True)
    scores = scores.squeeze().cpu().data.numpy()
    
    print('%d ||| %s' % (tag, ' '.join(words)))
    predict = np.argmax(scores)
    
    print(display_activations(words, activations))
    W = model.projection_layer.weight.data.cpu().numpy() # Weight matrix
    bias = model.projection_layer.bias.data.cpu().numpy()
    
    print("scores={}, predict: {}".format(scores, predict))
    print('  bias={}'.format(bias))
    
    contributions = W * features
    print(' very bad (%.4f): %s' % (scores[0], contributions[0]))
    print('      bad (%.4f): %s' % (scores[1], contributions[1]))
    print('  neutral (%.4f): %s' % (scores[2], contributions[2]))
    print('     good (%.4f): %s' % (scores[3], contributions[3]))
    print('very good (%.4f): %s' % (scores[4], contributions[4]))

In [28]:
for ITER in range(1):
    random.shuffle(train)
    
    train_loss, train_correct = 0.0, 0.0
    start = time.time()
    model.train()
    
    for _, wids, tag in train[:5000]:
        if len(wids) < WIN_SIZE:
            wids += [0] * (WIN_SIZE - len(wids))
        words_tensor = torch.tensor(wids).type(data_type)
        tag_tensor = torch.tensor([tag]).type(data_type)
        scores = model(words_tensor)
        predict = scores[0].argmax().item()
        if predict == tag:
            train_correct += 1

        my_loss = criterion(scores, tag_tensor)
        train_loss += my_loss.item()
        # Do back-prop
        optimizer.zero_grad()
        my_loss.backward()
        optimizer.step()
    
    print("iter %r: train loss/sent=%.4f, acc=%.4f, time=%.2fs" % (ITER, train_loss/len(train), train_correct/len(train), time.time()-start))
    
    # Testing
    test_correct = 0.0
    for _, wids, tag in dev[:5000]:
        # Padding (can be done in the conv layer as well)
        if len(wids) < WIN_SIZE:
            wids += [0] * (WIN_SIZE - len(wids))
        words_tensor = torch.tensor(wids).type(data_type)
        scores = model(words_tensor)
        predict = scores[0].argmax().item()
        if predict == tag:
            test_correct += 1
            
    print("iter %r: test acc=%.4f" % (ITER, test_correct/len(dev)))

iter 0: train loss/sent=0.9165, acc=0.1635, time=16.64s
iter 0: test acc=0.2308


In [30]:
for words, wids, tag in dev[:10]:
    calc_predict_and_activations(wids, tag, words)
    print("")

2 ||| effective but too-tepid biopic
['[effective, but, too-tepid]', '[pad, effective, but]', '[pad, effective, but]', '[effective, but, too-tepid]', '[pad, effective, but]', '[pad, effective, but]', '[effective, but, too-tepid]', '[pad, effective, but]']
scores=[-0.08909433  0.28088307  0.3214271   0.6062926  -0.15508379], predict: 3
  bias=[ 0.05086858  0.16146143 -0.0737052   0.16508259  0.01069979]
 very bad (-0.0891): [ 0.0143822   0.          0.005133    0.          0.01145995  0.          0.         -0.17093806]
      bad (0.2809): [-0.1514776  -0.          0.1721953   0.          0.04248144  0.         -0.          0.05622251]
  neutral (0.3214): [ 0.19533098 -0.          0.02746848 -0.          0.08763424  0.          0.          0.0846986 ]
     good (0.6063): [ 0.08825286 -0.          0.10560799 -0.          0.13073145 -0.         -0.          0.11661772]
very good (-0.1551): [-0.15279694  0.          0.03289231  0.          0.02185056  0.         -0.         -0.0677295 ]

3