In [1]:
import torch
from torchtext import data
from torchtext import datasets
import random

In [2]:
#create fields for labels and text

text_field = data.Field(tokenize = 'spacy')
label_field = data.LabelField(dtype = torch.float)

In [3]:
#use the imdb dataset and split it into testing and training

train_data, test_data = datasets.IMDB.splits(text_field, label_field)

In [4]:
#check size of both
print(f'Train size: {len(train_data)}')
print(f'Test size: {len(test_data)}')

Train size: 25000
Test size: 25000


In [5]:
#print an example
vars(train_data[1])

{'text': ['Misfits',
  'at',
  'a',
  'military',
  'school',
  '?',
  'Hmmmm',
  ',',
  'sounds',
  'funny',
  ',',
  'maybe',
  'offensive',
  'to',
  'some',
  '.',
  'You',
  'have',
  'the',
  'characters',
  'there',
  ',',
  'the',
  'Arab',
  'thief',
  ',',
  'the',
  'sex',
  'crazy',
  'teen',
  ',',
  'the',
  'smart',
  'mouth',
  ',',
  'the',
  'pot',
  'smoker',
  ',',
  'and',
  'not',
  'to',
  'forget',
  ',',
  'the',
  'guy',
  'who',
  'burns',
  'things',
  '.',
  'Throw',
  'in',
  'a',
  'strict',
  'no',
  'nonsense',
  'Sergent',
  ',',
  'a',
  'homosexual',
  'Sergent',
  'and',
  'one',
  'sexy',
  'ammunition',
  'teacher',
  'and',
  'it',
  'makes',
  'one',
  'crazy',
  'film',
  'adventure.<br',
  '/><br',
  '/>I',
  'have',
  'seen',
  'this',
  'film',
  'and',
  'it',
  'is',
  'funny',
  ',',
  'because',
  'the',
  'comedy',
  'is',
  'revolved',
  'around',
  'the',
  'fact',
  'that',
  'if',
  'you',
  'try',
  'to',
  'work',
  'together',
  

In [6]:
#split train into train and validation and print size

train_data, validation_data = train_data.split(split_ratio=0.8)

print(f'Train size: {len(train_data)}')
print(f'Validation size: {len(validation_data)}')

Train size: 20000
Validation size: 5000


In [7]:
vars(train_data[1])

{'text': ['This',
  'kind',
  'of',
  'storytelling',
  'is',
  'unacceptable',
  'The',
  'only',
  'reason',
  'this',
  'film',
  'is',
  'anywhere',
  'above',
  'the',
  '5',
  'stars',
  'out',
  'of',
  '10',
  'line',
  'is',
  'because',
  'it',
  "'s",
  'got',
  'George',
  'Lucas',
  'behind',
  'it',
  ',',
  'and',
  'it',
  'has',
  'the',
  'words',
  '"',
  'Star',
  '"',
  'and',
  '"',
  'Wars',
  '"',
  'in',
  'its',
  'title',
  '.',
  'That',
  'is',
  'an',
  'insult',
  'to',
  'aspiring',
  'filmmakers',
  ',',
  'and',
  'many',
  'others',
  'out',
  'there',
  'who',
  'have',
  'made',
  'clearly',
  'superior',
  'films',
  'with',
  'superior',
  'story',
  ',',
  'writing',
  'and',
  'acting',
  ',',
  'but',
  'did',
  'not',
  'get',
  'the',
  'credit',
  '.',
  'This',
  'is',
  'a',
  'travesty.<br',
  '/><br',
  '/>First',
  'things',
  'first',
  '.',
  'The',
  'story',
  '.',
  'Anakin',
  "'s",
  'evolution',
  '?',
  'There',
  'is',
  'none

In [8]:
#use glove pretrained embeddings
#use only top 25000 most commonly occurring words
text_field.build_vocab(train_data, max_size=25000, vectors="glove.6B.100d")
label_field.build_vocab(train_data)

In [9]:
#how many tokens are there
len(text_field.vocab)

25002

In [10]:
#25 most common
text_field.vocab.freqs.most_common(25)

[('the', 230828),
 (',', 219882),
 ('.', 188816),
 ('and', 124865),
 ('a', 124835),
 ('of', 114902),
 ('to', 106683),
 ('is', 86951),
 ('in', 69989),
 ('I', 62012),
 ('it', 61133),
 ('that', 55926),
 ('"', 50642),
 ("'s", 49451),
 ('this', 48361),
 ('-', 41827),
 ('/><br', 40423),
 ('was', 39758),
 ('as', 34788),
 ('with', 34209),
 ('movie', 33840),
 ('for', 33370),
 ('film', 31061),
 ('The', 29999),
 ('but', 27973)]

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [12]:
batch = 128
#create iterators for train, test and validation sets

train_iter, validation_iter, test_iter = data.BucketIterator.splits((train_data, validation_data, test_data), batch_size=batch, device=device)

In [13]:
for b in train_iter:
    print(b.text)
    r,c = b.text.size()
    print(r,c)
    for i in range(r):
        print(text_field.vocab.itos[b.text[i][120]], end=" ")
    print("------------------------\n")
    break

tensor([[   66,   378,     0,  ...,    11,   378,  7466],
        [ 5963,     6, 13033,  ...,   141,   173,  2774],
        [  241,   361,    52,  ...,  4736,   288,     9],
        ...,
        [    1,     1,     1,  ...,     1,     1,     1],
        [    1,     1,     1,  ...,     1,     1,     1],
        [    1,     1,     1,  ...,     1,     1,     1]], device='cuda:0')
972 128
Plot : an amorous couple decide to engage in some extra - marital hijinks in a flashy car . They then become stuck ( literally ) in a <unk> <unk> , while said car wanders aimlessly about the countryside until the hapless couple are rescued by the <unk> /><br />That 's it . That 's the entire movie . There may have been some dialogue here and there , but nothing comes to mind . It should be obvious by now that this movie is not just pointless , but actually physically painful to watch . The fact that it starred two of the <unk> best up - and - coming actors ( one of whom is now sadly deceased ) only adds to

In [14]:
#architecture of network:
# embedding layer -> n x Conv Layers -> max pooling -> dropout -> fc layer


import torch.nn as nn
import torch.nn.functional as F

class cnn(nn.Module):
    def __init__(self, vocab_size, emb_dim, n_filters, filter_sizes, output_dim, dropout):
        super(cnn, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        
        self.filters = nn.ModuleList([nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(fs, emb_dim)) 
                                      for fs in filter_sizes])
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(n_filters*len(filter_sizes), output_dim)
    
    def forward(self, x):
        #x: (word, batch)
        #conv2d wants batch as first dimension, so we switch x dims
        x = x.transpose(0,1)
        #x: (batch, word)
        x_emb = self.embedding(x).unsqueeze(1)
        #x_emb: batch, 1, word, emb_dim
        x_conv = [filt(x_emb) for filt in self.filters]
        #x_conv[0]: batch, n_filters, word, 1
        x_conv_relu = [F.relu(t).squeeze(3) for t in x_conv]
        #x_conv_relu[0] = batch, n_filters, word
        x_maxpool = [F.max_pool1d(input=t, kernel_size=t.size(2)).squeeze(2) for t in x_conv_relu]
        #x_maxpool[0] = batch, n_filters
        x_concat = torch.cat(x_maxpool, dim=1)
        #x_concat= batch, n_filter*n_filter
        x_drop = self.dropout(x_concat)
        return self.fc(x_drop)

In [15]:
dropout = 0.4
output_dim = 1
filters = [3,4,5]
n_filters = 100
emb_dim = 100
input_dim = len(text_field.vocab)

model = cnn(input_dim, emb_dim, n_filters, filters, output_dim, dropout)

In [16]:
pretrained_vect = text_field.vocab.vectors
#copy into model
model.embedding.weight.data.copy_(pretrained_vect)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.1986,  0.3232,  0.7498,  ..., -0.4724,  0.8580,  0.4183],
        [-0.0482,  0.3399,  0.0066,  ...,  0.0626,  0.4746,  0.9363],
        [-0.0601, -0.1312, -0.0897,  ..., -0.4999,  0.9753,  0.5777]])

In [17]:
#define accuracy
def accuracy_score(y_true, y_pred):
    y_pred = torch.round(torch.sigmoid(y_pred))
    correct = (y_pred==y_true).float()
    return sum(correct)/len(correct)

In [18]:
#define optimizer
import torch.optim as optim
optimizer = optim.Adam(model.parameters())
lossfunc = nn.BCEWithLogitsLoss().to(device)
model = model.to(device)

In [19]:
def train(model, iterator, optimizer, lossfunction):
    
    loss = 0.
    acc = 0.
    model.train()
    
    for i, batch in enumerate(iterator):
        optimizer.zero_grad()
        y_pred = model(batch.text).squeeze(1)
        batch_loss = lossfunction(y_pred, batch.label)
        batch_acc = accuracy_score(batch.label, y_pred)
        batch_loss.backward()
        optimizer.step()
        loss += batch_loss.item()
        acc += batch_acc.item()
        
        if i>0 and i%20==0:
            print('Train batch', i, 'done.')
    return loss/len(iterator), acc/len(iterator)

In [20]:
def evaluate(model, iterator, lossfunction):
    
    loss = 0.
    acc = 0.
    model.eval()
    
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            y_pred = model(batch.text).squeeze(1)
            batch_loss = lossfunction(y_pred, batch.label)
            batch_acc = accuracy_score(batch.label, y_pred)
            
            loss += batch_loss.item()
            acc += batch_acc.item()
            
            if i>0 and i%20==0:
                print('Evaluate batch', i, 'done')
    return loss/len(iterator), acc/len(iterator)

In [21]:
n_epochs = 5

for epoch in range(n_epochs):
    
    train_loss, train_acc = train(model, train_iter, optimizer, lossfunc)
    valid_loss, valid_acc = evaluate(model, validation_iter, lossfunc)
    
    print('Epoch:', epoch)
    print('Training stats:\n Accuracy:', train_acc, ',Loss:', train_loss)
    print('Validation stats:\n Accuracy:', valid_acc, ',Loss:', valid_loss)

Train batch 20 done.
Train batch 40 done.
Train batch 60 done.
Train batch 80 done.
Train batch 100 done.
Train batch 120 done.
Train batch 140 done.
Evaluate batch 20 done
Epoch: 0
Training stats:
 Accuracy: 0.733031449044586 ,Loss: 0.5203975380226306
Validation stats:
 Accuracy: 0.830859375 ,Loss: 0.3836495392024517
Train batch 20 done.
Train batch 40 done.
Train batch 60 done.
Train batch 80 done.
Train batch 100 done.
Train batch 120 done.
Train batch 140 done.
Evaluate batch 20 done
Epoch: 1
Training stats:
 Accuracy: 0.8682324840764332 ,Loss: 0.315547597730995
Validation stats:
 Accuracy: 0.8759765625 ,Loss: 0.29109086766839026
Train batch 20 done.
Train batch 40 done.
Train batch 60 done.
Train batch 80 done.
Train batch 100 done.
Train batch 120 done.
Train batch 140 done.
Evaluate batch 20 done
Epoch: 2
Training stats:
 Accuracy: 0.9106787420382165 ,Loss: 0.2325771002063326
Validation stats:
 Accuracy: 0.88984375 ,Loss: 0.26478078290820123
Train batch 20 done.
Train batch 40 d

In [22]:
#testing
_, test_acc = evaluate(model, test_iter, lossfunc)
print('Test accuracy:', test_acc)

Evaluate batch 20 done
Evaluate batch 40 done
Evaluate batch 60 done
Evaluate batch 80 done
Evaluate batch 100 done
Evaluate batch 120 done
Evaluate batch 140 done
Evaluate batch 160 done
Evaluate batch 180 done
Test accuracy: 0.8861208545918368
