In [1]:
import random

import gensim
import nltk
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchtext
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from gensim.models import KeyedVectors
from torch.autograd import Variable
from torchtext import data
from torchtext import datasets

In [2]:
# Fixed seed so that weight initialisation and the train/validation split
# are repeatable from a fresh kernel.
SEED = 1234

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Field definitions: reviews are tokenised with spaCy, labels become floats
# (float targets are what BCEWithLogitsLoss consumes later on).
TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

# `random.seed()` returns None, so it must not be passed as `random_state`
# directly.  Seed the global generator first, then hand its state to
# `split()` explicitly so the shuffle is reproducible on purpose.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state=random.getstate())

In [5]:
# Build the token vocabulary from the training split only, capped via
# max_size, and attach the "glove.6B.100d" pretrained vectors to it.
TEXT.build_vocab(train_data, max_size=25000, vectors="glove.6B.100d")
# Build the label vocabulary from the same training split.
LABEL.build_vocab(train_data)
# Exploratory: list the attributes available on the Field object
# (this is what produces the cell output below).
dir(TEXT)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'batch_first',
 'build_vocab',
 'dtype',
 'dtypes',
 'eos_token',
 'fix_length',
 'ignore',
 'include_lengths',
 'init_token',
 'is_target',
 'lower',
 'numericalize',
 'pad',
 'pad_first',
 'pad_token',
 'postprocessing',
 'preprocess',
 'preprocessing',
 'process',
 'sequential',
 'stop_words',
 'tokenize',
 'tokenizer_args',
 'truncate_first',
 'unk_token',
 'use_vocab',
 'vocab',
 'vocab_cls']

In [7]:
# Number of examples per mini-batch for all three iterators.
BATCH_SIZE = 64

# Prefer the GPU when one is visible to PyTorch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# One BucketIterator per split; batches are placed on `device`.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
)


In [128]:
class CNNmodel(nn.Module):
    """Convolutional text classifier with one conv branch per filter height.

    Each branch slides a ``(filter_size, embedding_dim)`` kernel over the
    embedded sentence, applies ReLU, and max-pools over the whole sentence.
    The pooled features of all branches are concatenated, passed through
    dropout and projected to ``output_dim`` by a linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding (also the kernel width).
        n_filters: number of output channels of every convolution.
        filter_sizes: sequence of kernel heights (tokens covered per filter);
            any number of sizes is supported.
        output_dim: number of outputs of the final linear layer.
        dropout: dropout probability applied to the concatenated features.

    The convolutions are registered as ``conv_0``, ``conv_1``, ... in the
    order given by ``filter_sizes``.
    """

    def __init__(self, vocab_size,embedding_dim, n_filters, filter_sizes, output_dim, dropout):
        super(CNNmodel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # The kernels are two-dimensional (tokens x embedding), so the layer
        # has to be Conv2d; Conv1d rejects the 4-D input built in forward().
        self._conv_names = []
        for idx, height in enumerate(filter_sizes):
            name = f'conv_{idx}'
            self.add_module(
                name,
                nn.Conv2d(in_channels=1, out_channels=n_filters,
                          kernel_size=(height, embedding_dim)),
            )
            self._conv_names.append(name)
        self.linear = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return raw scores of shape [batch size, output_dim].

        ``x`` holds token indices laid out as [sent len, batch size].  No
        padding is applied by the convolutions, so the sentence length must
        be at least the largest filter size.
        """
        x = x.permute(1, 0)                         # [batch size, sent len]
        embedded = self.embedding(x).unsqueeze(1)   # [batch size, 1, sent len, emb dim]

        pooled = []
        for name in self._conv_names:
            # conv -> [batch size, n_filters, sent len - filter + 1, 1]
            conved = F.relu(getattr(self, name)(embedded)).squeeze(3)
            # max over the whole remaining length -> [batch size, n_filters]
            pooled.append(F.max_pool1d(conved, conved.shape[2]).squeeze(2))

        out = self.dropout(torch.cat(pooled, dim=1))
        return self.linear(out)

In [4]:
# Hyperparameters of the classifier.  These must be live code: the cells
# below use `model`, so leaving them commented out breaks a fresh run.
input_dim = len(TEXT.vocab)
embedding_dim = 100  # has to match the 100-d vectors copied into the embedding layer
n_filters = 100
filter_sizes = [3,4,5]
output_dim = 1
dropout = 0.5

model = CNNmodel(input_dim,embedding_dim, n_filters , filter_sizes, output_dim, dropout)


25002

In [130]:
# Vectors attached to the vocabulary by TEXT.build_vocab(..., vectors=...).
pretrained_embeddings = TEXT.vocab.vectors

# Overwrite the embedding table in place with the pretrained vectors;
# copy_ returns the updated tensor, which is what the cell displays.
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.0854, -0.5470, -0.1147,  ..., -0.0055, -0.1925, -0.1269],
        [-0.1165,  0.7572, -0.0685,  ..., -0.3802,  1.2211,  0.7497],
        [ 0.0892, -0.0228,  0.2573,  ..., -0.5667, -0.6201,  0.3261]])

In [131]:
# Place the model and the loss on the target device *before* creating the
# optimizer, so the optimizer is built over the parameters that are
# actually trained.
model = model.to(device)

# Binary cross-entropy on raw logits (the model emits unnormalised scores).
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

# Adam with its default hyperparameters.
optimizer = optim.Adam(model.parameters())

In [132]:
def binary_accuracy(preds, y):
    """Fraction of examples whose thresholded prediction equals the label.

    ``preds`` are raw logits; they are squashed with a sigmoid and rounded
    to 0/1 before being compared element-wise with ``y``.  The result is a
    scalar tensor: number of matches divided by the size of the first
    dimension.
    """
    probabilities = torch.sigmoid(preds)
    hard_labels = torch.round(probabilities)
    # Cast the boolean mask to float so the ratio below is a float division.
    matches = hard_labels.eq(y).float()
    return matches.sum() / len(matches)

In [133]:
def train(model, iterator, optimizer, criterion):
    """Run one training pass over ``iterator`` and update ``model``.

    For every batch the gradients are cleared, the model is applied to
    ``batch.text``, the loss against ``batch.label`` is back-propagated and
    the optimizer takes one step.

    Returns:
        ``(mean_loss, mean_accuracy)`` where each is the sum of the
        per-batch values divided by ``len(iterator)`` (every batch counts
        equally, whatever its size).
    """
    model.train()

    running_loss, running_acc = 0, 0
    for step_batch in iterator:
        targets = step_batch.label

        optimizer.zero_grad()

        # The model emits one score per example in a trailing dimension of
        # size 1; drop it so the shape matches the labels.
        logits = model(step_batch.text).squeeze(1)
        batch_loss = criterion(logits, targets)
        batch_acc = binary_accuracy(logits, targets)

        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [134]:
def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating it.

    The model is switched to evaluation mode and every batch is processed
    under ``torch.no_grad()``, so no gradients are recorded.

    Returns:
        ``(mean_loss, mean_accuracy)`` where each is the sum of the
        per-batch values divided by ``len(iterator)``.
    """
    model.eval()

    loss_total, acc_total = 0, 0
    with torch.no_grad():
        for held_out in iterator:
            targets = held_out.label
            # Drop the trailing size-1 dimension so scores line up with labels.
            logits = model(held_out.text).squeeze(1)

            loss_total += criterion(logits, targets).item()
            acc_total += binary_accuracy(logits, targets).item()

    batch_count = len(iterator)
    return loss_total / batch_count, acc_total / batch_count

In [135]:
# Number of full passes over the training iterator.
N_EPOCHS = 5

for epoch in range(N_EPOCHS):

    # One optimisation pass, then score the updated model on the validation split.
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    # Epochs are reported 1-based; accuracies are printed as percentages.
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

| Epoch: 01 | Train Loss: 0.509 | Train Acc: 73.97% | Val. Loss: 0.357 | Val. Acc: 84.16% |
| Epoch: 02 | Train Loss: 0.307 | Train Acc: 87.39% | Val. Loss: 0.285 | Val. Acc: 87.92% |
| Epoch: 03 | Train Loss: 0.224 | Train Acc: 91.32% | Val. Loss: 0.266 | Val. Acc: 88.88% |
| Epoch: 04 | Train Loss: 0.151 | Train Acc: 94.48% | Val. Loss: 0.271 | Val. Acc: 89.17% |
| Epoch: 05 | Train Loss: 0.089 | Train Acc: 97.09% | Val. Loss: 0.280 | Val. Acc: 89.35% |


In [136]:
# Final check on the held-out test iterator, using the same evaluate()
# routine that scored the validation split during training.
test_loss, test_acc = evaluate(model, test_iterator, criterion)

# Accuracy is printed as a percentage.
print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')

| Test Loss: 0.299 | Test Acc: 88.76% |
