<a href="https://colab.research.google.com/github/shahriarshayesteh/capstone_project/blob/master/CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:

import torch
import torch.nn.functional as F
from torchtext import data
from torchtext import datasets
import time
import random

torch.backends.cudnn.deterministic = True

In [0]:
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)

VOCABULARY_SIZE = 20000
LEARNING_RATE = 1e-4
BATCH_SIZE = 128
NUM_EPOCHS = 15
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

EMBEDDING_DIM = 128
HIDDEN_DIM = 256
OUTPUT_DIM = 1

In [20]:
TEXT = data.Field(tokenize = 'spacy')
LABEL = data.LabelField(dtype = torch.float)
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
train_data, valid_data = train_data.split(random_state=random.seed(RANDOM_SEED),
                                          split_ratio=0.8)

print(f'Num Train: {len(train_data)}')
print(f'Num Valid: {len(valid_data)}')
print(f'Num Test: {len(test_data)}')

Num Train: 20000
Num Valid: 5000
Num Test: 25000


In [21]:
#TEXT.build_vocab(train_data, max_size=VOCABULARY_SIZE)
MAX_VOCAB_SIZE = 30_000
TEXT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE, 
                 # Load pretrained embeddings
                 vectors = "glove.6B.100d", 
                 # Set unknown vectors
                 unk_init = torch.Tensor.normal_)

LABEL.build_vocab(train_data)

print(f'Vocabulary size: {len(TEXT.vocab)}')
print(f'Number of classes: {len(LABEL.vocab)}')

.vector_cache/glove.6B.zip: 862MB [06:32, 2.20MB/s]                          
100%|█████████▉| 399539/400000 [00:16<00:00, 22580.10it/s]

Vocabulary size: 30002
Number of classes: 2


In [0]:
train_loader, valid_loader, test_loader = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size=BATCH_SIZE,
    device=DEVICE)


train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size=BATCH_SIZE,
    device=DEVICE)

In [23]:
print('Train')
for batch in train_loader:
    print(f'Text matrix size: {batch.text.size()}')
    print(f'Target vector size: {batch.label.size()}')
    break
    
print('\nValid:')
for batch in valid_loader:
    print(f'Text matrix size: {batch.text.size()}')
    print(f'Target vector size: {batch.label.size()}')
    break
    
print('\nTest:')
for batch in test_loader:
    print(f'Text matrix size: {batch.text.size()}')
    print(f'Target vector size: {batch.label.size()}')
    break

Train
Text matrix size: torch.Size([1150, 128])
Target vector size: torch.Size([128])

Valid:
Text matrix size: torch.Size([61, 128])
Target vector size: torch.Size([128])

Test:
Text matrix size: torch.Size([42, 128])
Target vector size: torch.Size([128])


In [24]:
'''
import torch.nn as nn

class CNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):

        #[sentence len, batch size] => [sentence len, batch size, embedding size]
        embedded = self.embedding(text)
        
        #[sentence len, batch size, embedding size] => 
        #  output: [sentence len, batch size, hidden size]
        #  hidden: [1, batch size, hidden size]
        output, hidden = self.rnn(embedded)
        
        return self.fc(hidden.squeeze(0)).view(-1)
      '''
      

'\nimport torch.nn as nn\n\nclass CNN(nn.Module):\n    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):\n        \n        super().__init__()\n        \n        self.embedding = nn.Embedding(input_dim, embedding_dim)\n        self.rnn = nn.RNN(embedding_dim, hidden_dim)\n        self.fc = nn.Linear(hidden_dim, output_dim)\n        \n    def forward(self, text):\n\n        #[sentence len, batch size] => [sentence len, batch size, embedding size]\n        embedded = self.embedding(text)\n        \n        #[sentence len, batch size, embedding size] => \n        #  output: [sentence len, batch size, hidden size]\n        #  hidden: [1, batch size, hidden size]\n        output, hidden = self.rnn(embedded)\n        \n        return self.fc(hidden.squeeze(0)).view(-1)\n      '

In [0]:
import torch.nn as nn

class CNN_Text(nn.Module):
    ''' Define network architecture and forward path. '''
    def __init__(self, vocab_size, 
                 vector_size, n_filters, 
                 filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
        # Create word embeddings from the input words     
        self.embedding = nn.Embedding(vocab_size, vector_size, 
                                      padding_idx = pad_idx)
        
        # Specify convolutions with filters of different sizes (fs)
        self.convs = nn.ModuleList([nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, vector_size)) 
                                    for fs in filter_sizes])
        
        # Add a fully connected layer for final predicitons
        self.linear = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        # Drop some of the nodes to increase robustness in training
        self.dropout = nn.Dropout(dropout)
        
        
        
    def forward(self, text):
        '''Forward path of the network.'''       
        # Get word embeddings and formt them for convolutions
        embedded = self.embedding(text).unsqueeze(1)
        
        # Perform convolutions and apply activation functions
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
            
        # Pooling layer to reduce dimensionality    
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        # Dropout layer
        cat = self.dropout(torch.cat(pooled, dim = 1))
        return self.linear(cat)

In [0]:
INPUT_DIM = len(TEXT.vocab)

# Vector size (lower-dimensional repr. of each word)
EMBEDDING_DIM = 100

# Number of filters
N_FILTERS = 100

# N-grams that we want to analuze using filters
FILTER_SIZES = [1, 2, 3, 4, 5]

# Output of the linear layer (prob of a negative review)
OUTPUT_DIM = 1

# Proportion of units to drop
DROPOUT = 0.5

In [0]:
# Initialize model and load pre-trained embeddings

model = CNN_Text(INPUT_DIM, EMBEDDING_DIM, 
            N_FILTERS, FILTER_SIZES, 
            OUTPUT_DIM, DROPOUT, 0)

model.embedding.weight.data.copy_(TEXT.vocab.vectors)

# Zero the initial weights of the UNKnown and padding tokens.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# The string token used as padding. Default: “<pad>”.
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)
model = model.to(DEVICE)

In [29]:
print(TEXT.vocab.vectors)

tensor([[ 0.3374, -0.1778, -0.3035,  ...,  0.2770,  0.6455, -0.8957],
        [ 0.4928, -0.0141, -0.2747,  ...,  0.0493,  0.8484,  0.4671],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.6423,  0.4794,  0.3905,  ...,  0.2855,  0.2942,  0.3744],
        [-1.4609,  2.1579, -0.5844,  ...,  0.1178, -1.2751, -0.7856],
        [ 0.3146, -0.0125, -0.6412,  ...,  0.4166,  0.6038,  0.0309]])


In [0]:
# Helper functions
def accuracy(preds, y):
    """ Return accuracy per batch. """
    correct = (torch.round(torch.sigmoid(preds)) == y).float() 
    return correct.sum() / len(correct)

def epoch_time(start_time, end_time):
    '''Track training time. '''
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [0]:
def train(model, iterator, optimizer, criterion):
    '''Train the model with specified data, optimizer, and loss function. '''
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        # Reset the gradient to not use them in multiple passes 
        optimizer.zero_grad()
        
        predictions = model(batch.text).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = accuracy(predictions, batch.label)
        
        # Backprop
        loss.backward()
        
        # Optimize the weights
        optimizer.step()
        
        # Record accuracy and loss
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


def evaluate(model, iterator, criterion):
    '''Evaluate model performance. '''
    epoch_loss = 0
    epoch_acc = 0
    
    # Turm off dropout while evaluating
    model.eval()
    
    # No need to backprop in eval
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
import torch.optim as optim

# Network optimizer
optimizer = optim.Adam(model.parameters())

# Loss function
criterion = nn.BCEWithLogitsLoss()

model = model.to(DEVICE)
criterion = criterion.to(DEVICE)

In [41]:
# Training loop
N_EPOCHS = 10

best_valid_loss = float('inf')
val_loss = []
val_acc = []
tr_loss = []
tr_acc = []

for epoch in range(N_EPOCHS):
    
    # Calculate training time
    start_time = time.time()
    
    # Get epoch losses and accuracies 
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Save training metrics
    val_loss.append(valid_loss)
    val_acc.append(valid_acc)
    tr_loss.append(train_loss)
    tr_acc.append(train_acc)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'CNN-model.pt')
    
    print(f'Epoch: {epoch+1:2} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

ValueError: ignored