# Example pipeline for working with the Steganographic Detector

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.datasets import IMDB
from torchtext.data import Field, LabelField, BucketIterator

import spacy

import random
import math
import time

SEED = 1234

random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

! python -m spacy download en

spacy_en = spacy.load('en')

In [13]:
def tokenize_en(text):
    """
    Split an English string into a list of token strings using the
    tokenizer of the module-level spaCy pipeline `spacy_en`.
    """
    tokens = []
    for token in spacy_en.tokenizer(text):
        tokens.append(token.text)
    return tokens

# Text field: tokenised with spaCy, lower-cased and wrapped in <sos>/<eos>.
# include_lengths = True makes every batch expose batch.text as a
# (token_ids, lengths) pair, which is what the training loop unpacks and
# passes to the model as (text, text_lengths).
SRC = Field(tokenize = tokenize_en,
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = True,
            include_lengths = True)

# Binary target stored as a float: intended to be 0 when the sentence is
# natural and 1 when it carries an encoded (hidden) message.
LABEL = LabelField(dtype = torch.float)

In [72]:
# Quick check of the raw tokenizer: case is preserved here because lowercasing
# is configured on the Field (lower = True), not inside tokenize_en itself.
tokenize_en("Hello, my name is Ilya")

['Hello', ',', 'my', 'name', 'is', 'Ilya']

Using the IMDB dataset as an example

In [17]:
# Load the IMDB train/test splits, processed through the SRC and LABEL fields.
train_data, test_data = IMDB.splits(SRC, LABEL)

# Carve a validation set out of the training data (default split ratio).
# random.seed() returns None, so it cannot serve as the random_state argument
# itself: seed the generator first, then pass the resulting RNG state.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state = random.getstate())

In [21]:
# Upper bound on the number of regular tokens kept in the source vocabulary.
MAX_VOCAB_SIZE = 25_000

# Build the token vocabulary from the training split only and attach the
# 100-dimensional GloVe vectors; unk_init supplies the initialiser used for
# tokens that have no pretrained vector.
SRC.build_vocab(
    train_data,
    max_size=MAX_VOCAB_SIZE,
    vectors="glove.6B.100d",
    unk_init=torch.Tensor.normal_,
)

# The label vocabulary is likewise derived from the training split.
LABEL.build_vocab(train_data)

.vector_cache/glove.6B.zip: 862MB [06:30, 2.21MB/s]                           
100%|█████████▉| 399690/400000 [00:17<00:00, 21710.92it/s]

In [22]:
# Number of examples per mini-batch.
BATCH_SIZE = 64

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [24]:
# One iterator per split, all sharing the same batch size and target device.
# sort_within_batch orders the examples inside each batch; presumably the
# model relies on this to pack padded sequences — confirm against model.py.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    device=device,
)


In [None]:
from model import RNNStegaDetector

In [40]:
# Instantiate the detector. The embedding width matches the 100-dimensional
# GloVe vectors attached to SRC's vocabulary, and the padding index is looked
# up from that same vocabulary. A single output unit is used because training
# applies BCEWithLogitsLoss to one value per example.
model = RNNStegaDetector(
    batch_size=BATCH_SIZE,
    output_size=1,
    hidden_size=256,
    vocab_size=len(SRC.vocab),
    n_layers=2,
    embedding_length=100,
    pad_idx=SRC.vocab.stoi[SRC.pad_token],
    dropout=0.5,
    bidirectional=True,
)

In [41]:
def count_parameters(model):
    """Return the total number of elements across the trainable parameters.

    Only parameters whose ``requires_grad`` flag is set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count, formatted with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 4,811,057 trainable parameters


In [43]:
# Pretrained vectors attached to SRC's vocabulary when it was built.
# NOTE(review): no visible cell copies these vectors into the model's
# embedding layer — confirm whether RNNStegaDetector loads them itself or
# whether that step is missing from this notebook.
pretrained_embeddings = SRC.vocab.vectors

# Print the shape of the vector table as a sanity check.
print(pretrained_embeddings.shape)

torch.Size([25004, 100])


Example of a training procedure


In [68]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return the mean loss and accuracy.

    Args:
        model: module called as ``model(text, text_lengths)``; its output must
            have a singleton dimension at index 1, which is squeezed away to
            give one logit per example.
        iterator: sized iterable of batches. Each batch exposes ``batch.text``
            as a ``(token_ids, lengths)`` pair and ``batch.label`` as the
            float targets.
        optimizer: optimizer holding the model's parameters.
        criterion: loss called as ``criterion(logits, targets)``.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)``, each averaged over the number of
        batches (not weighted by batch size).

    Raises:
        ZeroDivisionError: if ``iterator`` is empty.
    """
    epoch_loss = 0.0
    epoch_acc = 0.0

    # Enable training-mode behaviour (e.g. dropout).
    model.train()

    for batch in iterator:
        optimizer.zero_grad()

        # batch.text is a (token_ids, lengths) pair, so it has to be unpacked
        # before anything tensor-specific is done with it.
        text, text_lengths = batch.text

        predictions = model(text, text_lengths).squeeze(1)

        loss = criterion(predictions, batch.label)
        loss.backward()
        optimizer.step()

        # Accuracy of the logits produced before this step: sigmoid, round to
        # {0, 1}, compare with the targets. No gradient tracking is needed.
        with torch.no_grad():
            rounded_preds = torch.round(torch.sigmoid(predictions))
            acc = (rounded_preds == batch.label).float().mean()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [69]:
def evaluate(model, iterator, criterion):
    """Evaluate the model on every batch and return the mean loss and accuracy.

    Args:
        model: module called as ``model(text, text_lengths)``; its output must
            have a singleton dimension at index 1, which is squeezed away to
            give one logit per example.
        iterator: sized iterable of batches. Each batch exposes ``batch.text``
            as a ``(token_ids, lengths)`` pair and ``batch.label`` as the
            float targets.
        criterion: loss called as ``criterion(logits, targets)``.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)``, each averaged over the number of
        batches (not weighted by batch size).

    Raises:
        ZeroDivisionError: if ``iterator`` is empty.
    """
    epoch_loss = 0.0
    epoch_acc = 0.0

    # Switch to inference-mode behaviour (e.g. dropout disabled).
    model.eval()

    with torch.no_grad():
        for batch in iterator:
            # Unpack both the token ids and the lengths: the model needs the
            # lengths, and they are only available from batch.text.
            text, text_lengths = batch.text
            predictions = model(text, text_lengths).squeeze(1)

            loss = criterion(predictions, batch.label)

            # Sigmoid, round to {0, 1}, and compare with the targets.
            rounded_preds = torch.round(torch.sigmoid(predictions))
            correct = (rounded_preds == batch.label).float()
            acc = correct.sum() / len(correct)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [70]:
# Put the model on the target device before building the optimizer over its
# parameters.
model = model.to(device)

# Binary cross-entropy on raw logits (the sigmoid is applied inside the loss).
criterion = nn.BCEWithLogitsLoss().to(device)

# Adam with its default hyper-parameters.
optimizer = optim.Adam(model.parameters())

In [None]:
# Number of full passes over the training data.
N_EPOCHS = 5

for epoch in range(N_EPOCHS):

    start_time = time.time()

    # One optimisation pass over the training split, then a gradient-free
    # pass over the validation split.
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    # Whole minutes and leftover whole seconds of wall-clock time for the
    # epoch, computed inline so the cell does not depend on an external helper.
    epoch_mins, epoch_secs = divmod(int(end_time - start_time), 60)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')