# Example pipeline for working with the Steganographic Detector

In [None]:
#! pip install transformers

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.datasets import IMDB
from torchtext.data import Field, LabelField, BucketIterator, TabularDataset

import spacy

import numpy as np

import random
import math
import time

from tqdm import tqdm

import json

SEED = 1234

random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

! python -m spacy download en

spacy_en = spacy.load('en')

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [None]:
import numpy as np 
import transformer_generate
from transformer_generate import init_model, generate, decode

In [None]:
from itertools import chain

In [None]:
def tokenize_en(text):
    """
    Split an English string into tokens with the spaCy tokenizer.

    Returns the text of every token as a list of strings, in order.
    """
    tokens = []
    for token in spacy_en.tokenizer(text):
        tokens.append(token.text)
    return tokens

# Text field: tokenised with tokenize_en, lower-cased, and wrapped in
# start-/end-of-sentence markers.
SRC = Field(
    tokenize=tokenize_en,
    lower=True,
    init_token='<sos>',
    eos_token='<eos>',
)

# Target field, stored as floats.
# Stated convention: 0 for a natural sentence, 1 for an encoded one.
# NOTE(review): the relabelling cell further down assigns 1 to the original
# IMDB sentences instead -- confirm which convention is intended.
LABEL = LabelField(dtype=torch.float)

In [None]:
# Show the tokenised text of every generated example.
# NOTE(review): gen_data is only created by the TabularDataset cell further
# down; on a fresh kernel this cell raises NameError until that cell has run.
list(gen_data.text)

[['hello', ',', 'my', 'name', 'is', 'ilya'],
 ['nice', 'to', 'meet', 'you', '!']]

In [None]:
# Show the label of every generated example.
# NOTE(review): gen_data is only created by the TabularDataset cell further
# down; on a fresh kernel this cell raises NameError until that cell has run.
list(gen_data.label)

[0, 0]

Using the IMDB dataset as the source of original sentences

In [None]:
# Load IMDB through the text and label fields.
train_data, test_data = IMDB.splits(SRC, LABEL)

# Carve a validation set out of the training data.
# random.seed() returns None, so passing its result as random_state only
# worked through the side effect of re-seeding the global generator. Seed
# explicitly and hand the resulting generator state to split() instead.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state=random.getstate())

In [None]:
# Number of examples left in the training split once validation is carved out.
len(train_data)

17500

Loading the custom dataset


In [None]:
# For each JSON key: (attribute name given to the example, field that processes it).
fields = dict(
    text=("text", SRC),
    label=("label", LABEL),
)

In [None]:
# Read the generated sentences from JSON. Only a `train` file is passed and the
# result is unpacked as a one-element tuple.
(gen_data,) = TabularDataset.splits(
    path='.',
    root=".",
    format='json',
    fields=fields,
    train='../data/generated.json',
)

In [None]:
# Produces the label 1 for original sentences.
def ones_generator(data):
    """Lazily yield the constant 1 once for every item of ``data``."""
    yield from (1 for _ in data)

In [None]:
# Concatenate the generated and original datasets.
#
# Batches are drawn from `dataset.examples`. Assigning to `train_data.text` or
# `train_data.label` only shadows the dataset's per-field generator with a
# one-shot iterator and leaves the examples untouched, so neither the generated
# sentences nor the new labels would ever reach the iterators. The examples are
# therefore edited directly.

# The IMDB reviews are the original sentences: label every one of them 1, in
# all three splits, so validation and test share the training label set.
# NOTE(review): validation and test then contain original sentences only.
for imdb_split in (train_data, valid_data, test_data):
    for example in imdb_split.examples:
        example.label = 1

# Append the generated examples, which keep the label read from the JSON file.
# Skipping examples that are already present keeps this cell safe to re-run.
already_merged = {id(example) for example in train_data.examples}
train_data.examples.extend(
    example for example in gen_data.examples if id(example) not in already_merged
)

In [None]:
# Upper bound passed to build_vocab as max_size.
MAX_VOCAB_SIZE = 25_000

# Text vocabulary from the training split, paired with the "glove.6B.100d"
# vectors. unk_init is torch.Tensor.normal_ (presumably applied to tokens that
# have no pretrained vector -- confirm against the torchtext documentation).
SRC.build_vocab(
    train_data,
    vectors="glove.6B.100d",
    unk_init=torch.Tensor.normal_,
    max_size=MAX_VOCAB_SIZE,
)

# Label vocabulary from the same split.
LABEL.build_vocab(train_data)

In [None]:
# Number of examples per batch.
BATCH_SIZE = 64

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [None]:
# One iterator per split, placed on `device`, BATCH_SIZE examples per batch,
# with sorting inside each batch enabled.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    device=device,
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
)


In [None]:
from model import RNNStegaDetector

In [None]:
# Build the detector. The vocabulary size and padding index come from the SRC
# vocabulary; embedding_length is 100, the width of the GloVe vectors loaded
# for that vocabulary. The single output unit is squeezed away by the
# train/evaluate functions before the loss is computed.
model = RNNStegaDetector(
    vocab_size=len(SRC.vocab),
    embedding_length=100,
    pad_idx=SRC.vocab.stoi[SRC.pad_token],
    hidden_size=256,
    n_layers=2,
    bidirectional=True,
    dropout=0.5,
    output_size=1,
    batch_size=BATCH_SIZE,
)

In [None]:
def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters.

    Only parameters with ``requires_grad`` set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count, formatted with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 4,811,057 trainable parameters


In [None]:
# Vectors attached to the text vocabulary by build_vocab.
# NOTE(review): no visible cell copies these vectors into the model. Unless
# RNNStegaDetector loads them itself, they are only inspected here -- confirm.
pretrained_embeddings = SRC.vocab.vectors

print(pretrained_embeddings.size())

torch.Size([25004, 100])


Training procedure for the detector

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator`` and report mean metrics.

    For every batch the model is applied to ``batch.text``, the loss against
    ``batch.label`` is back-propagated and the optimizer takes one step.

    Args:
        model: module to train; dimension 1 of its output is squeezed away
            before the comparison with the labels.
        iterator: sized iterable of batches exposing ``.text`` and ``.label``.
        optimizer: optimizer that updates the model's parameters.
        criterion: loss applied to the raw model outputs and the labels.

    Returns:
        ``(mean_loss, mean_accuracy)``: per-batch loss and accuracy averaged
        over the number of batches, so every batch carries equal weight.

    Raises:
        ZeroDivisionError: if ``iterator`` contains no batches.
    """
    model.train()

    batch_losses = []
    batch_accuracies = []

    for batch in iterator:
        optimizer.zero_grad()

        logits = model(batch.text).squeeze(1)
        loss = criterion(logits, batch.label)

        # Sigmoid outputs are rounded to 0/1 and compared with the labels.
        hits = (torch.sigmoid(logits).round() == batch.label).float()
        accuracy = hits.sum() / len(hits)

        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())
        batch_accuracies.append(accuracy.item())

    n_batches = len(iterator)
    return sum(batch_losses) / n_batches, sum(batch_accuracies) / n_batches

In [None]:
def evaluate(model, iterator, criterion):
    """Score the model on ``iterator`` without updating its parameters.

    The model is switched to evaluation mode and every batch is processed
    with gradient tracking disabled.

    Args:
        model: module to evaluate; dimension 1 of its output is squeezed away
            before the comparison with the labels.
        iterator: sized iterable of batches exposing ``.text`` and ``.label``.
        criterion: loss applied to the raw model outputs and the labels.

    Returns:
        ``(mean_loss, mean_accuracy)``: per-batch loss and accuracy averaged
        over the number of batches.

    Raises:
        ZeroDivisionError: if ``iterator`` contains no batches.
    """
    model.eval()

    batch_losses = []
    batch_accuracies = []

    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text).squeeze(1)
            loss = criterion(logits, batch.label)

            # Sigmoid outputs are rounded to 0/1 and compared with the labels.
            hits = (torch.sigmoid(logits).round() == batch.label).float()
            accuracy = hits.sum() / len(hits)

            batch_losses.append(loss.item())
            batch_accuracies.append(accuracy.item())

    n_batches = len(iterator)
    return sum(batch_losses) / n_batches, sum(batch_accuracies) / n_batches

In [None]:
# Put the detector on the selected device.
model = model.to(device)

# Binary cross-entropy computed on raw logits, moved to the same device.
criterion = nn.BCEWithLogitsLoss().to(device)

# Adam with its default settings over all model parameters.
optimizer = optim.Adam(model.parameters())

In [None]:
# Train for N_EPOCHS epochs; after each one, score the model on the validation
# split and print both sets of metrics together with the elapsed time.
# NOTE(review): this cell was originally labelled "usual sentiment analysis" --
# the recorded output may stem from a plain IMDB sentiment run; confirm.
N_EPOCHS = 5

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    # The epoch time is printed as raw, unrounded seconds.
    print(f'Epoch: {epoch+1:02} | Epoch Time: {end_time - start_time} s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 96.55485939979553 s
	Train Loss: 0.679 | Train Acc: 56.77%
	 Val. Loss: 0.671 |  Val. Acc: 59.24%
Epoch: 02 | Epoch Time: 96.73384046554565 s
	Train Loss: 0.656 | Train Acc: 61.49%
	 Val. Loss: 0.635 |  Val. Acc: 63.05%
Epoch: 03 | Epoch Time: 96.63778114318848 s
	Train Loss: 0.544 | Train Acc: 72.76%
	 Val. Loss: 0.468 |  Val. Acc: 77.82%
Epoch: 04 | Epoch Time: 96.41710114479065 s
	Train Loss: 0.452 | Train Acc: 79.33%
	 Val. Loss: 0.386 |  Val. Acc: 83.09%
Epoch: 05 | Epoch Time: 96.74099159240723 s
	Train Loss: 0.383 | Train Acc: 83.44%
	 Val. Loss: 0.335 |  Val. Acc: 85.70%


In [None]:
# Validation loss and accuracy left over from the last epoch of the loop above.
valid_loss, valid_acc

(0.33492719855601505, 0.8569915254237288)