# pipeline of work with  RNN Steganografic Detector

In [None]:
#! pip install transformers

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.datasets import IMDB
from torchtext.data import Field, LabelField, BucketIterator, TabularDataset

import spacy

import numpy as np

import random
import math
import time

from tqdm import tqdm

from sklearn.metrics import roc_auc_score

import json

SEED = 1234

random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

! python -m spacy download en

spacy_en = spacy.load('en')

You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [213]:
def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings (tokens)
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

SRC = Field(tokenize = tokenize_en,
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = True)

# should be 0 if the sentence is natural and 1 if this is encoded
LABEL = LabelField(dtype = torch.float)

Loading a custom dataset

In [214]:
dir_path = "./data/"
dataset_name = "twitter"

In [215]:
fields = {'text': ("text",SRC), 'label': ("label",LABEL)}

In [216]:
gen_data, = TabularDataset.splits(
                            path = '.',
                            root=".",
                            train = dir_path + dataset_name + "_attacker_data" + "FLC_64.json",
                            format = 'json',
                            fields = fields
)

In [217]:
train_data, valid_data = gen_data.split(random_state = random.seed(SEED))

In [218]:
MAX_VOCAB_SIZE = 15_000

SRC.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE,
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)

LABEL.build_vocab(train_data)

In [219]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [220]:
train_iterator, valid_iterator = BucketIterator.splits(
    (train_data, valid_data), 
    batch_size = BATCH_SIZE,
    sort_key=lambda x: len(x.text), # the BucketIterator needs to be told what function it should use to group the data.
    sort_within_batch=False,
    device = device)


In [None]:
from model import RNNStegaDetector

In [222]:
model = RNNStegaDetector(batch_size=BATCH_SIZE, 
            output_size=1, 
            hidden_size=100, 
            vocab_size=len(SRC.vocab), 
            n_layers=2,
            embedding_length=80,
            pad_idx=SRC.vocab.stoi[SRC.pad_token], 
            dropout=0.5, 
            bidirectional=True)

In [223]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 831,721 trainable parameters


In [224]:
pretrained_embeddings = SRC.vocab.vectors

print(pretrained_embeddings.shape)

torch.Size([5554, 100])


Training procedure for the detector

In [225]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:  
        optimizer.zero_grad()

        # print(batch.text)
        # text, text_lengths = batch.text
        text = batch.text

        predictions = model(text).squeeze(1)

        loss = criterion(predictions, batch.label)

        rounded_preds = torch.round(torch.sigmoid(predictions))
        correct = (rounded_preds == batch.label).float()
        acc = correct.sum() / len(correct)

        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [226]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    roc_auc = 0
    all_target = []
    all_pred_probs = []

    model.eval()

    with torch.no_grad():

        for batch in iterator: 
            text = batch.text
            predictions = model(text).squeeze(1)

            loss = criterion(predictions, batch.label)

            rounded_preds = torch.round(torch.sigmoid(predictions))
            correct = (rounded_preds == batch.label).float()
            acc = correct.sum() / len(correct)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
            
            all_target.extend(batch.label.tolist())
            all_pred_probs.extend(torch.sigmoid(predictions).tolist())


    roc_auc = roc_auc_score(all_target, all_pred_probs)

    return epoch_loss / len(iterator), epoch_acc / len(iterator), roc_auc

In [227]:
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

optimizer = torch.optim.Adam(model.parameters())

In [228]:
import warnings
warnings.filterwarnings('ignore')

In [30]:
#vlc2
N_EPOCHS = 8

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc, valid_roc_auc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    # epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {end_time - start_time} s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% | ROC AUC:{valid_roc_auc:.3f}')
    
model = model.eval()    
torch.save(model, "./models/" + dataset_name + '_attacker_vlc2.pth')

Epoch: 01 | Epoch Time: 0.9952347278594971 s
	Train Loss: 0.615 | Train Acc: 74.53%
	 Val. Loss: 0.499 |  Val. Acc: 65.31% | ROC AUC:0.957
Epoch: 02 | Epoch Time: 0.9470505714416504 s
	Train Loss: 0.059 | Train Acc: 99.22%
	 Val. Loss: 0.597 |  Val. Acc: 70.00% | ROC AUC:0.975
Epoch: 03 | Epoch Time: 0.9175355434417725 s
	Train Loss: 0.027 | Train Acc: 99.35%
	 Val. Loss: 0.564 |  Val. Acc: 74.53% | ROC AUC:0.976
Epoch: 04 | Epoch Time: 0.9790775775909424 s
	Train Loss: 0.018 | Train Acc: 99.56%
	 Val. Loss: 0.522 |  Val. Acc: 76.09% | ROC AUC:0.981
Epoch: 05 | Epoch Time: 0.9635536670684814 s
	Train Loss: 0.035 | Train Acc: 98.79%
	 Val. Loss: 0.329 |  Val. Acc: 81.67% | ROC AUC:0.978
Epoch: 06 | Epoch Time: 0.9345638751983643 s
	Train Loss: 0.031 | Train Acc: 99.14%
	 Val. Loss: 0.368 |  Val. Acc: 79.69% | ROC AUC:0.978
Epoch: 07 | Epoch Time: 0.9509716033935547 s
	Train Loss: 0.025 | Train Acc: 99.57%
	 Val. Loss: 0.708 |  Val. Acc: 69.53% | ROC AUC:0.984
Epoch: 08 | Epoch Time: 0.9