In [1]:
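# One-time setup: download the spaCy English model required by the tokenizer used below.
# Uncomment and run once in a fresh environment:
# !python -m spacy download en_core_web_sm
# import spacy
# spacy.load('en_core_web_sm')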

In [2]:
import spacy
import torch
import torchtext
from torchtext.legacy import datasets, data
import torch.nn.functional as F

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
# Torchtext fields describe how raw text and labels are converted into tensors.
# Reviews are tokenised with spaCy's small English model; fix_length=100 makes every
# example in a batch exactly 100 tokens long (longer reviews are cut, shorter ones padded).
text_field = data.Field(
    tokenize="spacy",
    tokenizer_language="en_core_web_sm",
    fix_length=100,
)
# Labels are kept as floats because the loss used later in this notebook
# (BCEWithLogitsLoss) expects floating-point targets.
label_field = data.LabelField(dtype=torch.float)

# Load the IMDB movie-review dataset as its train and test portions.
train, test = datasets.IMDB.splits(text_field=text_field, label_field=label_field)

In [4]:
# Hold out 20% of the training portion for validation.
# The IMDB training portion contains 25k reviews, so the split yields 20k / 5k examples.
train_set, valid_set = train.split(0.8)
assert (len(train_set), len(valid_set)) == (20_000, 5_000)

# Build both vocabularies from the training split only, so that no tokens from the
# validation split influence the vocabulary.
text_field.build_vocab(train_set, max_size=25_000)
label_field.build_vocab(train_set)

# 25_000 most frequent tokens plus the two special tokens (<unk> and <pad>).
assert len(text_field.vocab) == 25_002

In [5]:
# Map int to string and string to int
# text_field.vocab.itos[186] -> 'though'
# text_field.vocab.stoi['though'] -> 186

In [6]:
# Peek at the first ten vocabulary entries (index -> token).
text_field.vocab.itos[:10]

['<unk>', '<pad>', 'the', ',', '.', 'a', 'and', 'of', 'to', 'is']

In [7]:
# Token count of the longest review in the training split.
# NOTE(review): this is not the last expression of the cell, so the value is computed
# but never displayed.
max(len(example.text) for example in train_set)

# Wrap all three splits in BucketIterators that yield batches of 64 examples on `device`.
train_buckets, valid_buckets, test_buckets = data.BucketIterator.splits(
    (train_set, valid_set, test),
    batch_size=64,
    device=device,
)

In [8]:
from torch import nn


class NLPModule(nn.Module):
    """Text classifier: embedding -> single-layer RNN -> linear head.

    Args:
        num_embedding: number of rows in the embedding table (vocabulary size).
        embedding_dim: length of each token embedding vector.
        hidden_size: width of the RNN hidden state.
        out_features: number of values the linear head produces per example.
    """

    def __init__(self, num_embedding, embedding_dim, hidden_size, out_features):
        super().__init__()
        # Embedding layer: maps token indices to vectors in a learned
        # "meaning space" of words.
        self.embedding = nn.Embedding(num_embeddings=num_embedding, embedding_dim=embedding_dim)

        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_size, num_layers=1)
        self.linear = nn.Linear(in_features=hidden_size, out_features=out_features)

    def forward(self, input):
        """Return the linear head applied to the RNN's final hidden state.

        ``input`` holds token indices laid out as (seq_len, batch), since the RNN is
        built with the default ``batch_first=False``. The result has shape
        (1, batch, out_features) because the final hidden state keeps a leading
        ``num_layers`` dimension of size 1.
        """
        # For this single-layer RNN the final hidden state equals the last time step
        # of the per-step outputs, so the per-step outputs are not needed.
        _, final_hidden = self.rnn(self.embedding(input))
        return self.linear(final_hidden)


class NLPModuleLSTM(nn.Module):
    """Text classifier: embedding -> two-layer LSTM -> dropout -> linear head.

    Args:
        num_embedding: number of rows in the embedding table (vocabulary size).
        embedding_dim: length of each token embedding vector.
        hidden_size: width of the LSTM hidden state (per layer).
        out_features: number of values the linear head produces per example.
    """

    def __init__(self, num_embedding, embedding_dim, hidden_size, out_features):
        super().__init__()
        # Embedding layer: maps token indices to vectors in a learned
        # "meaning space" of words.
        self.embedding = nn.Embedding(num_embeddings=num_embedding, embedding_dim=embedding_dim)

        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, num_layers=2)
        # The head consumes two concatenated hidden states, hence hidden_size * 2.
        self.linear = nn.Linear(in_features=hidden_size * 2, out_features=out_features)
        self.dropout = nn.Dropout()

    def forward(self, input):
        """Return one row of ``out_features`` values per example: (batch, out_features).

        ``input`` holds token indices laid out as (seq_len, batch), since the LSTM is
        built with the default ``batch_first=False``.
        """
        embedded = self.embedding(input)
        # The LSTM returns (per-step outputs, (final hidden state, final cell state));
        # only the final hidden state, shaped (num_layers, batch, hidden_size), is used.
        _, (final_hidden, _final_cell) = self.lstm(embedded)
        # NOTE(review): the LSTM is unidirectional, so indices -2 and -1 select the final
        # hidden states of layer 1 and layer 2 (not forward/backward directions) --
        # confirm this is the intended feature vector.
        layer_states = torch.cat([final_hidden[-2], final_hidden[-1]], dim=1)
        return self.linear(self.dropout(layer_states))

In [9]:
# Model hyperparameters.
num_embedding = len(text_field.vocab)  # one embedding row per vocabulary entry
embedding_dim = 100  # length of each token vector
hidden_size = 256  # width of the LSTM hidden state
out_features = 1  # a single logit per review

model = NLPModuleLSTM(
    num_embedding=num_embedding,
    embedding_dim=embedding_dim,
    hidden_size=hidden_size,
    out_features=out_features,
)

In [10]:
def policz(mod):
    """Count the scalar parameters of ``mod`` ("policz" is Polish for "count").

    Every tensor returned by ``mod.parameters()`` is included, whether or not it
    requires gradients. Returns 0 for a module without parameters.
    """
    total = 0
    for parameter in mod.parameters():
        total += parameter.numel()
    return total


# Total number of parameters in the model built above.
policz(model)

3393641

In [11]:
# Optimisation setup: the optimiser searches for a minimum of the cost (loss) function.
# Adam, an adaptive variant of stochastic gradient descent, is used with default settings.

import torch.optim as optim

optimiser = optim.Adam(model.parameters())

# Binary cross-entropy that takes raw logits; the sigmoid is applied inside the loss.
criterion = nn.BCEWithLogitsLoss()

In [12]:
# Move the loss module and the model to the selected device (GPU when available).
criterion = criterion.to(device)
model = model.to(device)

def binary_accuracy(prediction, target):
    """Return the fraction of binary predictions that match ``target``.

    Args:
        prediction: tensor of raw logits (not probabilities).
        target: tensor of 0/1 labels, broadcastable against ``prediction``.

    Returns:
        A scalar float tensor in [0, 1].
    """
    # torch.sigmoid replaces the deprecated torch.nn.functional.sigmoid.
    probabilities = torch.sigmoid(prediction)
    # Rounding turns probabilities into hard 0/1 decisions; a probability of exactly
    # 0.5 (logit 0) rounds to 0 because torch.round rounds halves to even.
    decisions = torch.round(probabilities)

    compared = (decisions == target).float()
    return torch.mean(compared)


# Quick sanity check. The first argument is treated as logits: sigmoid(0) = 0.5 rounds
# to 0 and every positive logit rounds to 1, so all five predictions match the targets.
T = torch.tensor
binary_accuracy(T([0, 0.5, .2, 0.001, 0.8]), T([0, 1, 1, 1, 1]))



tensor(1.)

In [13]:
import numpy as np
import tqdm

def train(mod, data, optimiser, criterion):
    """Run one training pass over ``data`` and update ``mod`` in place.

    Args:
        mod: model to optimise; called as ``mod(batch.text)``.
        data: iterable of batches exposing ``.text`` and ``.label``.
        optimiser: optimiser whose ``zero_grad``/``step`` are called once per batch.
        criterion: loss callable invoked as ``criterion(output, label)``.

    Returns:
        ``(losses, metrics)``: per-batch loss values and per-batch accuracies,
        both as lists of Python floats.
    """
    losses = []
    metrics = []

    # Switch to training mode so that layers such as dropout are active.
    mod.train()

    for bucket in tqdm.tqdm(data):
        optimiser.zero_grad()
        # Align the model output with the label shape. This covers both
        # (1, batch, 1) and (batch, 1) outputs and, unlike chained squeeze calls,
        # also works when a batch contains a single example.
        output = mod(bucket.text).reshape(bucket.label.shape)
        loss = criterion(output, bucket.label)
        metric = binary_accuracy(output, bucket.label)
        losses.append(loss.item())
        metrics.append(metric.item())
        loss.backward()
        optimiser.step()

    return losses, metrics

In [14]:
def validate(mod, data, criterion):
    """Evaluate ``mod`` on ``data`` without updating its parameters.

    Args:
        mod: model to evaluate; called as ``mod(batch.text)``.
        data: iterable of batches exposing ``.text`` and ``.label``.
        criterion: loss callable invoked as ``criterion(output, label)``.

    Returns:
        ``(losses, metrics)``: per-batch loss values and per-batch accuracies,
        both as lists of Python floats.
    """
    losses = []
    metrics = []

    # Evaluation mode switches layers such as dropout to their inference behaviour.
    # It does not disable gradient tracking, which is done by torch.no_grad() below.
    mod.eval()

    with torch.no_grad():
        for bucket in tqdm.tqdm(data):
            # Align the model output with the label shape. This covers both
            # (1, batch, 1) and (batch, 1) outputs and, unlike chained squeeze calls,
            # also works when a batch contains a single example.
            output = mod(bucket.text).reshape(bucket.label.shape)
            loss = criterion(output, bucket.label)
            metric = binary_accuracy(output, bucket.label)
            losses.append(loss.item())
            metrics.append(metric.item())

    return losses, metrics



In [15]:
# Train for five epochs; after each one, report the mean loss and mean accuracy
# on the training and validation batches.
for i in range(5):
    train_losses, train_metrics = train(
        mod=model, data=train_buckets, optimiser=optimiser, criterion=criterion
    )
    validated_losses, validated_metrics = validate(
        mod=model, data=valid_buckets, criterion=criterion
    )

    print()
    print("Train metrics", np.mean(train_losses), np.mean(train_metrics))
    print("Validation metrics", np.mean(validated_losses), np.mean(validated_metrics))


100%|██████████| 313/313 [00:05<00:00, 54.33it/s]
100%|██████████| 79/79 [00:00<00:00, 142.36it/s]
  2%|▏         | 5/313 [00:00<00:07, 43.34it/s]


Train metrics 0.6911784246706734 0.5428813897763578
Validation metrics 0.693934605091433 0.520371835443038


100%|██████████| 313/313 [00:05<00:00, 54.92it/s]
100%|██████████| 79/79 [00:00<00:00, 146.50it/s]
  1%|▏         | 4/313 [00:00<00:08, 38.27it/s]


Train metrics 0.6942560600396543 0.5196685303514377
Validation metrics 0.6885908042328267 0.53125


100%|██████████| 313/313 [00:05<00:00, 54.21it/s]
100%|██████████| 79/79 [00:00<00:00, 142.88it/s]
  2%|▏         | 5/313 [00:00<00:07, 42.35it/s]


Train metrics 0.6833530725381626 0.5558107028753994
Validation metrics 0.6809024735342099 0.5814873417721519


100%|██████████| 313/313 [00:05<00:00, 53.78it/s]
100%|██████████| 79/79 [00:00<00:00, 146.08it/s]
  2%|▏         | 5/313 [00:00<00:06, 44.85it/s]


Train metrics 0.6288763372281108 0.647064696485623
Validation metrics 0.6289972387537172 0.6572389240506329


100%|██████████| 313/313 [00:05<00:00, 53.76it/s]
100%|██████████| 79/79 [00:00<00:00, 143.64it/s]


Train metrics 0.46747046947098386 0.7854932108626198
Validation metrics 0.4758951554570017 0.7792721518987342





In [16]:

# Cost function demo: with the target fixed at 1, the loss shrinks as the logit grows.
target = torch.ones([1, 1], dtype=torch.float32)  # one example with one target value

for logit in (0.1, 0.4, 0.7, 0.9):
    input_ = torch.full([1, 1], logit)  # a prediction (logit)
    print(F.binary_cross_entropy_with_logits(input_, target))

target, input_

tensor(0.6444)
tensor(0.5130)
tensor(0.4032)
tensor(0.3412)


(tensor([[1.]]), tensor([[0.9000]]))

In [17]:
# Minimal nn.RNN example: a single-layer RNN with 3 input features and hidden size 2.
rnn = nn.RNN(3, 2, 1)
input = torch.randn(5, 3, 3)  # (seq_len=5, batch=3, input_size=3)
h0 = torch.randn(1, 3, 2)  # initial hidden state: (num_layers=1, batch=3, hidden_size=2)
output, hn = rnn(input, h0)
# `output` holds the hidden state at every time step; `hn` is the state after the
# last step, i.e. it equals output[-1] for this single-layer RNN.
output, hn

(tensor([[[-0.8282, -0.1208],
          [-0.2118,  0.8283],
          [ 0.0370,  0.9418]],
 
         [[-0.4396,  0.7438],
          [ 0.0233,  0.9387],
          [ 0.6401,  0.9846]],
 
         [[-0.4712,  0.8235],
          [ 0.3112,  0.9545],
          [ 0.8612,  0.9959]],
 
         [[ 0.2969,  0.9496],
          [ 0.2175,  0.9487],
          [ 0.7593,  0.9900]],
 
         [[ 0.0371,  0.9331],
          [ 0.4106,  0.9637],
          [-0.0856,  0.9606]]], grad_fn=<StackBackward>),
 tensor([[[ 0.0371,  0.9331],
          [ 0.4106,  0.9637],
          [-0.0856,  0.9606]]], grad_fn=<StackBackward>))

In [18]:
# An embedding table with 100 entries, each a vector of size 19.
embedding = nn.Embedding(100, 19)
# A batch of 2 samples of 8 indices each.
input = torch.LongTensor([[1, 98, 1, 0, 4, 3, 2, 9], [4, 3, 2, 9, 4, 3, 2, 9]])
# The lookup returns one 19-dimensional vector per index: shape (2, 8, 19).
embedding(input)

tensor([[[ 2.0889,  0.3945, -1.1772, -0.6415, -0.0728, -1.1369,  1.0803,
          -0.6445,  0.9511, -0.7240, -0.2653, -0.2707,  0.5510,  1.1052,
          -0.1564, -0.4860, -0.6715,  0.5398, -0.6071],
         [-0.5348,  0.1082,  2.0761,  1.6011, -0.5353,  1.5460,  1.2302,
          -0.3847,  0.5380, -0.5662,  2.0162, -0.2915,  0.6697,  1.4221,
           0.0464,  1.0514,  0.0864,  1.3192,  0.7933],
         [ 2.0889,  0.3945, -1.1772, -0.6415, -0.0728, -1.1369,  1.0803,
          -0.6445,  0.9511, -0.7240, -0.2653, -0.2707,  0.5510,  1.1052,
          -0.1564, -0.4860, -0.6715,  0.5398, -0.6071],
         [-1.3756,  0.8178, -0.8051, -1.9905, -0.7490,  0.2198,  1.6624,
           0.7889, -0.0715, -0.9256, -0.0417,  0.7219, -1.7215, -1.6133,
           0.1947,  0.0199,  0.2287,  0.5282,  1.6839],
         [ 0.7632, -0.3519,  0.0248, -1.1205,  0.5446, -0.9172, -0.0353,
          -2.3555, -0.6074, -1.5454, -1.2395,  0.4797,  1.4636, -0.2845,
          -1.0257,  1.8475, -0.1952, -0.8892, 