# Zjednodušení textu

In [89]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import os

BASE_PATH = os.getcwd() + "/Data"

BEST_MODEL_PATH = "%s\\best_model.pt" % (BASE_PATH)

TRAIN_DATA_PATH = BASE_PATH + "/train.csv"
VALID_DATA_PATH = BASE_PATH + "/valid.csv"
TEST_DATA_PATH = BASE_PATH + "/test.csv"

TRAIN_FILE_SRC = BASE_PATH + "/wiki.full.aner.ori.train.src"
VALID_FILE_SRC = BASE_PATH + "/wiki.full.aner.ori.valid.src"
TEST_FILE_SRC = BASE_PATH + "/wiki.full.aner.ori.test.src"
TRAIN_FILE_DST = BASE_PATH + "/wiki.full.aner.ori.train.dst"
VALID_FILE_DST = BASE_PATH + "/wiki.full.aner.ori.valid.dst"
TEST_FILE_DST = BASE_PATH + "/wiki.full.aner.ori.test.dst"

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Tokenizer a slovník

V datasetu není použitá jen angličtina, ale i místní názvy a jména v různých jazycích

In [64]:
from collections import Counter
import re
import html

class Vocab:
    def __init__(self, min_freq):
        self.word2idx = {"<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3}
        self.idx2word = {0: "<pad>", 1: "<sos>", 2: "<eos>", 3: "<unk>"}
        self.freq = Counter()
        self.min_freq = min_freq

    def build(self, sentences):
        for s in sentences:
            self.freq.update(Vocab.tokenize(s))
        for word, count in self.freq.items():
            if count >= self.min_freq and word not in self.word2idx:
                idx = len(self.word2idx)
                self.word2idx[word] = idx
                self.idx2word[idx] = word

    def encode(self, sentence):
        tokens = self.tokenize(sentence)
        return [self.word2idx.get(w, self.word2idx["<unk>"]) for w in tokens]

    def decode(self, indices):
        return " ".join([self.idx2word.get(idx, "<unk>") for idx in indices])

    def __len__(self):
        return len(self.word2idx)

    @staticmethod
    def tokenize(sentence):
        s = sentence
        
        # Převede HTML entity na jejich znaky (např. &ndash; → –)
        s = html.unescape(s)
        # Nahrazení -LRB-/-RRB- závorkami
        s = s.replace("-LRB-", "(").replace("-RRB-", ")")
        # Odstranění závorek s nesmyslným obsahem (např. "( , ; , ; )")
        s = re.sub(r'\(\s*[,;:.!? ]+\s*\)', '', s)        
        s = s.lower()
        s = re.sub(r"[^a-z0-9 .!?]", "", s)
        return s.strip().split()

## Dataset
Použit dataset waboucay/wikilarge.

wiki.full.aner.ori.train.src, wiki.full.aner.ori.train.src >> 296403 vět<br>
wiki.full.aner.ori.valid.dst + wiki.full.aner.ori.valid.dst" >> 993 vět<br>
wiki.full.aner.ori.test.src, wiki.full.aner.ori.test.dst >> 360 vět<br>

In [62]:
def convert_to_csv(src_filename, dst_filename, output_filename):
  with open(src_filename, 'r', encoding='utf-8') as f_src:
      src_lines = [line.strip() for line in f_src.readlines()]
  with open(dst_filename, 'r', encoding='utf-8') as f_dst:
      dst_lines = [line.strip() for line in f_dst.readlines()]

  assert len(src_lines) == len(dst_lines), "Počet vět v src a dst nesouhlasí!"

  df = pd.DataFrame({'complex': src_lines, 'simple': dst_lines})

  df.dropna(subset=["complex", "simple"])    
  df["complex"] = df["complex"].astype(str)
  df["simple"] = df["simple"].astype(str)
  df = df[(df["complex"].str.strip() != "") & (df["simple"].str.strip() != "")]

  df.to_csv(output_filename, index=False)
  print(f'Uloženo do souboru: {output_filename}')


if not os.path.isfile(TRAIN_DATA_PATH):
  convert_to_csv(TRAIN_FILE_SRC, TRAIN_FILE_DST, TRAIN_DATA_PATH)

if not os.path.isfile(VALID_DATA_PATH):
  convert_to_csv(VALID_FILE_SRC, VALID_FILE_DST, VALID_DATA_PATH)

if not os.path.isfile(TEST_DATA_PATH):
  convert_to_csv(TEST_FILE_SRC, TEST_FILE_DST, TEST_DATA_PATH)

In [None]:
from torch.utils.data import Dataset, DataLoader

class SimplificationDataset(Dataset):
    def __init__(self, df, vocab, max_len=30):
        self.inputs = df['complex'].tolist()
        self.targets = df['simple'].tolist()
        self.vocab = Vocab()
        self.vocab.build(self + train_df['simple'].tolist())
        self.max_len = max_len

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        src = self.vocab.encode(self.inputs[idx])[:self.max_len-1]
        tgt = self.vocab.encode(self.targets[idx])[:self.max_len-2]
        src = src + [self.vocab.word2idx["<eos>"]]
        tgt = [self.vocab.word2idx["<sos>"]] + tgt + [self.vocab.word2idx["<eos>"]]
        src += [0] * (self.max_len - len(src))
        tgt += [0] * (self.max_len - len(tgt))
        return torch.tensor(src), torch.tensor(tgt)
    
    def get_data_loader(self, batch_size, shuffle):
        return DataLoader(self, batch_size, shuffle)

## Encoder

In [None]:
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers=1, dropout = 0.3):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, emb_dim)        
        self.lstm = nn.LSTM(
            emb_dim,
            hidden_dim,
            num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)

    def forward(self, x):
        emb = self.embed(x)
        _, (h, c) = self.lstm(emb)
        return _, h, c
    
class EncoderWithAttention(Encoder):
    def forward(self, x):
        emb = self.embed(x)
        o, (h, c) = self.lstm(emb)
        return o, h, c

# Decoder

In [66]:
import torch.nn.functional as F

class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers=1):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(
            emb_dim,
            hidden_dim,
            num_layers,
            batch_first=True
        )
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, h, c):
        emb = self.embed(x.unsqueeze(1))
        output, (h, c) = self.lstm(emb, (h, c))
        logits = self.fc(output.squeeze(1))
        return logits, h, c

class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim

    def forward(self, hidden, encoder_outputs):
        # obvykle vezmeme poslední vrstvu a první směr, např. hidden[-1]
        # proto pro jednoduchost počítáme jen s posledním hidden state:
        #   vybereme hidden[-1]
        hidden = hidden[-1]
        
        # Spočítat skóre (batch, seq_len)
        attn_scores = torch.bmm(encoder_outputs, hidden.unsqueeze(2)).squeeze(2)
        
        # Softmax přes seq_len dimenzi
        attn_weights = F.softmax(attn_scores, dim=1)
        
        # Vážený součet encoder outputs
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs).squeeze(1)
        
        return context, attn_weights

    
class DecoderWithAttention(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim + hidden_dim, hidden_dim, batch_first=True)
        self.attention = Attention(hidden_dim)
        self.fc = nn.Linear(hidden_dim * 2, vocab_size)

    def forward(self, input_token, hidden, cell, encoder_outputs):
        embedded = self.embedding(input_token).unsqueeze(1)
        context, attn_weights = self.attention(hidden, encoder_outputs)

        # Spojení embedding a contextu
        rnn_input = torch.cat((embedded, context.unsqueeze(1)), dim=2)

        # LSTM krok
        output, (hidden, cell) = self.lstm(rnn_input, (hidden, cell))

        output = output.squeeze(1)
        output = torch.cat((output, context), dim=1)

        output = self.fc(output)
        return output, hidden, cell, attn_weights

## Model

In [67]:
import random

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self._encoder = encoder
        self._decoder = decoder
        self._device = device

    @property 
    def encoder(self):
        return self._encoder
    
    @property 
    def decoder(self):
        return self._decoder
    
    @property 
    def device(self):
        return self._device

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        batch_size, tgt_len = tgt.size()
        vocab_size = self._decoder.fc.out_features
        outputs = torch.zeros(batch_size, tgt_len, vocab_size).to(self._device)

        h, c = self._encoder(src)
        input = tgt[:, 0]  # <sos>

        for t in range(1, tgt_len):
            output, h, c = self._decoder(input, h, c)
            outputs[:, t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = tgt[:, t] if teacher_force else top1

        return outputs
    
class Seq2SeqWithAttention(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self._encoder = encoder
        self._decoder = decoder
        self._device = device

    @property 
    def encoder(self):
        return self._encoder
    
    @property 
    def decoder(self):
        return self._decoder
    
    @property 
    def device(self):
        return self._device

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        batch_size, tgt_len = tgt.shape
        vocab_size = self._decoder.fc.out_features
        outputs = torch.zeros(batch_size, tgt_len, vocab_size).to(self._device)

        encoder_outputs, h, c = self._encoder(src)
        
        input = tgt[:, 0]  # <sos>

        for t in range(1, tgt_len):
            output, h, c, _ = self._decoder(input, h, c, encoder_outputs)
            outputs[:, t, :] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = tgt[:, t] if teacher_force else top1

        return outputs

## Tréning

In [84]:
class Train:
    def __init__(self, model, dataloader, optimizer, criterion, device, path):
        self._model = model
        self._dataloader = dataloader
        self._optimizer = optimizer
        self._criterion = criterion
        self._device = device
        self._path = path
        self._trace = dict()
        self._max_trace_length = 5
    
    def run(self, epochs):
        self._model.train()
        self._total_loss = 0
        self._trace.clear()
        
        epochs_no_improve = 0
        best_loss = 999999
        best_epoch = 0
                
        for epoch in range(epochs):
            loss = self._epoch()            
            print(f"Epoch {epoch}: loss {loss:.4f}")
            if best_loss > loss:
                best_loss = loss
                best_epoch = epoch
                epochs_no_improve = 0
                torch.save(self._model.state_dict(), self._get_filename())
            else:
                epochs_no_improve += 1                
            if epochs_no_improve >= self._max_trace_length:
                break
            
        print(f"Finished with epoch {best_epoch} with loss {best_loss:.4f}")
        
        self._model.load_state_dict(torch.load(self._get_filename(), weights_only=True))
        
    def _get_filename(self):
        return "%s\\best_model.pt" % (self._path)
    
    def _epoch(self):
        total_loss = 0
        for batch in self._dataloader:
            src, tgt = batch[0].to(self._device), batch[1].to(self._device)
            self._optimizer.zero_grad()
            output = self._model(src, tgt)
            output = output[:, 1:].reshape(-1, output.size(-1))
            tgt = tgt[:, 1:].reshape(-1)
            loss = self._criterion(output, tgt)
            loss.backward()
            self._optimizer.step()
            total_loss += loss.item()
        return total_loss / len(self._dataloader)

class TrainWithAttention(Train):    
    def _epoch(self):
        total_loss = 0
        for batch in self._dataloader:
            src, tgt = batch[0].to(self._device), batch[1].to(self._device)
            self._optimizer.zero_grad()
            
            output = self._model(src, tgt)
            output = output[:, 1:].reshape(-1, output.size(-1))
            tgt = tgt[:, 1:].reshape(-1)
            
            loss = self._criterion(output, tgt)
            loss.backward()
            self._optimizer.step()
            total_loss += loss.item()
        return total_loss / len(self._dataloader)

In [None]:
train_df = pd.read_csv(TRAIN_DATA_PATH)[:10000] # vezmeme mensi podmnozinu pro zrychleni
vocab = Vocab(min_freq=1)
vocab.build(train_df['complex'].tolist() + train_df['simple'].tolist())

train_dataset = SimplificationDataset(train_df, vocab)
train_loader = train_dataset.get_data_loader(batch_size=32, shuffle=True)

EPOCHS = 100

enc = EncoderWithAttention(len(vocab), emb_dim=128, hidden_dim=256)
dec = DecoderWithAttention(len(vocab), emb_dim=128, hidden_dim=256)
model = Seq2SeqWithAttention(enc, dec, DEVICE).to(DEVICE)

optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss(ignore_index=0)

In [86]:
training = Train(model, train_loader, optimizer, criterion, DEVICE, BASE_PATH)
training.run(EPOCHS)

Epoch 0: loss 5.5054
Epoch 1: loss 4.2628
Epoch 2: loss 3.7907
Epoch 3: loss 3.5048
Epoch 4: loss 3.2970
Epoch 5: loss 3.1314
Epoch 6: loss 3.0078
Epoch 7: loss 2.9046
Epoch 8: loss 2.8199
Epoch 9: loss 2.7373
Epoch 10: loss 2.6716
Epoch 11: loss 2.6126
Epoch 12: loss 2.5549
Epoch 13: loss 2.4973
Epoch 14: loss 2.4592
Epoch 15: loss 2.4119
Epoch 16: loss 2.3736
Epoch 17: loss 2.3388
Epoch 18: loss 2.3041
Epoch 19: loss 2.2682
Epoch 20: loss 2.2397
Epoch 21: loss 2.2111
Epoch 22: loss 2.1889
Epoch 23: loss 2.1668
Epoch 24: loss 2.1367
Epoch 25: loss 2.1147
Epoch 26: loss 2.0920
Epoch 27: loss 2.0685
Epoch 28: loss 2.0511
Epoch 29: loss 2.0342
Epoch 30: loss 2.0166
Epoch 31: loss 1.9973
Epoch 32: loss 1.9809
Epoch 33: loss 1.9662
Epoch 34: loss 1.9491
Epoch 35: loss 1.9349
Epoch 36: loss 1.9267
Epoch 37: loss 1.9101
Epoch 38: loss 1.8947
Epoch 39: loss 1.8851
Epoch 40: loss 1.8702
Epoch 41: loss 1.8595
Epoch 42: loss 1.8472
Epoch 43: loss 1.8415
Epoch 44: loss 1.8306
Epoch 45: loss 1.815

In [None]:
from tqdm import tqdm

class Validator:
    def __init__(self, model, path, device):
        self._model = model
        self._path = path
        self._device = device
    
    def run(self, dataloader):
        self._model.load_state_dict(torch.load(self._path, weights_only=True))
        self._model.eval()
        
        with torch.no_grad():
            for batch in tqdm(dataloader):
                src = batch["src"].to(self._device)
                tgt = batch["tgt"]  # jako text, pro referenci

                # předpověď modelem (např. greedy decoding)
                pred_ids = model.greedy_decode(src, max_len=50)  # nebo beam_search
                pred_sentences = tokenizer_tgt.decode_batch(pred_ids)

                predictions.extend(pred_sentences)
                references.extend(batch["tgt_text"])  # originální věty jako list

    return predictions, references
                
        
        

In [87]:
class Evaluator:
    def __init__(self, model, vocab, device, path, max_len=30):
        self._model = model
        self._vocab = vocab
        self._device = device
        self._max_len = max_len
        self._path = path
        
        self._model.load_state_dict(torch.load("%s/best_model.pt" % (self._path), weights_only=True))
    
    def simplify(self, sentence):
        self._model.eval()
        tokens = Vocab.tokenize(sentence)
        
        print("Tokens:")
        print(tokens)
        print([vocab.word2idx.get(tok, vocab.word2idx["<unk>"]) for tok in tokens])
        
        indices = [self._vocab.word2idx["<sos>"]] + [self._vocab.word2idx.get(tok, self._vocab.word2idx["<unk>"]) for tok in tokens] + [self._vocab.word2idx["<eos>"]]

        src_tensor = torch.LongTensor(indices).unsqueeze(0).to(self._device)
        print(src_tensor)
        
        with torch.no_grad():
            encoder_output, hidden, cell = self._model.encoder(src_tensor)
            input_token = torch.LongTensor([self._vocab.word2idx["<sos>"]]).to(self._device)
            outputs = []

            for _ in range(self._max_len):
                output, hidden, cell, _ = self._model.decoder(input_token, hidden, cell, encoder_output)
                top1 = output.argmax(1)
                word = self._vocab.idx2word.get(top1.item(), "<unk>")
                if word == "<eos>":
                    break
                outputs.append(word)
                input_token = top1

        return " ".join(outputs)
        

In [88]:
eval = Evaluator(model, vocab, DEVICE, BASE_PATH)

sentence = "Alessandro -LRB- '' Sandro '' -RRB- Mazzola -LRB- born 8 November 1942 -RRB- is an Italian former football player ."
eval.simplify(sentence)

Tokens:
['alessandro', 'sandro', 'mazzola', 'born', '8', 'november', '1942', 'is', 'an', 'italian', 'former', 'football', 'player', '.']
[6483, 33141, 100289, 597, 197, 1476, 3708, 5, 162, 467, 2765, 186, 668, 33]
tensor([[     1,   6483,  33141, 100289,    597,    197,   1476,   3708,      5,
            162,    467,   2765,    186,    668,     33,      2]],
       device='cuda:0')


'math mazzola born 8 november 1942 is a former italian football player .'

In [None]:
tokens = Vocab.tokenize("Alessandro -LRB- '' Sandro '' -RRB- Mazzola -LRB- born 8 November 1942 -RRB- is an Italian former football player .")
print("Tokens:")
print(tokens)
print([vocab.word2idx.get(tok, vocab.word2idx["<unk>"]) for tok in tokens])

['alessandro', 'lrb', 'sandro', 'rrb', 'mazzola', 'lrb', 'born', '8', 'november', '1942', 'rrb', 'is', 'an', 'italian', 'former', 'football', 'player', '.']
[6485, 68, 3, 74, 3, 68, 599, 199, 1478, 3710, 74, 5, 164, 469, 2767, 188, 670, 33]


## Výsledek po 100 epochách nad celým korpusem:
Doba běhu: 4743 minut
```
Epoch 0: loss 5.5054
Epoch 1: loss 4.2628
Epoch 2: loss 3.7907
Epoch 3: loss 3.5048
Epoch 4: loss 3.2970
Epoch 5: loss 3.1314
Epoch 6: loss 3.0078
Epoch 7: loss 2.9046
Epoch 8: loss 2.8199
Epoch 9: loss 2.7373
Epoch 10: loss 2.6716
Epoch 11: loss 2.6126
Epoch 12: loss 2.5549
Epoch 13: loss 2.4973
Epoch 14: loss 2.4592
Epoch 15: loss 2.4119
Epoch 16: loss 2.3736
Epoch 17: loss 2.3388
Epoch 18: loss 2.3041
Epoch 19: loss 2.2682
Epoch 20: loss 2.2397
Epoch 21: loss 2.2111
Epoch 22: loss 2.1889
Epoch 23: loss 2.1668
Epoch 24: loss 2.1367
Epoch 25: loss 2.1147
Epoch 26: loss 2.0920
Epoch 27: loss 2.0685
Epoch 28: loss 2.0511
Epoch 29: loss 2.0342
Epoch 30: loss 2.0166
Epoch 31: loss 1.9973
Epoch 32: loss 1.9809
Epoch 33: loss 1.9662
Epoch 34: loss 1.9491
Epoch 35: loss 1.9349
Epoch 36: loss 1.9267
Epoch 37: loss 1.9101
Epoch 38: loss 1.8947
Epoch 39: loss 1.8851
Epoch 40: loss 1.8702
Epoch 41: loss 1.8595
Epoch 42: loss 1.8472
Epoch 43: loss 1.8415
Epoch 44: loss 1.8306
Epoch 45: loss 1.8155
Epoch 46: loss 1.8100
Epoch 47: loss 1.7983
Epoch 48: loss 1.7897
Epoch 49: loss 1.7767
Epoch 50: loss 1.7721
Epoch 51: loss 1.7645
Epoch 52: loss 1.7591
Epoch 53: loss 1.7454
Epoch 54: loss 1.7382
Epoch 55: loss 1.7269
Epoch 56: loss 1.7225
Epoch 57: loss 1.7170
Epoch 58: loss 1.7124
Epoch 59: loss 1.7022
Epoch 60: loss 1.6959
Epoch 61: loss 1.6866
Epoch 62: loss 1.6756
Epoch 63: loss 1.6733
Epoch 64: loss 1.6677
Epoch 65: loss 1.6675
Epoch 66: loss 1.6599
Epoch 67: loss 1.6546
Epoch 68: loss 1.6538
Epoch 69: loss 1.6422
Epoch 70: loss 1.6412
Epoch 71: loss 1.6447
Epoch 72: loss 1.6308
Epoch 73: loss 1.6223
Epoch 74: loss 1.6157
Epoch 75: loss 1.6158
Epoch 76: loss 1.6082
Epoch 77: loss 1.6075
Epoch 78: loss 1.6004
Epoch 79: loss 1.5969
Epoch 80: loss 1.5984
Epoch 81: loss 1.5909
Epoch 82: loss 1.5857
Epoch 83: loss 1.5805
Epoch 84: loss 1.5828
Epoch 85: loss 1.5810
Epoch 86: loss 1.5753
Epoch 87: loss 1.5710
Epoch 88: loss 1.5686
Epoch 89: loss 1.5652
Epoch 90: loss 1.5590
Epoch 91: loss 1.5521
Epoch 92: loss 1.5513
Epoch 93: loss 1.5494
Epoch 94: loss 1.5478
Epoch 95: loss 1.5431
Epoch 96: loss 1.5360
Epoch 97: loss 1.5325
Epoch 98: loss 1.5347
Epoch 99: loss 1.5293
Finished with epoch 99 with loss 1.5293
```

```
"Alessandro -LRB- '' Sandro '' -RRB- Mazzola -LRB- born 8 November 1942 -RRB- is an Italian former football player ."
```

```
['alessandro', 'lrb', 'sandro', 'rrb', 'mazzola', 'lrb', 'born', '8', 'november', '1942', 'rrb', 'is', 'an', 'italian', 'former', 'football', 'player', '.']
[6485, 68, 3, 74, 3, 68, 599, 199, 1478, 3710, 74, 5, 164, 469, 2767, 188, 670, 33]
```
Tokens:
['alessandro', 'sandro', 'mazzola', 'born', '8', 'november', '1942', 'is', 'an', 'italian', 'former', 'football', 'player', '.']
[6483, 33141, 100289, 597, 197, 1476, 3708, 5, 162, 467, 2765, 186, 668, 33]
tensor([[     1,   6483,  33141, 100289,    597,    197,   1476,   3708,      5,
            162,    467,   2765,    186,    668,     33,      2]],
       device='cuda:0')
'math mazzola born 8 november 1942 is a former italian football player .'
```