In [47]:
alphabet = 'йцукенгшщзхъфывапролджэячсмитьбю '

# Indices 0-2 are reserved for the special tokens; the characters of
# `alphabet` are numbered from 3 upwards in the order they are written above.
idx2char = dict(enumerate(['§', 'SOS', 'EOS']))
idx2char.update(enumerate(alphabet, start=len(idx2char)))

# The reverse mapping is derived from idx2char so the two can never disagree.
char2idx = {token: index for index, token in idx2char.items()}

In [170]:
def filter_sentence(sentence):
  """
  Remove every symbol that is not part of `alphabet` from a sentence.

  Only the characters listed in the module-level `alphabet` survive. The
  special tokens stored in `char2idx` ('§', 'SOS', 'EOS') are not accepted:
  a literal '§' in the text would otherwise be encoded as the reserved
  index 0 by `sentence2nums`.

  Parameters
  ----------
  sentence: str
      Text to clean. Letter case is not changed here.

  Returns
  -------
  str
      `sentence` with all out-of-alphabet symbols dropped, order preserved.
  """
  # A set gives constant-time membership checks instead of scanning a list.
  allowed = set(alphabet)
  return ''.join(symbol for symbol in sentence if symbol in allowed)

def sentence2nums(sentence):
  """
  Encode a sentence as a list of vocabulary indices.

  Every symbol of `sentence` is looked up in `char2idx`; a symbol that is
  missing from the dictionary raises KeyError.
  """
  return list(map(char2idx.__getitem__, sentence))

def nums2sentence(nums):
  """
  Decode a sequence of vocabulary indices back into text.

  Every index is looked up in `idx2char` and the resulting tokens are
  concatenated without a separator; an unknown index raises KeyError.
  """
  symbols = (idx2char[code] for code in nums)
  return ''.join(symbols)

In [171]:
import random

def add_mistakes(sentence):
  """
  Randomly corrupt a sentence to imitate typing mistakes.

  Four corruption steps are attempted in order. Each one fires when an
  independent draw of `random.random()` exceeds 0.5:

    1. delete one random symbol;
    2. overwrite one random symbol with a random symbol of `alphabet`;
    3. swap two randomly chosen symbols (both picks may land on the same
       position, which leaves the sentence unchanged);
    4. delete one randomly chosen whitespace.

  A step that cannot be applied (the sentence is empty, or it contains no
  whitespace) is skipped instead of raising.

  Parameters
  ----------
  sentence: str
      Text to corrupt.

  Returns
  -------
  str
      The corrupted text; it may equal the input when no step changed it.
  """
  symbols = list(sentence)
  # The coin is always flipped first, so the stream of random draws for
  # inputs that never hit an edge case is the same as without the guards.

  # delete random symbol
  if random.random() > 0.5 and symbols:
    del symbols[random.randint(0, len(symbols) - 1)]
  # replace random symbol with random symbol of the alphabet
  if random.random() > 0.5 and symbols:
    position = random.randint(0, len(symbols) - 1)
    replacement = random.randint(0, len(alphabet) - 1)
    symbols[position] = alphabet[replacement]
  # swap two random symbols
  if random.random() > 0.5 and symbols:
    first = random.randint(0, len(symbols) - 1)
    second = random.randint(0, len(symbols) - 1)
    symbols[first], symbols[second] = symbols[second], symbols[first]
  # delete random whitespace
  if random.random() > 0.5:
    space_positions = [i for i, symbol in enumerate(symbols) if symbol == ' ']
    # random.choice raises IndexError on an empty list, so skip when there
    # is no whitespace to remove.
    if space_positions:
      del symbols[random.choice(space_positions)]
  return ''.join(symbols)

In [167]:
# Smoke test of add_mistakes on a sample sentence. Every corruption step fires
# only with probability 0.5, and the swap step may pick the same position
# twice, so an unchanged result (as in the recorded output below) is possible.
# NOTE(review): this cell's execution count (167) is lower than that of the
# cell defining add_mistakes (171), so the stored output may come from an
# earlier version of the function; confirm by re-running.
add_mistakes('родился  октября  года в послке бугач  пригороде красноярска')

'родился  октября  года в послке бугач  пригороде красноярска'

In [177]:
import torch

def form_batches(file, batch_size, num_of_words):
  """Build noisy input batches and clean target batches from a text file.

  Non-empty lines are stripped, lower-cased, passed through
  `filter_sentence` and joined with single spaces into one long string.
  That string is cut into chunks ("sentences"): a chunk is closed right
  after its (num_of_words + 1)-th space, so it keeps that trailing space.
  Each chunk is encoded with `sentence2nums` as the target, then sent
  through `add_mistakes` twice and encoded again as the model input.
  Every `batch_size` consecutive chunks form one batch. A trailing
  unfinished chunk and a trailing incomplete batch are discarded.

  Parameters
  ----------
  file: iterable of str
      Source of text lines, e.g. an open text file
  batch_size: int
      Number of chunks per batch
  num_of_words: int
      Chunk size control: a chunk ends after num_of_words + 1 spaces

  Returns
  -------
  batches: list of lists of torch.Tensor
      Encoded chunks with mistakes, grouped by batch
  batches_lengths: list of lists of int
      Length in symbols of every chunk in `batches`
  targets: list of lists of torch.Tensor
      Encoded chunks without mistakes, aligned with `batches`
  targets_lengths: list of lists of int
      Length in symbols of every chunk in `targets`
  """
  # Collect the cleaned text as a single space-separated string.
  cleaned_lines = [
    filter_sentence(raw_line.strip().lower())
    for raw_line in file
    if raw_line.strip()
  ]
  text = ' '.join(cleaned_lines)

  # Finished batches.
  batches, batches_lengths = [], []
  targets, targets_lengths = [], []
  # Batch currently being filled.
  noisy_batch, noisy_lengths = [], []
  clean_batch, clean_lengths = [], []

  spaces_per_chunk = num_of_words + 1
  spaces_seen = 0
  chunk_start = 0
  for position, symbol in enumerate(text):
    if symbol == ' ':
      spaces_seen += 1
    if spaces_seen != spaces_per_chunk:
      continue

    # The chunk is complete: it spans from chunk_start up to and including
    # the current symbol.
    spaces_seen = 0
    clean = text[chunk_start:position + 1]
    chunk_start = position + 1

    clean_batch.append(torch.tensor(sentence2nums(clean)))
    clean_lengths.append(len(clean))
    # Two corruption passes are applied to every chunk.
    noisy = add_mistakes(add_mistakes(clean))
    noisy_batch.append(torch.tensor(sentence2nums(noisy)))
    noisy_lengths.append(len(noisy))

    if len(noisy_batch) == batch_size:
      # The batch is full: store it and start a new one.
      batches.append(noisy_batch)
      targets.append(clean_batch)
      batches_lengths.append(noisy_lengths)
      targets_lengths.append(clean_lengths)
      noisy_batch, noisy_lengths = [], []
      clean_batch, clean_lengths = [], []
  return batches, batches_lengths, targets, targets_lengths

In [178]:
# Build the training data from the corpus: 50 chunks per batch, each chunk
# closed after 15 + 1 spaces (see form_batches). form_batches reads the whole
# file before returning, so the handle can be closed right afterwards.
# NOTE(review): no encoding is passed, so the platform default is used;
# confirm the encoding of 'idiot.txt' and pass it explicitly if needed.
# NOTE(review): add_mistakes draws from the `random` module and no seed is set
# in the visible cells, so the corrupted batches differ between runs.
with open('idiot.txt', 'r') as corpus_file:
  bs, in_lengths, ts, t_lengths = form_batches(corpus_file, 50, 15)

In [179]:
# Stack the corrupted sentences of the first batch into one tensor with the
# sentence index first. Shorter rows are right-filled with 0, the index that
# idx2char decodes as '§'.
a = torch.nn.utils.rnn.pad_sequence(
  bs[0],
  batch_first=True,
  padding_value=0,
)

In [180]:
# Stack the clean target sentences of the first batch into one tensor with the
# sentence index first. Shorter rows are right-filled with 0, the index that
# idx2char decodes as '§'.
t = torch.nn.utils.rnn.pad_sequence(
  ts[0],
  batch_first=True,
  padding_value=0,
)

In [158]:
# Unpadded length (in symbols) of target sentence 2 in batch 0, as recorded by
# form_batches.
# NOTE(review): the execution counts of this and the following inspection
# cells (158-161) are lower than those of the cells that build `t` and `a`
# (177-180), so the stored outputs may be stale; confirm by re-running.
t_lengths[0][2]

105

In [159]:
# Padded length of target row 2: pad_sequence extends every row of `t` to the
# longest sentence of the batch, so this can exceed t_lengths[0][2].
t[2].shape

torch.Size([130])

In [160]:
# Decode target row 2 back to text; the trailing '§' symbols are the padding
# (index 0) added by pad_sequence.
nums2sentence(t[2].tolist())

'влево от дороги трудно было разглядеть хоть чтонибудь из окон вагона из пассажиров были и возвращавшиеся §§§§§§§§§§§§§§§§§§§§§§§§§'

In [161]:
# Decode the corrupted input row 2 for comparison with the clean target row
# decoded from `t`.
nums2sentence(a[2].tolist())

'влевоот дорогижрудно было разглядеть хоть чтонибудь из окон вагна из пассатиров былиги возвращавшиеся§§§§§§§§§§§§§§§§§§§§§§§§§§'