<a href="https://colab.research.google.com/github/sckwokyboom/Cybersemiotics/blob/main/Reverse_and_sort_experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from ast import Module
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch import nn
from functools import partial
import torch.optim as optim
import torch.nn as nn
import os
import math
import numpy as np
try:
  import pytorch_lightning as pl
except ModuleNotFoundError:
  !pip install pytorch-lightning
  import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

Collecting pytorch-lightning
  Downloading pytorch_lightning-2.5.2-py3-none-any.whl.metadata (21 kB)
Collecting torchmetrics>=0.7.0 (from pytorch-lightning)
  Downloading torchmetrics-1.8.0-py3-none-any.whl.metadata (21 kB)
Collecting lightning-utilities>=0.10.0 (from pytorch-lightning)
  Downloading lightning_utilities-0.15.0-py3-none-any.whl.metadata (5.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.1.0->pytorch-lightning)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.1.0->pytorch-lightning)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.1.0->pytorch-lightning)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.1.0->pytorch-lightning)
  Downloadi

In [2]:
# Run on the GPU when PyTorch can see a CUDA device; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
class PositionalEncoding(nn.Module):
  """Adds fixed sinusoidal positional encodings to a batch of sequence features.

  Even feature columns hold sines and odd feature columns hold cosines of the
  position scaled by geometrically decreasing frequencies.
  """

  def __init__(self, d_model, max_len=5000):
    """
    Inputs
      d_model - Hidden dimensionality of the input (may be even or odd).
      max_len - Number of positions for which encodings are precomputed.
    """
    super().__init__()
    # Build a [max_len, d_model] table holding the encoding of every position.
    pe = torch.zeros(max_len, d_model)
    position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
    angles = position * div_term
    pe[:, 0::2] = torch.sin(angles)
    # With an odd d_model there is one odd column fewer than even columns,
    # so the cosine half only takes the first d_model // 2 frequencies.
    pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
    # Leading axis of size 1 so the table broadcasts over the batch dimension.
    pe = pe.unsqueeze(0)
    # register_buffer => a tensor that is not a parameter but still belongs
    # to the module state, so it follows the module across devices.
    # persistent=False keeps the buffer out of the state dict (e.g. when the
    # model is saved), since it can always be recomputed.
    self.register_buffer('pe', pe, persistent=False)

  def forward(self, x):
    """Return x plus the encodings of its first x.size(1) positions.

    The slice below indexes dimension 1 of x as the sequence axis.
    """
    x = x + self.pe[:, :x.size(1)]
    return x

In [4]:
def scaled_dot_product(q, k, v, mask=None):
  """Scaled dot-product attention.

  Inputs:
    q, k, v - Query, key and value tensors; the last dimension of q is the
              per-head feature size used for scaling, and the last two
              dimensions of k are transposed for the dot product.
    mask    - Optional tensor broadcastable to the attention logits; entries
              equal to 0 mark positions that must not be attended to.
  Returns:
    (values, attention) - the attention-weighted sum of v and the softmax
    attention weights over the last dimension.
  """
  d_k = q.size()[-1]
  attn_logits = torch.matmul(q, k.transpose(-2, -1))
  attn_logits = attn_logits / math.sqrt(d_k)
  if mask is not None:
    # -9e15 is not representable in float16, where masked_fill would fail;
    # clamp the fill value to the most negative finite value of the dtype.
    # For float32/float64 this still evaluates to -9e15.
    fill_value = max(-9e15, torch.finfo(attn_logits.dtype).min)
    attn_logits = attn_logits.masked_fill(mask == 0, fill_value)
  attention = F.softmax(attn_logits, dim=-1)
  values = torch.matmul(attention, v)
  return values, attention

In [5]:
class MultiheadAttention(nn.Module):
    """Multi-head self-attention using one fused linear projection for Q, K and V."""

    def __init__(self, input_dim, embed_dim, num_heads):
        """
        Inputs:
          input_dim - Size of the incoming features (last dimension of x).
          embed_dim - Total hidden size after the QKV projection, shared by all heads.
          num_heads - Number of attention heads; must divide embed_dim.
        """
        super().__init__()
        assert embed_dim % num_heads == 0, \
            "Embedding dimension must be divisible by number of heads."

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        # Feature size handled by each individual head.
        self.head_dim = embed_dim // num_heads

        # Fused projection: [B, T, input_dim] -> [B, T, 3 * embed_dim].
        self.qkv_proj = nn.Linear(input_dim, 3 * embed_dim, bias=True)
        # Output projection applied after the heads are concatenated.
        self.o_proj = nn.Linear(embed_dim, embed_dim, bias=True)

        self._reset_parameters()

    def _reset_parameters(self):
        """Xavier-uniform weights and zero biases for both projections."""
        for projection in (self.qkv_proj, self.o_proj):
            nn.init.xavier_uniform_(projection.weight)
            nn.init.zeros_(projection.bias)

    def forward(self, x, mask=None, return_attention=False):
        """
        x    : tensor of shape [B, T, input_dim]
        mask : optional tensor passed through to scaled_dot_product
               (documented upstream as [B, 1, 1, T] or [B, 1, T, T])
        Returns the projected output [B, T, embed_dim]; when return_attention
        is True, also returns the attention weights [B, H, T, T].
        """
        n_batch, n_tokens, _ = x.size()

        # Project once, then expose the head axis: [B, H, T, 3 * head_dim].
        fused = self.qkv_proj(x)
        fused = fused.reshape(n_batch, n_tokens, self.num_heads, 3 * self.head_dim).permute(0, 2, 1, 3)
        # Each head's slice holds its own Q, K and V back to back.
        q, k, v = fused.chunk(3, dim=-1)

        values, attention = scaled_dot_product(q, k, v, mask=mask)

        # Move heads next to features and flatten them: [B, T, H * head_dim].
        merged = values.permute(0, 2, 1, 3).reshape(n_batch, n_tokens, self.embed_dim)
        out = self.o_proj(merged)

        return (out, attention) if return_attention else out


In [6]:
class EncoderBlock(nn.Module):
    """One Transformer encoder layer: self-attention and an MLP, each followed by
    dropout, a residual addition and LayerNorm (normalisation after the residual)."""

    def __init__(self, input_dim, num_heads, dim_feedforward, dropout=0.0):
        """
        Inputs:
          input_dim       - Feature size of the input (and of the output).
          num_heads       - Number of heads in the multi-head attention.
          dim_feedforward - Hidden size of the MLP.
          dropout         - Dropout probability used inside the block.
        """
        super().__init__()

        # Self-attention that keeps the feature size unchanged.
        self.self_attn = MultiheadAttention(input_dim, input_dim, num_heads)

        # Two-layer feed-forward network; dropout sits before the activation.
        feed_forward_layers = [
            nn.Linear(input_dim, dim_feedforward),
            nn.Dropout(dropout),
            nn.ReLU(inplace=True),
            nn.Linear(dim_feedforward, input_dim),
        ]
        self.linear_net = nn.Sequential(*feed_forward_layers)

        # One LayerNorm per sub-layer plus a shared residual dropout.
        self.norm1 = nn.LayerNorm(input_dim)
        self.norm2 = nn.LayerNorm(input_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Apply attention then the MLP, each as residual + LayerNorm."""
        # Attention sub-layer.
        x = self.norm1(x + self.dropout(self.self_attn(x, mask=mask)))
        # Feed-forward sub-layer.
        x = self.norm2(x + self.dropout(self.linear_net(x)))
        return x


In [7]:
class TransformerEncoder(nn.Module):
    """A stack of EncoderBlock layers applied one after another."""

    def __init__(self, num_layers, **block_args):
        """
        Inputs:
          num_layers - Number of encoder blocks to stack.
          block_args - Keyword arguments forwarded unchanged to every EncoderBlock.
        """
        super().__init__()
        self.layers = nn.ModuleList([
            EncoderBlock(**block_args)
            for _ in range(num_layers)
        ])

    def forward(self, x, mask=None):
        """Run x through every layer in order, passing the same mask to each."""
        for layer in self.layers:
            x = layer(x, mask=mask)
        return x

    def get_attention_maps(self, x, mask=None):
        """Return a list with the attention weights of every layer for input x.

        Each layer's attention is computed on that layer's input, and the
        input is then advanced through the full layer.
        """
        attention_maps = []
        for layer in self.layers:
            _, attn_map = layer.self_attn(x, mask=mask, return_attention=True)
            attention_maps.append(attn_map)
            # Advance with the same mask as forward(); omitting it here would
            # make deeper layers' maps come from activations that differ from
            # the real forward pass.
            x = layer(x, mask=mask)
        return attention_maps

In [8]:
class CosineWarmupScheduler(optim.lr_scheduler._LRScheduler):
  """Cosine learning-rate decay with a linear warm-up.

  The multiplier applied to every base learning rate is
  0.5 * (1 + cos(pi * step / max_iters)), additionally scaled by
  step / warmup while step <= warmup.
  """

  def __init__(self, optimizer, warmup, max_iters):
    """
    Inputs:
      optimizer - Optimizer whose learning rates are scheduled.
      warmup    - Number of warm-up steps; 0 (or less) disables the warm-up.
      max_iters - Step count at which the cosine factor reaches zero.
    """
    # Set before super().__init__(): get_lr() reads these attributes, and the
    # base-class constructor is expected to evaluate it once already.
    self.warmup = warmup
    self.max_num_iters = max_iters
    super().__init__(optimizer)

  def get_lr(self):
    """Learning rate of every parameter group at the current step."""
    lr_factor = self.get_lr_factor(epoch=self.last_epoch)
    return [base_lr * lr_factor for base_lr in self.base_lrs]

  def get_lr_factor(self, epoch):
    """Multiplier (a plain Python float) for step number `epoch`."""
    # math instead of numpy keeps the factor, and therefore the optimizer's
    # lr, a built-in float rather than a NumPy scalar.
    lr_factor = 0.5 * (1 + math.cos(math.pi * epoch / self.max_num_iters))
    # Guard against warmup == 0, which would otherwise divide by zero on the
    # very first step.
    if self.warmup > 0 and epoch <= self.warmup:
      lr_factor *= epoch * 1.0 / self.warmup
    return lr_factor

In [9]:
class TransformerPredictor(pl.LightningModule):
  """Base Lightning module: input projection -> optional positional encoding
  -> Transformer encoder -> per-position classifier.

  The training/validation/test steps are left to subclasses.
  """

  def __init__(self, input_dim, model_dim, num_classes, num_heads, num_layers, lr, warmup, max_iters, dropout=0.0, input_dropout=0.0):
    """
    Inputs:
      input_dim - Feature size of the input data.
      model_dim - Hidden size used inside the Transformer.
      num_classes - Number of classes predicted per sequence element.
      num_heads - Number of heads in the multi-head attention blocks.
      num_layers - Number of encoder blocks.
      lr - Learning rate of the optimizer.
      warmup - Number of warm-up steps. Usually between 50 and 500.
      max_iters - Maximum number of training iterations; required by the CosineWarmup scheduler.
      dropout - Dropout applied inside the model.
      input_dropout - Dropout applied to the input features.
    """
    super().__init__()
    # Makes the constructor arguments available as self.hparams.<name>.
    self.save_hyperparameters()
    self._create_model()

  def _create_model(self):
    """Build the sub-networks from the stored hyperparameters."""
    # Input projection: input_dim -> model_dim.
    self.input_net = nn.Sequential(
      nn.Dropout(self.hparams.input_dropout),
      nn.Linear(self.hparams.input_dim, self.hparams.model_dim)
    )
    # Positional encoding.
    self.positional_encoding = PositionalEncoding(d_model=self.hparams.model_dim)
    # Transformer encoder; the MLP hidden size is twice the model size.
    self.transformer = TransformerEncoder(num_layers=self.hparams.num_layers, input_dim=self.hparams.model_dim, dim_feedforward=2*self.hparams.model_dim, num_heads=self.hparams.num_heads, dropout=self.hparams.dropout)
    # Output classifier applied to every sequence element.
    self.output_net = nn.Sequential(
      nn.Linear(self.hparams.model_dim, self.hparams.model_dim),
      nn.LayerNorm(self.hparams.model_dim),
      nn.ReLU(inplace=True),
      nn.Dropout(self.hparams.dropout),
      nn.Linear(self.hparams.model_dim, self.hparams.num_classes)
    )

  def forward(self, x, mask=None, add_positional_encoding=True):
    """
    Inputs:
      x - Input features of shape [Batch, SeqLen, input_dim].
      mask - Mask applied to the attention outputs (optional).
      add_positional_encoding - If True, positional encodings are added to the input. May be undesirable for some tasks.
    Returns the output of the classifier head for every sequence element.
    """
    x = self.input_net(x)
    if add_positional_encoding:
      x = self.positional_encoding(x)
    # The encoder and the classifier must run regardless of the flag above;
    # only the positional encoding is optional.
    x = self.transformer(x, mask=mask)
    x = self.output_net(x)
    return x

  @torch.no_grad()
  def get_attention_maps(self, x, mask=None, add_positional_encoding=True):
    """
    Extract the attention matrices of the whole Transformer for one batch.
    Input arguments are the same as for the forward pass.
    """
    x = self.input_net(x)
    if add_positional_encoding:
      x = self.positional_encoding(x)
    # Computed outside the `if` so the maps are also available (and the name
    # is always bound) when positional encodings are disabled.
    attention_maps = self.transformer.get_attention_maps(x, mask=mask)
    return attention_maps

  def configure_optimizers(self):
    """Adam plus a cosine-warm-up schedule that is stepped every training step."""
    optimizer = optim.Adam(self.parameters(), lr=self.hparams.lr)
    # The scheduler is applied per step rather than per epoch.
    lr_scheduler = CosineWarmupScheduler(optimizer, warmup=self.hparams.warmup, max_iters=self.hparams.max_iters)
    return [optimizer], [{'scheduler': lr_scheduler, 'interval': 'step'}]

  def training_step(self, batch, batch_idx):
    """Task-specific; must be provided by a subclass."""
    raise NotImplementedError

  def validation_step(self, batch, batch_idx):
    """Task-specific; must be provided by a subclass."""
    raise NotImplementedError

  def test_step(self, batch, batch_idx):
    """Task-specific; must be provided by a subclass."""
    raise NotImplementedError

In [10]:
class ReversePredictor(TransformerPredictor):
  """TransformerPredictor trained to output, per position, the token of the target sequence."""

  def _calculate_loss(self, batch, mode="train"):
    """Compute cross-entropy loss and element-wise accuracy for one batch.

    The batch is an (inputs, labels) pair of integer tensors. Both metrics
    are logged as "<mode>_loss" / "<mode>_acc" and returned as (loss, acc).
    """
    tokens, targets = batch
    # Integer categories -> one-hot float vectors.
    features = F.one_hot(tokens, num_classes=self.hparams.num_classes).float()
    logits = self.forward(features, add_positional_encoding=True)
    # Flatten batch and sequence axes so every element is one classification.
    flat_logits = logits.view(-1, logits.size(-1))
    loss = F.cross_entropy(flat_logits, targets.view(-1))
    hits = logits.argmax(dim=-1) == targets
    acc = hits.float().mean()
    # Logging.
    self.log(f"{mode}_loss", loss)
    self.log(f"{mode}_acc", acc)
    return loss, acc

  def training_step(self, batch, batch_idx):
    """Return the training loss; the accuracy is only logged."""
    return self._calculate_loss(batch, mode="train")[0]

  def validation_step(self, batch, batch_idx):
    """Log validation metrics; nothing is returned."""
    self._calculate_loss(batch, mode="val")

  def test_step(self, batch, batch_idx):
    """Log test metrics; nothing is returned."""
    self._calculate_loss(batch, mode="test")

In [11]:
class ReverseDataset(Dataset):
  """Random integer sequences paired with the same sequences reversed."""

  def __init__(self, num_categories, seq_len, size):
    """
    Inputs:
      num_categories - Exclusive upper bound of the sampled integers.
      seq_len        - Length of every sequence.
      size           - Number of sequences in the dataset.
    """
    super().__init__()
    self.num_categories, self.seq_len, self.size = num_categories, seq_len, size
    # All samples are drawn once up front as a [size, seq_len] tensor.
    self.data = torch.randint(num_categories, size=(size, seq_len))

  def __len__(self):
    return self.size

  def __getitem__(self, idx):
    """Return (sequence, sequence flipped along its first axis)."""
    sequence = self.data[idx]
    return sequence, sequence.flip(0)

In [15]:
# Dataset factory for the reverse task: 10 categories, sequences of length 16;
# only the number of samples differs between the splits.
dataset = partial(ReverseDataset, 10, 16)
train_loader = DataLoader(
    dataset(50000),
    batch_size=128,
    shuffle=True,
    drop_last=True,
    pin_memory=True,
)
val_loader = DataLoader(dataset(1000), batch_size=128)
test_loader = DataLoader(dataset(10000), batch_size=128)

# Show the last training pair as a quick sanity check.
inp_data, labels = train_loader.dataset[-1]
print("Вход:", inp_data)
print("Метки: ", labels)

Вход: tensor([7, 5, 0, 1, 0, 4, 0, 4, 9, 8, 6, 4, 2, 0, 7, 4])
Метки:  tensor([4, 7, 0, 2, 4, 6, 8, 9, 4, 0, 4, 0, 1, 0, 5, 7])


In [16]:
# Tiny standalone dataset to inspect a single (input, reversed) pair.
# Note: this rebinds `dataset`, which previously held the partial factory.
dataset = ReverseDataset(5, 6, 10)
print("Размер датасета:", len(dataset))
x, y = dataset[-1]  # last of the 10 samples
print("Вход: ", x.tolist())
print("Метка (разворот):", y.tolist())

Размер датасета: 10
Вход:  [3, 1, 4, 0, 4, 1]
Метка (разворот): [1, 4, 0, 4, 1, 3]


In [17]:
def train_reverse(**kwargs):
    """Train a ReversePredictor and report its validation and test metrics.

    Uses the notebook-level `train_loader`, `val_loader`, `test_loader` and
    `device`. All keyword arguments are forwarded to ReversePredictor;
    `max_iters` is derived from the epoch count and the loader length.

    Returns:
      (model, result) - the trained model moved to `device`, and a dict with
      the keys "val_loss", "val_acc", "test_loss" and "test_acc".
    """
    checkpoint_dir = os.path.join("/content/Checkpoint", "ReverseTask")
    os.makedirs(checkpoint_dir, exist_ok=True)

    # Keep the weights of the checkpoint with the highest validation accuracy.
    best_val_acc = ModelCheckpoint(save_weights_only=True, mode="max", monitor="val_acc")
    use_gpu = str(device).startswith("cuda")
    trainer = pl.Trainer(
        default_root_dir=checkpoint_dir,
        callbacks=[best_val_acc],
        accelerator="gpu" if use_gpu else "cpu",
        devices=1,
        max_epochs=10,
        gradient_clip_val=5,
    )
    # NOTE(review): presumably switches off the logger's default hp_metric entry - confirm.
    trainer.logger._default_hp_metric = None

    # The scheduler is stepped every batch, so its horizon is epochs * batches.
    total_steps = trainer.max_epochs * len(train_loader)
    model = ReversePredictor(max_iters=total_steps, **kwargs)
    trainer.fit(model, train_loader, val_loader)

    # Evaluate the in-memory model on the validation and then the test split.
    val_metrics = trainer.validate(model, val_loader, verbose=False)[0]
    test_metrics = trainer.test(model, test_loader, verbose=False)[0]

    result = {name: val_metrics[name] for name in ("val_loss", "val_acc")}
    result.update({name: test_metrics[name] for name in ("test_loss", "test_acc")})

    return model.to(device), result


In [18]:
# Train a small single-layer, single-head model on the reverse task.
reverse_model, reverse_result = train_reverse(
    # One-hot inputs and predicted classes both span the dataset's categories.
    input_dim=train_loader.dataset.num_categories,
    num_classes=train_loader.dataset.num_categories,
    # Architecture.
    model_dim=32,
    num_layers=1,
    num_heads=1,
    dropout=0.0,
    # Optimisation.
    lr=5e-4,
    warmup=50,
)
print(reverse_result)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name                | Type               | Params | Mode 
-------------------------------------------------------------------
0 | input_net           | Sequential         | 352    | train
1 | positional_encoding | PositionalEncoding | 0      | train
2 | transformer         | TransformerEncoder | 8.5 K  | train
3 | output_net          | Sequential         | 1.4 K  | train
-------------------------------------------------------------------
10.3 K    Trainable params
0         Non-trainable params
10.3 K    Total params
0.041     Total estimated model params size (MB)
24        Modules in train mode
0         Module

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

{'val_loss': 0.0016588533762842417, 'val_acc': 1.0, 'test_loss': 0.0016902446513995528, 'test_acc': 1.0}


In [19]:
# Try the trained reverse model on one freshly sampled sequence.
seq_len = 16
num_classes = 10  # same as during training
input_seq = torch.randint(low=0, high=num_classes, size=(seq_len,))  # [16]
# One-hot encode, add a leading batch axis and move to the compute device.
one_hot_input = F.one_hot(input_seq, num_classes=num_classes).float()[None].to(device)

reverse_model.eval()
with torch.no_grad():
    logits = reverse_model(one_hot_input)
# Most likely class per position, with the batch axis removed again.
prediction = torch.argmax(logits, dim=-1).squeeze()

print("Вход:      ", input_seq.tolist())
print("Ожидаемо:  ", torch.flip(input_seq, dims=(0,)).tolist())
print("Предсказан:", prediction.tolist())

Вход:       [4, 3, 9, 6, 1, 6, 3, 0, 1, 6, 6, 3, 2, 1, 7, 1]
Ожидаемо:   [1, 7, 1, 2, 3, 6, 6, 1, 0, 3, 6, 1, 6, 9, 3, 4]
Предсказан: [1, 7, 1, 2, 3, 6, 6, 1, 0, 3, 6, 1, 6, 9, 3, 4]


In [20]:
class SortPredictor(TransformerPredictor):
  """TransformerPredictor variant whose steps score predictions against the batch labels."""

  def _calculate_loss(self, batch, mode="train"):
    """Loss and accuracy for a batch of (inputs, labels) integer tensors.

    Logs "<mode>_loss" and "<mode>_acc" and returns the pair (loss, acc).
    """
    raw_inputs, expected = batch
    # Turn category indices into one-hot float vectors.
    encoded = F.one_hot(raw_inputs, num_classes=self.hparams.num_classes).float()
    scores = self.forward(encoded, add_positional_encoding=True)
    n_outputs = scores.size(-1)
    # Every sequence element counts as an independent classification.
    loss = F.cross_entropy(scores.view(-1, n_outputs), expected.view(-1))
    predicted = scores.argmax(dim=-1)
    acc = (predicted == expected).float().mean()
    # Logging.
    self.log(f"{mode}_loss", loss)
    self.log(f"{mode}_acc", acc)
    return loss, acc

  def training_step(self, batch, batch_idx):
    """Return only the loss for optimisation."""
    train_loss, _unused_acc = self._calculate_loss(batch, mode="train")
    return train_loss

  def validation_step(self, batch, batch_idx):
    """Metrics are logged as a side effect; the return value is None."""
    self._calculate_loss(batch, mode="val")

  def test_step(self, batch, batch_idx):
    """Metrics are logged as a side effect; the return value is None."""
    self._calculate_loss(batch, mode="test")

In [21]:
class SortDataset(Dataset):
  """Random integer sequences paired with their ascending-sorted versions."""

  def __init__(self, num_categories, seq_len, size):
    """
    Inputs:
      num_categories - Exclusive upper bound of the sampled integers.
      seq_len        - Length of every sequence.
      size           - Number of sequences in the dataset.
    """
    super().__init__()
    self.num_categories, self.seq_len, self.size = num_categories, seq_len, size
    # The whole dataset is sampled once as a [size, seq_len] tensor.
    self.data = torch.randint(num_categories, size=(size, seq_len))

  def __len__(self):
    return self.size

  def __getitem__(self, idx):
    """Return (sequence, sequence sorted along its last axis)."""
    sequence = self.data[idx]
    return sequence, sequence.sort().values

In [24]:
# Dataset factory for the sort task: 10 categories, sequences of length 16;
# only the number of samples differs between the splits.
dataset = partial(SortDataset, 10, 16)
train_loader = DataLoader(dataset(50000), batch_size=128, shuffle=True, drop_last=True, pin_memory=True)
val_loader = DataLoader(dataset(1000), batch_size=128)
test_loader = DataLoader(dataset(10000), batch_size=128)

# Show the last training pair as a quick sanity check.
inp_data, labels = train_loader.dataset[49999]
print("Вход:", inp_data)
print("Метки: ", labels)

# Tiny standalone dataset to inspect a single (input, sorted) pair.
# Note: this rebinds `dataset`, which previously held the partial factory.
dataset = SortDataset(num_categories=5, seq_len=6, size=10)
print("Размер датасета:", len(dataset))
x, y = dataset[9]
print("Вход: ", x.tolist())
# The label is the sorted sequence, so it is announced as "сортировка"
# (sorting) rather than the "разворот" (reversal) text of the reverse task.
print("Метка (сортировка):", y.tolist())

Вход: tensor([6, 8, 0, 6, 6, 1, 5, 4, 7, 4, 9, 0, 3, 8, 3, 9])
Метки:  tensor([0, 0, 1, 3, 3, 4, 4, 5, 6, 6, 6, 7, 8, 8, 9, 9])
Размер датасета: 10
Вход:  [3, 2, 0, 0, 0, 2]
Метка (разворот): [0, 0, 0, 2, 2, 3]


In [25]:
def train_sort(**kwargs):
    """Train a SortPredictor and collect its validation and test metrics.

    Relies on the notebook-level `train_loader`, `val_loader`, `test_loader`
    and `device`. Keyword arguments go straight to SortPredictor, with
    `max_iters` computed here from the epoch count and the loader length.

    Returns:
      (model, result) - the trained model moved to `device`, and a dict with
      the keys "val_loss", "val_acc", "test_loss" and "test_acc".
    """
    root_dir = os.path.join("/content/Checkpoint", "SortTask")
    os.makedirs(root_dir, exist_ok=True)

    on_cuda = str(device).startswith("cuda")
    trainer_options = dict(
        default_root_dir=root_dir,
        # Checkpoint (weights only) the epoch with the best validation accuracy.
        callbacks=[ModelCheckpoint(save_weights_only=True, mode="max", monitor="val_acc")],
        accelerator="gpu" if on_cuda else "cpu",
        devices=1,
        max_epochs=10,
        gradient_clip_val=5,
    )
    trainer = pl.Trainer(**trainer_options)
    # NOTE(review): presumably switches off the logger's default hp_metric entry - confirm.
    trainer.logger._default_hp_metric = None

    # One scheduler step per batch over all epochs.
    model = SortPredictor(max_iters=trainer.max_epochs * len(train_loader), **kwargs)
    trainer.fit(model, train_loader, val_loader)

    # Validation first, then test, both on the in-memory model.
    val_metrics = trainer.validate(model, val_loader, verbose=False)[0]
    test_metrics = trainer.test(model, test_loader, verbose=False)[0]

    result = dict(
        val_loss=val_metrics["val_loss"],
        val_acc=val_metrics["val_acc"],
        test_loss=test_metrics["test_loss"],
        test_acc=test_metrics["test_acc"],
    )

    model = model.to(device)
    return model, result


In [26]:
# Train a small single-layer, single-head model on the sort task.
sort_model, sort_result = train_sort(
    # One-hot inputs and predicted classes both span the dataset's categories.
    input_dim=train_loader.dataset.num_categories,
    num_classes=train_loader.dataset.num_categories,
    # Architecture.
    model_dim=32,
    num_layers=1,
    num_heads=1,
    dropout=0.0,
    # Optimisation.
    lr=5e-4,
    warmup=50,
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name                | Type               | Params | Mode 
-------------------------------------------------------------------
0 | input_net           | Sequential         | 352    | train
1 | positional_encoding | PositionalEncoding | 0      | train
2 | transformer         | TransformerEncoder | 8.5 K  | train
3 | output_net          | Sequential         | 1.4 K  | train
-------------------------------------------------------------------
10.3 K    Trainable params
0         Non-trainable params
10.3 K    Total params
0.041     Total estimated model params size (MB)
24        Modules in train mode
0         Module

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

In [27]:
# Show the validation/test metrics dict returned by train_sort.
print(sort_result)

{'val_loss': 0.012117278762161732, 'val_acc': 0.9991875290870667, 'test_loss': 0.012280023656785488, 'test_acc': 0.9988874793052673}


In [28]:
# Try the trained sort model on one freshly sampled sequence.
seq_len = 16
num_classes = 10  # same as during training
input_seq = torch.randint(low=0, high=num_classes, size=(seq_len,))  # [16]
# One-hot encode, add a leading batch axis and move to the compute device.
one_hot_input = F.one_hot(input_seq, num_classes=num_classes).float()[None].to(device)

sort_model.eval()
with torch.no_grad():
    logits = sort_model(one_hot_input)
# Most likely class per position, with the batch axis removed again.
prediction = torch.argmax(logits, dim=-1).squeeze()

print("Вход:      ", input_seq.tolist())
print("Ожидаемо:  ", torch.sort(input_seq)[0].tolist())
print("Предсказан:", prediction.tolist())

Вход:       [2, 7, 3, 5, 8, 5, 9, 6, 3, 8, 4, 0, 3, 3, 7, 9]
Ожидаемо:   [0, 2, 3, 3, 3, 3, 4, 5, 5, 6, 7, 7, 8, 8, 9, 9]
Предсказан: [0, 2, 3, 3, 3, 3, 4, 5, 5, 6, 7, 7, 8, 8, 9, 9]
