In [1]:
%pip install sacrebleu sentencepiece torch datasets==3.6.0 scipy tqdm numpy tensorboard optuna

[0mNote: you may need to restart the kernel to use updated packages.


In [1]:
# imports

from __future__ import annotations

import json
import math
import random
import time
from datetime import timedelta
from pathlib import Path
from typing import List, Tuple

import numpy as np
import sacrebleu
import sentencepiece as spm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as tud
from torch.optim.lr_scheduler import StepLR
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.tensorboard import SummaryWriter
from datasets import load_dataset
from scipy import stats
from tqdm.auto import tqdm
import optuna

In [2]:
# Model definitions

class LuongAttention(nn.Module):
  def __init__(self, hidden_size: int):
    super().__init__()
    self.scale = 1.0 / math.sqrt(hidden_size)

  def forward(self, query, keys, values, mask=None):
    # query: (B, 1, H); keys: (B, T, H)
    scores = torch.bmm(query, keys.transpose(1, 2)) * self.scale  # (B,1,T)
    if mask is not None:
      scores = scores.masked_fill_(~mask[:, None, :], -1e9)
    attn = torch.softmax(scores, dim=-1)  # (B,1,T)
    context = torch.bmm(attn, values)  # (B,1,H)
    return context, attn.squeeze(1)

class BiLSTMTranslator(nn.Module):
    """
    2-layer bidirectional LSTM encoder + 2-layer unidirectional LSTM decoder
    with Luong global attention.

    The final forward & backward encoder states are concatenated, then
    replicated across decoder layers so the initial (h_0, c_0) have shape
    (num_layers, batch, hidden_size), as required by nn.LSTM.
    """
    def __init__(
        self,
        # These arguments will be supplied by Optuna. Values here are placeholders
        vocab_size: int,
        emb_size: int = 512,
        hidden_size: int = 512,
        num_layers: int = 2,
        dropout: float = 0.1,
        **kwargs: dict,
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=0)
        self.encoder = nn.LSTM(
            input_size=emb_size,
            hidden_size=hidden_size // 2,
            num_layers=num_layers,
            dropout=dropout,
            bidirectional=True,
            batch_first=True,
        )

        self.decoder = nn.LSTM(
            input_size=emb_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
            batch_first=True,
        )

        self.attn = LuongAttention(hidden_size)
        self.out = nn.Linear(hidden_size * 2, vocab_size)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src, src_lens, tgt):
        # encoder
        emb_src = self.dropout(self.embedding(src))
        packed_src = nn.utils.rnn.pack_padded_sequence(
            emb_src, src_lens.cpu(), batch_first=True, enforce_sorted=False
        )
        enc_out, (h_enc, c_enc) = self.encoder(packed_src)
        enc_out, _ = nn.utils.rnn.pad_packed_sequence(enc_out, batch_first=True)
        # h_enc & c_enc: (num_layers*2, batch, hidden_size//2)
    
        # Concatenate last forward & backward states -> (batch, hidden_size)
        h_final = torch.cat([h_enc[-2], h_enc[-1]], dim=-1)
        c_final = torch.cat([c_enc[-2], c_enc[-1]], dim=-1)
    
        # Expand to match decoder layers: (num_layers, batch, hidden_size)
        num_dec_layers = self.decoder.num_layers
        h0 = h_final.unsqueeze(0).repeat(num_dec_layers, 1, 1)
        c0 = c_final.unsqueeze(0).repeat(num_dec_layers, 1, 1)
    
        # decoder
        emb_tgt = self.dropout(self.embedding(tgt))
        dec_out, _ = self.decoder(emb_tgt, (h0, c0)) # (B, T, H)
    
        # attention
        context, _ = self.attn(dec_out, enc_out, enc_out) # (B, T, H)
        concat = torch.cat([dec_out, context], dim=-1) # (B, T, 2H)
        logits = self.out(concat) # (B, T, V)
        return logits


class PositionalEncoding(nn.Module):
    """
    Implements sinusoidal positional encoding as described in "Attention Is All You Need".
    """
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(dropout)

        # Create constant "pe" matrix with values dependent on
        # pos and i
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float)
            * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # shape (1, max_len, d_model)

        # Register as buffer so it's saved/loaded but not trained
        self.register_buffer("pe", pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Tensor, shape (batch_size, seq_len, d_model)
        """
        x = x + self.pe[:, : x.size(1)]
        return self.dropout(x)


class TransformerTranslator(nn.Module):
    def __init__(
        self,
        # These arguments will be supplied by Optuna. Values here are placeholders
        vocab_size: int,
        d_model: int = 256,
        nhead: int = 8,
        num_layers: int = 4,
        dropout: float = 0.1,
        max_len: int = 5000,
        **kwargs
    ):
        super().__init__()
        self.d_model = d_model

        # Token embedding + positional encoding
        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=0)
        self.pos_enc = PositionalEncoding(d_model, dropout, max_len)

        # Encoder and decoder stacks
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=d_model * 4,
            dropout=dropout,
            batch_first=True,
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        decoder_layer = nn.TransformerDecoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=d_model * 4,
            dropout=dropout,
            batch_first=True,
        )
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)

        # Final linear projection
        self.out = nn.Linear(d_model, vocab_size)

    def forward(
        self,
        src: torch.Tensor,
        src_lens,
        tgt: torch.Tensor,
    ) -> torch.Tensor:
        """
        Args:
            src: (batch_size, src_seq_len)
            src_lens: (unused here, but kept for compatibility)
            tgt: (batch_size, tgt_seq_len)
        Returns:
            logits: (batch_size, tgt_seq_len, vocab_size)
        """
        # Padding masks
        src_key_padding_mask = src == 0  # True at padding positions
        tgt_key_padding_mask = tgt == 0

        # Embedding + scaling + positional encoding
        emb_src = self.embedding(src) * math.sqrt(self.d_model)
        emb_src = self.pos_enc(emb_src)

        emb_tgt = self.embedding(tgt) * math.sqrt(self.d_model)
        emb_tgt = self.pos_enc(emb_tgt)

        # Encoder forward
        memory = self.encoder(
            emb_src,
            src_key_padding_mask=src_key_padding_mask,
        )

        # Create causal mask for decoder (prevent attending to future tokens)
        tgt_seq_len = tgt.size(1)
        causal_mask = nn.Transformer.generate_square_subsequent_mask(tgt_seq_len).to(
            src.device
        )

        causal_mask = causal_mask.to(torch.bool)
        output = self.decoder(
            emb_tgt,
            memory,
            tgt_mask=causal_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
            memory_key_padding_mask=src_key_padding_mask,
        )

        # Project to vocabulary
        logits = self.out(output)
        return logits


In [None]:
# Data loading & utilities

def set_seed(seed: int) -> None:
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False


class Timer:
  """Context manager that measures (wall-clock) seconds."""

  def __enter__(self):
    self.start = time.time()
    return self

  def __exit__(self, exc_type, exc_val, exc_tb):
    self.end = time.time()
    self.elapsed = self.end - self.start


BOS, EOS, PAD, UNK = "<s>", "</s>", "<pad>", "<unk>"


def download_iwslt17_de_en(data_dir: Path) -> Tuple[Path, Path, Path]:
  dataset = load_dataset("iwslt2017", "iwslt2017-de-en")
  splits = {}
  for split in ("train", "validation", "test"):
    lines = [
      f"{ex['translation']['de']}\t{ex['translation']['en']}"
      for ex in dataset[split]
    ]
    out_path = data_dir / f"{split}.tsv"
    out_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
    splits[split] = out_path
  return splits["train"], splits["validation"], splits["test"]


def train_sentencepiece(input_paths: List[Path], model_prefix: str, vocab_size: int = 8000) -> Path:
  input_text = "\n".join([p.read_text(encoding="utf-8") for p in input_paths])
  tmp = Path(f"{model_prefix}_corpus.txt")
  tmp.write_text(input_text, encoding="utf-8")
  spm.SentencePieceTrainer.train(
    input=str(tmp), model_prefix=model_prefix, vocab_size=vocab_size,
    character_coverage=1.0, model_type="bpe",
    pad_id=0, unk_id=1, bos_id=2, eos_id=3, user_defined_symbols=""  # PAD, UNK, BOS, EOS
  )
  tmp.unlink()  # cleanup
  return Path(f"{model_prefix}.model")


def encode_file(sp: spm.SentencePieceProcessor, in_path: Path, out_path: Path) -> None:
  with in_path.open("r", encoding="utf-8") as fi, out_path.open("w", encoding="utf-8") as fo:
    for line in fi:
      src, tgt = line.rstrip().split("\t")
      pieces_src = sp.encode(src, out_type=str)
      pieces_tgt = sp.encode(tgt, out_type=str)
      fo.write(" ".join(pieces_src) + "\t" + " ".join(pieces_tgt) + "\n")


class ParallelDataset(tud.Dataset):
    def __init__(self, path: Path, sp: spm.SentencePieceProcessor, max_len: int = 100):
        self.samples = []
        BOS_ID, EOS_ID = sp.bos_id(), sp.eos_id()

        with path.open("r", encoding="utf-8") as fh:
            for ln in fh:
                if "\t" not in ln:
                    continue
                src_txt, tgt_txt = ln.rstrip().split("\t", maxsplit=1)

                # Tokens already split, just convert to IDs directly
                src_ids = [BOS_ID] + sp.piece_to_id(src_txt.split()) + [EOS_ID]
                tgt_ids = [BOS_ID] + sp.piece_to_id(tgt_txt.split()) + [EOS_ID]

                if len(src_ids) <= max_len and len(tgt_ids) <= max_len:
                    self.samples.append(
                        (torch.LongTensor(src_ids), torch.LongTensor(tgt_ids))
                    )

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]


def collate_fn(batch):
  srcs, tgts = zip(*batch)
  src_lens = [len(x) for x in srcs]
  tgt_lens = [len(x) for x in tgts]
  max_src, max_tgt = max(src_lens), max(tgt_lens)
  src_pad = torch.zeros(len(batch), max_src, dtype=torch.long)
  tgt_pad = torch.zeros(len(batch), max_tgt, dtype=torch.long)
  for i, (src, tgt) in enumerate(zip(srcs, tgts)):
    src_pad[i, : len(src)] = src
    tgt_pad[i, : len(tgt)] = tgt
  return src_pad, torch.tensor(src_lens), tgt_pad, torch.tensor(tgt_lens)


def get_noam_scheduler(optimizer, d_model, warmup_steps, lr_scale=1.0):
    def lr_lambda(step):
        t = step + 1  
        scale = d_model ** -0.5
        return lr_scale * scale * min(t**-0.5, t * warmup_steps**-1.5)
    return LambdaLR(optimizer, lr_lambda)

In [4]:
# Training & Evaluation

def label_smoothing_loss(logits, targets, pad_idx: int = 0, smoothing: float = 0.1):
  """
  Cross-entropy with uniform label smoothing.
  Args
      logits   : (B, T, V) - raw scores from the model
      targets  : (B, T)    - ground-truth token IDs
  """
  vocab = logits.size(-1)

  
  logits_flat  = logits.contiguous().view(-1, vocab)   # (B*T, V)
  targets_flat = targets.contiguous().view(-1)         # (B*T)

  # Standard CE per token
  nll = torch.nn.functional.cross_entropy(
    logits_flat,
    targets_flat,
    ignore_index=pad_idx,
    reduction="none",
  )

  # Apply smoothing
  loss = (1.0 - smoothing) * nll + smoothing / vocab

  # Remove padding positions
  loss = loss[targets_flat != pad_idx]

  return loss.mean()


def train_epoch(model, iterator, optimizer, device, scheduler, clip_norm=1.0):
  model.train()
  total_loss = 0.0

  for src, src_lens, tgt, _ in tqdm(
      iterator, desc="Train batches", leave=False
  ):
    src, src_lens = src.to(device), src_lens.to(device)
    tgt_inp, tgt_out = tgt[:, :-1].to(device), tgt[:, 1:].to(device)

    optimizer.zero_grad()
    logits = model(src, src_lens, tgt_inp)
    loss = label_smoothing_loss(logits, tgt_out)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip_norm)
    optimizer.step()
    scheduler.step()

    total_loss += loss.item() * src.size(0)

  return total_loss / len(iterator.dataset)


def greedy_translate_ids(model, sp, src, src_lens, device, max_len: int = 60):
  """
  src      : (B, S) padded batch on CPU or GPU
  src_lens : (B,)   true lengths
  returns  : List[List[int]] – token-id sequences (B of them)
  """
  BOS, EOS = sp.bos_id(), sp.eos_id()
  model.eval()

  with torch.no_grad():
    src, src_lens = src.to(device), src_lens.to(device)
    B = src.size(0)

    tgt = torch.full((B, 1), BOS, dtype=torch.long, device=device)
    finished = torch.zeros(B, dtype=torch.bool, device=device)

    for _ in range(max_len):
      logits   = model(src, src_lens, tgt) # (B, T, V)
      next_tok = logits[:, -1].argmax(-1, keepdim=True)

      tgt = torch.cat([tgt, next_tok], dim=1)
      finished |= (next_tok.squeeze(1) == EOS)
      if finished.all():
        break

    # strip BOS/EOS and move to Python lists
    out = []
    for row in tgt.tolist():
      ids = row[1:]
      if EOS in ids:
        ids = ids[: ids.index(EOS)]
      out.append(ids)
    return out

def beam_translate_ids(model, sp, src, src_lens, device, max_len: int = 60, beam_width: int = 4):
    """
    Batched and efficient beam search implementation (without separate encode/decode methods).
    """
    BOS, EOS = sp.bos_id(), sp.eos_id()
    model.eval()

    with torch.no_grad():
        src, src_lens = src.to(device), src_lens.to(device)
        B = src.size(0)

        # Repeat source inputs for beam search
        src = src.repeat_interleave(beam_width, dim=0)
        src_lens = src_lens.repeat_interleave(beam_width, dim=0)

        # Initialize target tokens with BOS
        tgt = torch.full((B * beam_width, 1), BOS, dtype=torch.long, device=device)
        beam_scores = torch.zeros(B, beam_width, device=device)
        beam_scores[:, 1:] = -1e9  # Initially deactivate all beams except first
        beam_scores = beam_scores.view(-1)

        finished = torch.zeros(B * beam_width, dtype=torch.bool, device=device)

        for _ in range(max_len):
            logits = model(src, src_lens, tgt)  # (B*beam_width, T, V)
            log_probs = F.log_softmax(logits[:, -1, :], dim=-1)  # (B*beam_width, V)

            scores = beam_scores.unsqueeze(1) + log_probs  # (B*beam_width, V)
            scores = scores.view(B, -1)  # (B, beam_width*V)

            top_scores, top_ids = scores.topk(beam_width, dim=-1)  # (B, beam_width)

            beam_indices = top_ids // log_probs.size(-1)
            token_indices = top_ids % log_probs.size(-1)

            # Reorder beams
            tgt = tgt.view(B, beam_width, -1)
            next_tgt = []
            for batch_idx in range(B):
                next_tgt.append(tgt[batch_idx, beam_indices[batch_idx]])
            tgt = torch.stack(next_tgt, dim=0).view(B * beam_width, -1)

            # Append tokens
            tgt = torch.cat([tgt, token_indices.view(-1, 1)], dim=-1)

            beam_scores = top_scores.view(-1)

            # Check EOS
            finished |= (token_indices.view(-1) == EOS)
            if finished.view(B, beam_width).all(dim=1).all():
                break

        # Choose best beams
        tgt = tgt.view(B, beam_width, -1)
        best_seqs = tgt[torch.arange(B), beam_scores.view(B, beam_width).argmax(dim=-1)]

        out = []
        for seq in best_seqs.tolist():
            if EOS in seq:
                seq = seq[1:seq.index(EOS)]
            else:
                seq = seq[1:]
            out.append(seq)

        return out


def evaluate(model, data_iter, sp, device):
  hyps, refs = [], []
  for src, src_lens, tgt, tgt_lens in tqdm(data_iter, desc="Evaluate", leave=False):
    # batched generation
    pred_ids = beam_translate_ids(model, sp, src, src_lens, device)

    # batched decoding
    hyps.extend([sp.decode(ids) for ids in pred_ids])

    # strip BOS/EOS then batch-decode
    ref_ids = [ t[1:l-1].tolist() for t, l in zip(tgt, tgt_lens) ]
    refs.extend([sp.decode(ids) for ids in ref_ids])


  assert len(hyps) == len(refs), "Mismatch between #hypotheses and #references!"

  bleu = sacrebleu.corpus_bleu(hyps, [refs])
  chrf = sacrebleu.corpus_chrf(hyps, [refs])
  return bleu.score, chrf.score



In [5]:
sizes = [10_000, 50_000, 75_000, 100_000, 150_000, 200_000]
trials_per_size = {10_000: 15, 50_000: 20, 75_000: 20, 100_000: 25, 150_000: 25, 200_000: 25}
epochs_per_size = {10_000: 4, 50_000: 5, 75_000: 5, 100_000: 8, 150_000: 10, 200_000: 10}

current_size = None

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
data_dir = Path("data")
data_dir.mkdir(parents=True, exist_ok=True)

tune_dir = Path("tune")
tune_dir.mkdir(parents=True, exist_ok=True)

train_dir = Path("train")
train_dir.mkdir(parents=True, exist_ok=True)

log_dir = Path("logs")
log_dir.mkdir(parents=True, exist_ok=True)

print("Downloading IWSLT-2017...")
train_raw, valid_raw, test_raw = download_iwslt17_de_en(data_dir)

print("Training SentencePiece model...")
spm_path = train_sentencepiece([train_raw], str(data_dir / "bpe8k"), vocab_size=8000)
sp = spm.SentencePieceProcessor(model_file=str(spm_path))

# Pre-encode full corpus once to speed up later sampling
print("Encoding full corpus... (this may take a minute)")
encoded_train = data_dir / "train.bpe.tsv"
encode_file(sp, train_raw, encoded_train)
encode_file(sp, valid_raw, data_dir / "valid.bpe.tsv")
encode_file(sp, test_raw, data_dir / "test.bpe.tsv")

for size in sizes:
    # Down-sample deterministically for reproducibility
    pairs = encoded_train.read_text().splitlines()
    random.Random(42).shuffle(pairs)
    subset_path = data_dir / f"train_{size}.bpe.tsv"
    subset_path.write_text("\n".join(pairs[: size]) + "\n", encoding="utf-8")

Downloading IWSLT-2014...
Training SentencePiece model...


sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: data/bpe8k_corpus.txt
  input_format: 
  model_prefix: data/bpe8k
  model_type: BPE
  vocab_size: 8000
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 1
  bos_id: 2
  eos_id: 3
  pad_id: 0
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  diff

Encoding full corpus... (this may take a minute)


In [None]:
def suggest_bilstm_params(trial: optuna.Trial) -> dict:
    """
    Define a compact search space for a Bi-LSTM MT model with fixed batch size 2048,
    Adam optimizer, and StepLR scheduler.
    """
    return {
        # Architecture parameters
        "emb_size": trial.suggest_int("emb_size", 128, 512, step=64),
        "hidden_size": trial.suggest_int("hidden_size", 256, 1024, step=128),
        "num_layers": trial.suggest_int("num_layers", 1, 3),
        "dropout": trial.suggest_float("dropout", 0.1, 0.3),

        # optimizer parameters
        "lr": trial.suggest_float("lr", 3e-4, 5e-3, log=True),
        "weight_decay": trial.suggest_float("weight_decay", 1e-6, 1e-4, log=True),
        "clip_norm": trial.suggest_float("clip_norm", 0.1, 1.0),
        "eps": trial.suggest_float("eps", 1e-9, 1e-6, log=True),
        "beta1": trial.suggest_float("beta1", 0.8, 0.99, step=0.01),
        "beta2": trial.suggest_float("beta2", 0.9, 0.999, step=0.001),

        # scheduler parameters
        "scheduler_step_size": trial.suggest_int("scheduler_step_size", 10, 40),
        "scheduler_gamma": trial.suggest_float("scheduler_gamma", 0.7, 0.9),
    }


def suggest_transformer_params(trial: optuna.Trial) -> dict:
    """
    Define a compact search space for a Transformer MT model with fixed batch size 2048,
    Adam optimizer, and StepLR scheduler.
    """
    return {
        # Architecture parameters
        "d_model": trial.suggest_int("d_model", 256, 512, step=128),
        "nhead": trial.suggest_categorical("nhead", [4, 8]),
        "num_layers": trial.suggest_int("num_layers", 2, 4),
        "dropout": trial.suggest_float("dropout", 0.1, 0.3),

        # optimizer parameters
        "lr": 1.0,
        "lr_scale": trial.suggest_float("lr_scale", 0.2, 2.0, log=True),
        "weight_decay": trial.suggest_float("weight_decay", 1e-6, 1e-4, log=True),
        "clip_norm": trial.suggest_float("clip_norm", 0.1, 1.0),
        "eps": trial.suggest_float("eps", 1e-9, 1e-6, log=True),
        "beta1": trial.suggest_float("beta1", 0.8, 0.99, step=0.01),
        "beta2": trial.suggest_float("beta2", 0.9, 0.999, step=0.001),

        "warmup_steps": trial.suggest_int("warmup_steps", 400, 800, step=50)
    }


def make_objective(model_class, train_iter, valid_iter):
    def objective(trial):
        global current_size, device
        is_bilstm = (model_class.__name__ == 'BiLSTMTranslator')
        params = suggest_bilstm_params(trial) if is_bilstm else suggest_transformer_params(trial)

        # Build model
        if is_bilstm:
            model = model_class(8000,
                                hidden_size=params['hidden_size'],
                                num_layers=params['num_layers'],
                                dropout=params['dropout'])
        else:
            model = model_class(8000,
                                d_model=params['d_model'],
                                nhead=params['nhead'],
                                num_layers=params['num_layers'],
                                dropout=params['dropout'])

        model.to(device)
        optimizer = optim.Adam(model.parameters(), lr=params['lr'], weight_decay=params['weight_decay'], eps=params['eps'], betas=(params['beta1'], params['beta2']))
        if is_bilstm:
            scheduler = StepLR(optimizer, step_size=params['scheduler_step_size'], gamma=params['scheduler_gamma'])
        else:
            scheduler = get_noam_scheduler(optimizer, params['d_model'], warmup_steps=params['warmup_steps'], lr_scale=params['lr_scale'])

        max_epochs = epochs_per_size[current_size]
        for epoch in range(1, max_epochs + 1):
            train_epoch(model, train_iter, optimizer, device, scheduler, clip_norm=params['clip_norm'])
            bleu, _ = evaluate(model, valid_iter, sp, device)
            trial.report(bleu, epoch)
            if trial.should_prune():
                raise optuna.TrialPruned()
        return bleu
    return objective

pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=2)
best_params = {}

for model_class in [BiLSTMTranslator]:
# for model_class in [TransformerTranslator]:
    model_name = model_class.__name__
    prev_params = None
    print(f"\nTuning {model_name} across dataset sizes...")

    batch_size = 2048 if model_class == BiLSTMTranslator else 1024
    max_steps = 2000 if model_class == BiLSTMTranslator else 4000
    print(f"Using\t Batch size (train & tune): {batch_size}\t Max steps (train): {max_steps}")

    for size in sizes:
        current_size = size
        print(f"\nDataset size: {size}")

        # Load data slice
        train_ds = ParallelDataset(data_dir / f"train_{size}.bpe.tsv", sp)
        valid_ds = ParallelDataset(data_dir / "valid.bpe.tsv", sp)
        test_ds = ParallelDataset(data_dir / "test.bpe.tsv", sp)

        train_iter = tud.DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
        valid_iter = tud.DataLoader(valid_ds, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
        test_iter = tud.DataLoader(test_ds, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

        # Create or reuse study
        study_name = f"{model_name}_{size}"
        study = optuna.create_study(storage=f'sqlite:///{tune_dir / f"{study_name}.db"}', direction='maximize', pruner=pruner, study_name=study_name, load_if_exists=False)

        # Warm-start with previous size's best
        if prev_params:
            study.enqueue_trial(prev_params)

        # Optimize
        study.optimize(make_objective(model_class, train_iter, valid_iter), n_trials=trials_per_size[size], gc_after_trial=True, show_progress_bar=True)

        # Record and carry forward
        best = study.best_params
        best_params[size] = best
        prev_params = best
        # write best params to file
        with open(tune_dir / f"{model_name}_{size}_best.json", "w") as f:
            json.dump(best, f, indent=2)

        print(f"Tuning done for {model_name} @ {size}!")
        print(f"Now training best model")

        model = model_class(8000, **best)
        model.to(device)

        optimizer = optim.Adam(model.parameters(), lr=best.get('lr', 1.0), weight_decay=best['weight_decay'], eps=best['eps'], betas=(best['beta1'], best['beta2']))

        if model_class == TransformerTranslator:
            scheduler = get_noam_scheduler(optimizer, best['d_model'], warmup_steps=best['warmup_steps'], lr_scale=best['lr_scale'])
        else:
            scheduler = StepLR(optimizer, step_size=best['scheduler_step_size'], gamma=best['scheduler_gamma'])

        step = 0
        best_bleu = 0.0
        start_time = time.time()
        p_bar = tqdm(leave=False, dynamic_ncols=True, desc="Training", unit="it", total=max_steps)
        i = 0
        write = SummaryWriter(log_dir / f"{model_name}_{size}")
        cfg_name = f"{model_name}_{size}"
        with Timer() as run_timer:
            while step < max_steps and (time.time() - start_time) < 3600:
                epoch_loss = train_epoch(model, train_iter, optimizer, device, scheduler, clip_norm=best['clip_norm'])
                step += len(train_iter)
                i += 1
                p_bar.set_postfix(loss=epoch_loss)
                p_bar.update(len(train_iter))
                bleu, chrf = evaluate(model, valid_iter, sp, device)
                if bleu > best_bleu:
                    best_bleu = bleu
                    torch.save(model.state_dict(), train_dir / f"{cfg_name}_best.pt")
                    with open(train_dir / f"{cfg_name}_best_num_steps.txt", "w") as f:
                        f.write(str(step))
                write.add_scalar("loss/train", epoch_loss, step)
                write.add_scalar("bleu/valid", bleu, step)
                write.add_scalar("chrf/valid", chrf, step)
                print(f"[{cfg_name}] step={step} loss={epoch_loss:.3f} BLEU={bleu:.2f} ChrF={chrf:.2f}")
        
        write.add_hparams(best, {"bleu": best_bleu, "chrf": chrf, "steps": step, "time": run_timer.elapsed})
        write.close()
        torch.save(model.state_dict(), train_dir / f"{cfg_name}_final.pt")
        print(f"Training complete for {model_name} @ {size} in {timedelta(seconds=run_timer.elapsed)}")
        print(f"Best BLEU: {best_bleu:.2f}, ChrF: {chrf:.2f}")
        p_bar.close()


Tuning BiLSTMTranslator across dataset sizes...
Using	 Batch size (train & tune): 2048	 Max steps (train): 2000

Dataset size: 10000


[I 2025-07-30 03:17:18,497] A new study created in RDB with name: BiLSTMTranslator_10000


  0%|          | 0/15 [00:00<?, ?it/s]



Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 03:18:07,632] Trial 0 finished with value: 0.021840010088859624 and parameters: {'emb_size': 320, 'hidden_size': 1024, 'num_layers': 1, 'dropout': 0.1840021624878843, 'lr': 0.0013590908209991576, 'weight_decay': 5.022694759628134e-06, 'clip_norm': 0.21171324939998332, 'eps': 6.853260340947576e-09, 'beta1': 0.98, 'beta2': 0.9530000000000001, 'scheduler_step_size': 22, 'scheduler_gamma': 0.8797479337132964}. Best is trial 0 with value: 0.021840010088859624.


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 03:18:38,270] Trial 1 finished with value: 0.00883433710292348 and parameters: {'emb_size': 256, 'hidden_size': 384, 'num_layers': 2, 'dropout': 0.176826528607991, 'lr': 0.0009348953604402773, 'weight_decay': 1.4548307835798548e-06, 'clip_norm': 0.7521646800488011, 'eps': 1.6350471521897794e-08, 'beta1': 0.89, 'beta2': 0.935, 'scheduler_step_size': 10, 'scheduler_gamma': 0.8153773101550539}. Best is trial 0 with value: 0.021840010088859624.




Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 03:19:28,429] Trial 2 finished with value: 0.005402494053223377 and parameters: {'emb_size': 128, 'hidden_size': 1024, 'num_layers': 1, 'dropout': 0.2373372685667836, 'lr': 0.0006006969988583888, 'weight_decay': 1.860616436794703e-06, 'clip_norm': 0.3971392259411366, 'eps': 1.3008728977338694e-09, 'beta1': 0.93, 'beta2': 0.931, 'scheduler_step_size': 40, 'scheduler_gamma': 0.7397132894319234}. Best is trial 0 with value: 0.021840010088859624.


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 03:20:10,893] Trial 3 finished with value: 0.07374113135524744 and parameters: {'emb_size': 512, 'hidden_size': 640, 'num_layers': 2, 'dropout': 0.17913242300807822, 'lr': 0.00436147419920961, 'weight_decay': 1.8042018279721198e-06, 'clip_norm': 0.9517900527441999, 'eps': 1.3928271755794772e-07, 'beta1': 0.81, 'beta2': 0.9380000000000001, 'scheduler_step_size': 15, 'scheduler_gamma': 0.8055308313180123}. Best is trial 3 with value: 0.07374113135524744.


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 03:20:46,599] Trial 4 finished with value: 0.0048588405842435595 and parameters: {'emb_size': 192, 'hidden_size': 256, 'num_layers': 2, 'dropout': 0.1328835943706632, 'lr': 0.0013864980679508684, 'weight_decay': 4.6157497580412744e-05, 'clip_norm': 0.3667490755645847, 'eps': 1.3391933928878336e-08, 'beta1': 0.9400000000000001, 'beta2': 0.985, 'scheduler_step_size': 18, 'scheduler_gamma': 0.8849591493491784}. Best is trial 3 with value: 0.07374113135524744.


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 03:21:28,694] Trial 5 finished with value: 0.11219514041507703 and parameters: {'emb_size': 128, 'hidden_size': 896, 'num_layers': 2, 'dropout': 0.22442291966760786, 'lr': 0.0020135017456745963, 'weight_decay': 3.2407737506391444e-05, 'clip_norm': 0.9210656775828967, 'eps': 2.1838763463866785e-07, 'beta1': 0.81, 'beta2': 0.9390000000000001, 'scheduler_step_size': 27, 'scheduler_gamma': 0.8297686434026912}. Best is trial 5 with value: 0.11219514041507703.


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 03:22:15,395] Trial 6 finished with value: 0.33607565901197933 and parameters: {'emb_size': 128, 'hidden_size': 1024, 'num_layers': 2, 'dropout': 0.20140629772740112, 'lr': 0.0046855351300966195, 'weight_decay': 5.289911634326265e-06, 'clip_norm': 0.4474853111166285, 'eps': 9.265795219485713e-09, 'beta1': 0.81, 'beta2': 0.9380000000000001, 'scheduler_step_size': 33, 'scheduler_gamma': 0.7272478542866786}. Best is trial 6 with value: 0.33607565901197933.


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 03:22:35,405] Trial 7 pruned. 


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 03:22:58,735] Trial 8 pruned. 


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 03:23:16,261] Trial 9 pruned. 


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 03:24:01,748] Trial 10 finished with value: 0.058591621123156475 and parameters: {'emb_size': 192, 'hidden_size': 768, 'num_layers': 3, 'dropout': 0.10161505590889597, 'lr': 0.004427973940228536, 'weight_decay': 6.059547813491682e-06, 'clip_norm': 0.6286362868208566, 'eps': 5.435281688483443e-08, 'beta1': 0.8500000000000001, 'beta2': 0.907, 'scheduler_step_size': 34, 'scheduler_gamma': 0.7133311041950341}. Best is trial 6 with value: 0.33607565901197933.




Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 03:24:25,084] Trial 11 pruned. 


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 03:25:05,721] Trial 12 pruned. 


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 03:25:53,655] Trial 13 finished with value: 0.1463137162067473 and parameters: {'emb_size': 192, 'hidden_size': 1024, 'num_layers': 2, 'dropout': 0.2787279104564398, 'lr': 0.002305229002583193, 'weight_decay': 2.6890948879126627e-05, 'clip_norm': 0.1636326001180904, 'eps': 4.8783859825062124e-08, 'beta1': 0.8300000000000001, 'beta2': 0.9690000000000001, 'scheduler_step_size': 35, 'scheduler_gamma': 0.7708342281432701}. Best is trial 6 with value: 0.33607565901197933.


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 03:26:48,176] Trial 14 finished with value: 0.3915318207756864 and parameters: {'emb_size': 256, 'hidden_size': 1024, 'num_layers': 2, 'dropout': 0.2997458790047909, 'lr': 0.003283103744782199, 'weight_decay': 3.559253707815781e-06, 'clip_norm': 0.1751899265135654, 'eps': 5.3617868959232746e-08, 'beta1': 0.88, 'beta2': 0.97, 'scheduler_step_size': 36, 'scheduler_gamma': 0.7658951941534441}. Best is trial 14 with value: 0.3915318207756864.
Tuning done for BiLSTMTranslator @ 10000!
Now training best model


Training:   0%|          | 0/2000 [00:00<?, ?it/s]

Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=5 loss=6.839 BLEU=0.00 ChrF=4.03


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=10 loss=5.688 BLEU=0.03 ChrF=3.06


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=15 loss=5.455 BLEU=0.03 ChrF=4.28


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=20 loss=5.264 BLEU=0.12 ChrF=7.67


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=25 loss=5.066 BLEU=0.39 ChrF=10.48


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=30 loss=4.882 BLEU=0.59 ChrF=13.29


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=35 loss=4.723 BLEU=0.84 ChrF=13.52


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=40 loss=4.574 BLEU=0.78 ChrF=13.17


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=45 loss=4.450 BLEU=0.70 ChrF=12.03


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=50 loss=4.330 BLEU=0.84 ChrF=12.67


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=55 loss=4.205 BLEU=0.92 ChrF=13.81


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=60 loss=4.086 BLEU=0.94 ChrF=13.52


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=65 loss=3.965 BLEU=1.31 ChrF=15.19


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=70 loss=3.839 BLEU=1.10 ChrF=16.33


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=75 loss=3.708 BLEU=1.58 ChrF=16.07


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=80 loss=3.593 BLEU=1.77 ChrF=16.55


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=85 loss=3.487 BLEU=1.54 ChrF=16.40


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=90 loss=3.371 BLEU=1.81 ChrF=18.03


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=95 loss=3.262 BLEU=1.97 ChrF=17.80


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=100 loss=3.152 BLEU=2.04 ChrF=17.42


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=105 loss=3.055 BLEU=2.27 ChrF=18.35


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=110 loss=2.943 BLEU=2.45 ChrF=18.94


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=115 loss=2.841 BLEU=2.30 ChrF=19.51


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=120 loss=2.748 BLEU=2.18 ChrF=19.21


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=125 loss=2.656 BLEU=2.59 ChrF=19.52


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=130 loss=2.563 BLEU=2.38 ChrF=19.20


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=135 loss=2.480 BLEU=2.37 ChrF=19.36


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=140 loss=2.395 BLEU=2.37 ChrF=20.04


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=145 loss=2.312 BLEU=2.53 ChrF=19.61


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=150 loss=2.229 BLEU=2.56 ChrF=20.22


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=155 loss=2.162 BLEU=2.45 ChrF=20.01


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=160 loss=2.097 BLEU=2.53 ChrF=20.20


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=165 loss=2.031 BLEU=2.24 ChrF=20.57


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=170 loss=1.970 BLEU=2.60 ChrF=20.40


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=175 loss=1.909 BLEU=2.52 ChrF=20.81


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=180 loss=1.850 BLEU=2.51 ChrF=20.85


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=185 loss=1.787 BLEU=2.44 ChrF=20.59


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=190 loss=1.740 BLEU=2.51 ChrF=20.65


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=195 loss=1.693 BLEU=2.72 ChrF=20.96


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=200 loss=1.651 BLEU=2.56 ChrF=20.72


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=205 loss=1.606 BLEU=2.58 ChrF=20.85


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=210 loss=1.562 BLEU=2.67 ChrF=20.72


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=215 loss=1.520 BLEU=2.59 ChrF=20.82


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=220 loss=1.479 BLEU=2.69 ChrF=21.05


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=225 loss=1.443 BLEU=2.56 ChrF=21.28


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=230 loss=1.409 BLEU=2.63 ChrF=21.12


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=235 loss=1.377 BLEU=2.71 ChrF=21.26


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=240 loss=1.343 BLEU=2.53 ChrF=21.30


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=245 loss=1.312 BLEU=2.63 ChrF=21.56


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=250 loss=1.280 BLEU=2.83 ChrF=21.51


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=255 loss=1.250 BLEU=2.73 ChrF=21.47


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=260 loss=1.226 BLEU=2.78 ChrF=21.07


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=265 loss=1.204 BLEU=2.56 ChrF=21.16


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=270 loss=1.182 BLEU=2.74 ChrF=21.60


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=275 loss=1.163 BLEU=2.81 ChrF=21.61


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=280 loss=1.141 BLEU=2.79 ChrF=21.41


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=285 loss=1.124 BLEU=2.70 ChrF=21.24


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=290 loss=1.104 BLEU=2.77 ChrF=21.42


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=295 loss=1.083 BLEU=2.79 ChrF=21.47


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=300 loss=1.063 BLEU=2.90 ChrF=21.52


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=305 loss=1.046 BLEU=2.70 ChrF=21.61


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=310 loss=1.028 BLEU=2.81 ChrF=21.78


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=315 loss=1.014 BLEU=2.66 ChrF=21.38


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=320 loss=0.996 BLEU=2.65 ChrF=21.61


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=325 loss=0.978 BLEU=2.68 ChrF=21.55


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=330 loss=0.962 BLEU=2.75 ChrF=21.65


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=335 loss=0.948 BLEU=2.73 ChrF=21.40


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=340 loss=0.934 BLEU=2.76 ChrF=21.66


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=345 loss=0.923 BLEU=2.88 ChrF=21.61


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=350 loss=0.911 BLEU=2.87 ChrF=21.75


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=355 loss=0.900 BLEU=2.67 ChrF=21.52


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=360 loss=0.888 BLEU=2.65 ChrF=21.80


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=365 loss=0.876 BLEU=2.64 ChrF=21.67


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=370 loss=0.867 BLEU=2.82 ChrF=21.72


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=375 loss=0.857 BLEU=2.72 ChrF=21.61


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=380 loss=0.847 BLEU=2.74 ChrF=21.62


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=385 loss=0.838 BLEU=2.75 ChrF=21.75


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=390 loss=0.827 BLEU=2.81 ChrF=21.92


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=395 loss=0.817 BLEU=2.75 ChrF=21.80


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=400 loss=0.809 BLEU=2.77 ChrF=21.83


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=405 loss=0.801 BLEU=2.69 ChrF=21.78


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=410 loss=0.795 BLEU=2.79 ChrF=21.87


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=415 loss=0.788 BLEU=2.68 ChrF=21.68


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=420 loss=0.781 BLEU=2.80 ChrF=21.79


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=425 loss=0.775 BLEU=2.66 ChrF=21.80


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=430 loss=0.766 BLEU=2.75 ChrF=21.81


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=435 loss=0.762 BLEU=2.62 ChrF=21.87


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=440 loss=0.756 BLEU=2.72 ChrF=21.69


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=445 loss=0.751 BLEU=2.89 ChrF=21.70


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=450 loss=0.745 BLEU=2.75 ChrF=21.83


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=455 loss=0.739 BLEU=2.60 ChrF=21.66


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=460 loss=0.735 BLEU=2.72 ChrF=21.87


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=465 loss=0.730 BLEU=2.72 ChrF=21.74


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=470 loss=0.723 BLEU=2.64 ChrF=21.70


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=475 loss=0.720 BLEU=2.78 ChrF=21.75


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=480 loss=0.716 BLEU=2.52 ChrF=21.83


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=485 loss=0.713 BLEU=2.64 ChrF=21.79


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=490 loss=0.708 BLEU=2.81 ChrF=21.74


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=495 loss=0.704 BLEU=2.83 ChrF=21.79


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=500 loss=0.701 BLEU=2.71 ChrF=21.74


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=505 loss=0.698 BLEU=2.54 ChrF=21.67


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=510 loss=0.693 BLEU=2.70 ChrF=21.90


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=515 loss=0.690 BLEU=2.76 ChrF=21.92


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=520 loss=0.687 BLEU=2.74 ChrF=21.94


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=525 loss=0.686 BLEU=2.82 ChrF=21.90


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=530 loss=0.682 BLEU=2.83 ChrF=21.90


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=535 loss=0.680 BLEU=2.65 ChrF=21.81


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=540 loss=0.678 BLEU=2.63 ChrF=21.70


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=545 loss=0.674 BLEU=2.75 ChrF=21.89


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=550 loss=0.671 BLEU=2.67 ChrF=21.84


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=555 loss=0.669 BLEU=2.73 ChrF=21.80


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=560 loss=0.667 BLEU=2.63 ChrF=21.89


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=565 loss=0.665 BLEU=2.81 ChrF=21.89


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=570 loss=0.662 BLEU=2.62 ChrF=21.88


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=575 loss=0.661 BLEU=2.71 ChrF=21.94


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=580 loss=0.658 BLEU=2.76 ChrF=22.02


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=585 loss=0.657 BLEU=2.71 ChrF=21.85


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=590 loss=0.655 BLEU=2.72 ChrF=21.91


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=595 loss=0.652 BLEU=2.63 ChrF=21.79


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=600 loss=0.651 BLEU=2.68 ChrF=21.89


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=605 loss=0.651 BLEU=2.67 ChrF=21.91


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=610 loss=0.649 BLEU=2.73 ChrF=21.85


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=615 loss=0.647 BLEU=2.73 ChrF=21.95


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=620 loss=0.645 BLEU=2.73 ChrF=22.00


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=625 loss=0.643 BLEU=2.66 ChrF=21.88


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=630 loss=0.643 BLEU=2.61 ChrF=21.87


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=635 loss=0.643 BLEU=2.56 ChrF=21.80


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=640 loss=0.640 BLEU=2.65 ChrF=21.95


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=645 loss=0.640 BLEU=2.65 ChrF=21.84


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=650 loss=0.637 BLEU=2.73 ChrF=22.02


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=655 loss=0.636 BLEU=2.74 ChrF=21.88


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=660 loss=0.637 BLEU=2.69 ChrF=21.85


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=665 loss=0.635 BLEU=2.65 ChrF=21.97


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=670 loss=0.633 BLEU=2.73 ChrF=21.95


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=675 loss=0.632 BLEU=2.65 ChrF=21.98


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=680 loss=0.632 BLEU=2.58 ChrF=21.90


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=685 loss=0.630 BLEU=2.72 ChrF=21.91


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=690 loss=0.630 BLEU=2.67 ChrF=21.97


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=695 loss=0.630 BLEU=2.71 ChrF=22.01


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=700 loss=0.627 BLEU=2.77 ChrF=21.96


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=705 loss=0.629 BLEU=2.72 ChrF=22.03


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=710 loss=0.626 BLEU=2.71 ChrF=21.99


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=715 loss=0.626 BLEU=2.75 ChrF=21.90


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=720 loss=0.627 BLEU=2.73 ChrF=21.98


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=725 loss=0.625 BLEU=2.64 ChrF=21.91


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=730 loss=0.623 BLEU=2.69 ChrF=21.84


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=735 loss=0.625 BLEU=2.79 ChrF=21.98


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=740 loss=0.623 BLEU=2.81 ChrF=22.02


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=745 loss=0.622 BLEU=2.72 ChrF=21.96


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=750 loss=0.622 BLEU=2.81 ChrF=22.03


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=755 loss=0.620 BLEU=2.87 ChrF=22.08


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=760 loss=0.621 BLEU=2.92 ChrF=22.08


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=765 loss=0.620 BLEU=2.83 ChrF=21.97


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=770 loss=0.620 BLEU=2.93 ChrF=21.99


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=775 loss=0.620 BLEU=2.90 ChrF=22.05


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=780 loss=0.620 BLEU=2.78 ChrF=22.01


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=785 loss=0.618 BLEU=2.77 ChrF=21.99


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=790 loss=0.619 BLEU=2.87 ChrF=21.88


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=795 loss=0.618 BLEU=2.84 ChrF=21.94


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=800 loss=0.618 BLEU=2.85 ChrF=22.00


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=805 loss=0.618 BLEU=2.84 ChrF=22.00


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=810 loss=0.616 BLEU=2.78 ChrF=21.97


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=815 loss=0.618 BLEU=2.80 ChrF=21.94


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=820 loss=0.617 BLEU=2.85 ChrF=21.97


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=825 loss=0.615 BLEU=2.82 ChrF=22.07


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=830 loss=0.616 BLEU=2.84 ChrF=22.06


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=835 loss=0.616 BLEU=2.81 ChrF=21.97


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=840 loss=0.613 BLEU=2.77 ChrF=21.91


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=845 loss=0.615 BLEU=2.88 ChrF=22.00


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=850 loss=0.613 BLEU=2.89 ChrF=22.05


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=855 loss=0.613 BLEU=2.79 ChrF=22.04


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=860 loss=0.614 BLEU=2.81 ChrF=22.03


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=865 loss=0.614 BLEU=2.83 ChrF=22.03


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=870 loss=0.613 BLEU=2.86 ChrF=21.97


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=875 loss=0.613 BLEU=2.80 ChrF=21.93


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=880 loss=0.612 BLEU=2.82 ChrF=21.97


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=885 loss=0.612 BLEU=2.81 ChrF=21.99


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=890 loss=0.614 BLEU=2.82 ChrF=22.04


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=895 loss=0.612 BLEU=2.79 ChrF=21.98


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=900 loss=0.612 BLEU=2.71 ChrF=21.92


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=905 loss=0.613 BLEU=2.69 ChrF=21.95


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=910 loss=0.612 BLEU=2.70 ChrF=21.95


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=915 loss=0.612 BLEU=2.75 ChrF=22.01


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=920 loss=0.612 BLEU=2.74 ChrF=21.97


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=925 loss=0.612 BLEU=2.74 ChrF=21.97


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=930 loss=0.611 BLEU=2.75 ChrF=21.99


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=935 loss=0.611 BLEU=2.72 ChrF=21.98


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=940 loss=0.610 BLEU=2.71 ChrF=22.00


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=945 loss=0.610 BLEU=2.72 ChrF=22.02


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=950 loss=0.611 BLEU=2.74 ChrF=22.02


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=955 loss=0.611 BLEU=2.74 ChrF=22.04


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=960 loss=0.611 BLEU=2.74 ChrF=22.08


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=965 loss=0.610 BLEU=2.75 ChrF=22.07


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=970 loss=0.610 BLEU=2.80 ChrF=22.08


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=975 loss=0.610 BLEU=2.81 ChrF=21.99


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=980 loss=0.610 BLEU=2.77 ChrF=22.01


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=985 loss=0.611 BLEU=2.80 ChrF=22.03


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=990 loss=0.611 BLEU=2.80 ChrF=22.08


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=995 loss=0.608 BLEU=2.80 ChrF=22.03


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1000 loss=0.610 BLEU=2.79 ChrF=22.04


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1005 loss=0.610 BLEU=2.79 ChrF=22.03


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1010 loss=0.609 BLEU=2.80 ChrF=22.01


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1015 loss=0.609 BLEU=2.80 ChrF=22.00


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1020 loss=0.610 BLEU=2.80 ChrF=22.01


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1025 loss=0.610 BLEU=2.82 ChrF=22.03


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1030 loss=0.608 BLEU=2.82 ChrF=22.04


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1035 loss=0.609 BLEU=2.82 ChrF=22.02


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1040 loss=0.610 BLEU=2.76 ChrF=22.00


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1045 loss=0.610 BLEU=2.74 ChrF=21.99


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1050 loss=0.610 BLEU=2.72 ChrF=21.99


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1055 loss=0.610 BLEU=2.75 ChrF=21.99


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1060 loss=0.610 BLEU=2.73 ChrF=22.02


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1065 loss=0.609 BLEU=2.74 ChrF=21.99


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1070 loss=0.608 BLEU=2.79 ChrF=22.03


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1075 loss=0.610 BLEU=2.81 ChrF=22.04


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1080 loss=0.608 BLEU=2.76 ChrF=22.01


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1085 loss=0.609 BLEU=2.76 ChrF=22.00


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1090 loss=0.609 BLEU=2.76 ChrF=22.02


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1095 loss=0.609 BLEU=2.75 ChrF=22.04


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1100 loss=0.608 BLEU=2.75 ChrF=22.02


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1105 loss=0.608 BLEU=2.76 ChrF=22.02


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1110 loss=0.609 BLEU=2.76 ChrF=22.02


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1115 loss=0.610 BLEU=2.76 ChrF=22.01


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1120 loss=0.609 BLEU=2.76 ChrF=22.02


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1125 loss=0.608 BLEU=2.75 ChrF=22.03


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1130 loss=0.609 BLEU=2.76 ChrF=22.04


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1135 loss=0.608 BLEU=2.76 ChrF=22.05


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1140 loss=0.608 BLEU=2.74 ChrF=22.03


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1145 loss=0.608 BLEU=2.75 ChrF=22.03


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1150 loss=0.608 BLEU=2.75 ChrF=22.05


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1155 loss=0.607 BLEU=2.75 ChrF=22.05


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1160 loss=0.607 BLEU=2.75 ChrF=22.05


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1165 loss=0.608 BLEU=2.73 ChrF=22.02


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1170 loss=0.608 BLEU=2.76 ChrF=22.03


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1175 loss=0.609 BLEU=2.76 ChrF=22.04


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1180 loss=0.608 BLEU=2.76 ChrF=22.03


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1185 loss=0.609 BLEU=2.75 ChrF=22.06


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1190 loss=0.607 BLEU=2.75 ChrF=22.03


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1195 loss=0.608 BLEU=2.75 ChrF=22.02


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1200 loss=0.608 BLEU=2.74 ChrF=22.02


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1205 loss=0.609 BLEU=2.74 ChrF=22.02


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1210 loss=0.608 BLEU=2.75 ChrF=21.99


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1215 loss=0.608 BLEU=2.75 ChrF=21.98


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1220 loss=0.607 BLEU=2.81 ChrF=22.00


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1225 loss=0.607 BLEU=2.81 ChrF=22.02


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1230 loss=0.608 BLEU=2.82 ChrF=22.03


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1235 loss=0.608 BLEU=2.82 ChrF=22.07


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1240 loss=0.608 BLEU=2.82 ChrF=22.05


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1245 loss=0.607 BLEU=2.82 ChrF=22.05


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1250 loss=0.609 BLEU=2.81 ChrF=22.02


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1255 loss=0.608 BLEU=2.84 ChrF=22.05


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1260 loss=0.608 BLEU=2.83 ChrF=22.03


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1265 loss=0.608 BLEU=2.84 ChrF=22.02


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1270 loss=0.607 BLEU=2.84 ChrF=22.03


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1275 loss=0.608 BLEU=2.84 ChrF=22.05


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1280 loss=0.608 BLEU=2.84 ChrF=22.04


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1285 loss=0.607 BLEU=2.84 ChrF=22.04


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1290 loss=0.608 BLEU=2.77 ChrF=22.02


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1295 loss=0.607 BLEU=2.83 ChrF=22.02


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1300 loss=0.607 BLEU=2.83 ChrF=22.02


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1305 loss=0.609 BLEU=2.77 ChrF=22.02


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1310 loss=0.607 BLEU=2.77 ChrF=22.01


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1315 loss=0.610 BLEU=2.77 ChrF=22.02


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1320 loss=0.607 BLEU=2.77 ChrF=22.02


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1325 loss=0.608 BLEU=2.77 ChrF=22.03


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1330 loss=0.607 BLEU=2.77 ChrF=22.02


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1335 loss=0.607 BLEU=2.77 ChrF=22.03


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1340 loss=0.608 BLEU=2.77 ChrF=22.03


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1345 loss=0.608 BLEU=2.77 ChrF=22.02


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1350 loss=0.607 BLEU=2.77 ChrF=22.03


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1355 loss=0.608 BLEU=2.77 ChrF=22.03


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1360 loss=0.609 BLEU=2.78 ChrF=22.03


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1365 loss=0.609 BLEU=2.77 ChrF=22.02


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1370 loss=0.609 BLEU=2.77 ChrF=22.01


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1375 loss=0.608 BLEU=2.77 ChrF=22.01


Train batches:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_10000] step=1380 loss=0.607 BLEU=2.78 ChrF=22.02
Training complete for BiLSTMTranslator @ 10000 in 1:00:09.081581
Best BLEU: 2.93, ChrF: 22.02

Dataset size: 50000


[I 2025-07-30 04:27:00,159] A new study created in RDB with name: BiLSTMTranslator_50000


  0%|          | 0/20 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 04:29:35,450] Trial 0 finished with value: 4.13484680650554 and parameters: {'emb_size': 256, 'hidden_size': 1024, 'num_layers': 2, 'dropout': 0.2997458790047909, 'lr': 0.003283103744782199, 'weight_decay': 3.559253707815781e-06, 'clip_norm': 0.1751899265135654, 'eps': 5.3617868959232746e-08, 'beta1': 0.88, 'beta2': 0.97, 'scheduler_step_size': 36, 'scheduler_gamma': 0.7658951941534441}. Best is trial 0 with value: 4.13484680650554.


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 04:31:29,566] Trial 1 finished with value: 0.08357922860391781 and parameters: {'emb_size': 128, 'hidden_size': 512, 'num_layers': 3, 'dropout': 0.16355276388512302, 'lr': 0.0009356712916552662, 'weight_decay': 5.451804075093817e-05, 'clip_norm': 0.7852385657600478, 'eps': 1.670338781596552e-08, 'beta1': 0.8400000000000001, 'beta2': 0.97, 'scheduler_step_size': 12, 'scheduler_gamma': 0.8270746250710269}. Best is trial 0 with value: 4.13484680650554.


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 04:34:18,721] Trial 2 finished with value: 1.4172729605455685 and parameters: {'emb_size': 448, 'hidden_size': 1024, 'num_layers': 3, 'dropout': 0.18827591259002613, 'lr': 0.002846833659703249, 'weight_decay': 4.8257324683136324e-06, 'clip_norm': 0.2596188411885035, 'eps': 2.4130265891754445e-07, 'beta1': 0.8200000000000001, 'beta2': 0.973, 'scheduler_step_size': 21, 'scheduler_gamma': 0.8976584934832944}. Best is trial 0 with value: 4.13484680650554.


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 04:38:33,627] Trial 4 finished with value: 0.19404227996237067 and parameters: {'emb_size': 320, 'hidden_size': 640, 'num_layers': 2, 'dropout': 0.2677138558261009, 'lr': 0.0005078977544210152, 'weight_decay': 9.928047977407783e-06, 'clip_norm': 0.27899429465574777, 'eps': 2.748310305049888e-09, 'beta1': 0.9, 'beta2': 0.934, 'scheduler_step_size': 25, 'scheduler_gamma': 0.7893114832062307}. Best is trial 0 with value: 4.13484680650554.


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 04:40:21,439] Trial 5 finished with value: 1.3526764126519266 and parameters: {'emb_size': 512, 'hidden_size': 512, 'num_layers': 2, 'dropout': 0.29181133149366345, 'lr': 0.002955934233203316, 'weight_decay': 4.26342272067657e-06, 'clip_norm': 0.7416619134441844, 'eps': 1.817951520394471e-08, 'beta1': 0.8500000000000001, 'beta2': 0.927, 'scheduler_step_size': 36, 'scheduler_gamma': 0.7302656105444427}. Best is trial 0 with value: 4.13484680650554.




Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 04:42:44,758] Trial 6 finished with value: 1.3197220545092716 and parameters: {'emb_size': 512, 'hidden_size': 896, 'num_layers': 1, 'dropout': 0.14047766221886154, 'lr': 0.0005964563860207544, 'weight_decay': 5.7745609030786716e-05, 'clip_norm': 0.23534851706761473, 'eps': 2.4239848686723394e-08, 'beta1': 0.8, 'beta2': 0.9470000000000001, 'scheduler_step_size': 29, 'scheduler_gamma': 0.8867556333719415}. Best is trial 0 with value: 4.13484680650554.


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 04:43:22,312] Trial 7 pruned. 


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 04:44:06,444] Trial 8 pruned. 


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 04:45:08,238] Trial 9 pruned. 




Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 04:47:27,738] Trial 10 finished with value: 8.267362613259646 and parameters: {'emb_size': 256, 'hidden_size': 1024, 'num_layers': 1, 'dropout': 0.24091394220921847, 'lr': 0.004722290732386166, 'weight_decay': 3.5041755783393595e-06, 'clip_norm': 0.10307286306520758, 'eps': 7.783663182427742e-07, 'beta1': 0.93, 'beta2': 0.914, 'scheduler_step_size': 39, 'scheduler_gamma': 0.7611366060708542}. Best is trial 10 with value: 8.267362613259646.




Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 04:49:51,385] Trial 11 finished with value: 1.7243394600682411 and parameters: {'emb_size': 256, 'hidden_size': 1024, 'num_layers': 1, 'dropout': 0.24198687072816194, 'lr': 0.004756245054007877, 'weight_decay': 3.546480980981928e-06, 'clip_norm': 0.101534998814659, 'eps': 9.114638652626682e-07, 'beta1': 0.9400000000000001, 'beta2': 0.904, 'scheduler_step_size': 40, 'scheduler_gamma': 0.7775389713861152}. Best is trial 10 with value: 8.267362613259646.




Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 04:52:12,884] Trial 12 finished with value: 2.8414446016875647 and parameters: {'emb_size': 256, 'hidden_size': 1024, 'num_layers': 1, 'dropout': 0.2970519682788218, 'lr': 0.0011784791195514842, 'weight_decay': 2.2798460490492315e-06, 'clip_norm': 0.11656110583521972, 'eps': 9.694604379266096e-08, 'beta1': 0.88, 'beta2': 0.905, 'scheduler_step_size': 40, 'scheduler_gamma': 0.7605231191234234}. Best is trial 10 with value: 8.267362613259646.




Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 04:53:01,728] Trial 13 pruned. 




Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 04:53:56,931] Trial 14 pruned. 


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 04:54:34,490] Trial 15 pruned. 




Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 04:55:23,310] Trial 16 pruned. 


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 04:56:30,866] Trial 17 pruned. 




Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 04:58:50,540] Trial 18 finished with value: 4.3801185999816505 and parameters: {'emb_size': 384, 'hidden_size': 896, 'num_layers': 1, 'dropout': 0.21582231989679768, 'lr': 0.002246436745299442, 'weight_decay': 2.978696350391343e-05, 'clip_norm': 0.15487760422215927, 'eps': 3.8057018330586895e-07, 'beta1': 0.8600000000000001, 'beta2': 0.9430000000000001, 'scheduler_step_size': 33, 'scheduler_gamma': 0.7092799505989668}. Best is trial 10 with value: 8.267362613259646.




Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 04:59:46,085] Trial 19 pruned. 
Tuning done for BiLSTMTranslator @ 50000!
Now training best model


Training:   0%|          | 0/2000 [00:00<?, ?it/s]

Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=25 loss=5.641 BLEU=0.50 ChrF=12.27


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=50 loss=4.605 BLEU=0.66 ChrF=12.19


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=75 loss=4.175 BLEU=1.21 ChrF=14.45


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=100 loss=3.866 BLEU=1.46 ChrF=16.52


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=125 loss=3.657 BLEU=1.96 ChrF=18.59


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=150 loss=3.498 BLEU=2.03 ChrF=18.28


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=175 loss=3.362 BLEU=2.84 ChrF=20.05


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=200 loss=3.247 BLEU=3.07 ChrF=20.76


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=225 loss=3.145 BLEU=3.33 ChrF=20.87


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=250 loss=3.058 BLEU=3.84 ChrF=21.74


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=275 loss=2.987 BLEU=4.21 ChrF=22.31


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=300 loss=2.919 BLEU=4.66 ChrF=23.12


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=325 loss=2.868 BLEU=4.71 ChrF=22.71


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=350 loss=2.820 BLEU=4.93 ChrF=23.33


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=375 loss=2.779 BLEU=5.13 ChrF=23.30


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=400 loss=2.749 BLEU=5.18 ChrF=23.77


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=425 loss=2.721 BLEU=5.35 ChrF=23.96


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=450 loss=2.696 BLEU=5.21 ChrF=24.07


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=475 loss=2.678 BLEU=5.55 ChrF=24.31


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=500 loss=2.661 BLEU=5.57 ChrF=24.43


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=525 loss=2.646 BLEU=5.40 ChrF=24.43


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=550 loss=2.635 BLEU=5.68 ChrF=24.44


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=575 loss=2.625 BLEU=5.43 ChrF=24.38


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=600 loss=2.617 BLEU=5.70 ChrF=24.80


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=625 loss=2.610 BLEU=5.63 ChrF=24.71


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=650 loss=2.604 BLEU=5.86 ChrF=24.89


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=675 loss=2.598 BLEU=5.67 ChrF=24.78


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=700 loss=2.595 BLEU=5.79 ChrF=24.91


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=725 loss=2.591 BLEU=5.82 ChrF=24.82


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=750 loss=2.588 BLEU=5.83 ChrF=25.09


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=775 loss=2.586 BLEU=5.74 ChrF=24.86


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=800 loss=2.584 BLEU=5.85 ChrF=24.96


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=825 loss=2.582 BLEU=5.84 ChrF=24.87


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=850 loss=2.581 BLEU=5.89 ChrF=25.11


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=875 loss=2.579 BLEU=5.90 ChrF=25.14


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=900 loss=2.578 BLEU=5.84 ChrF=25.03


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=925 loss=2.577 BLEU=5.85 ChrF=25.08


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=950 loss=2.576 BLEU=5.82 ChrF=25.05


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=975 loss=2.576 BLEU=5.88 ChrF=25.11


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1000 loss=2.575 BLEU=5.85 ChrF=25.07


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1025 loss=2.574 BLEU=5.78 ChrF=25.00


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1050 loss=2.575 BLEU=5.82 ChrF=25.04


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1075 loss=2.574 BLEU=5.85 ChrF=25.07


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1100 loss=2.574 BLEU=5.88 ChrF=25.10


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1125 loss=2.574 BLEU=5.82 ChrF=25.04


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1150 loss=2.574 BLEU=5.90 ChrF=25.13


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1175 loss=2.573 BLEU=5.92 ChrF=25.11


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1200 loss=2.573 BLEU=5.92 ChrF=25.11


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1225 loss=2.573 BLEU=5.85 ChrF=25.07


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1250 loss=2.573 BLEU=5.87 ChrF=25.07


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1275 loss=2.572 BLEU=5.88 ChrF=25.08


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1300 loss=2.572 BLEU=5.84 ChrF=25.04


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1325 loss=2.572 BLEU=5.86 ChrF=25.09


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1350 loss=2.573 BLEU=5.85 ChrF=25.10


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1375 loss=2.572 BLEU=5.85 ChrF=25.11


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1400 loss=2.572 BLEU=5.85 ChrF=25.09


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1425 loss=2.572 BLEU=5.88 ChrF=25.09


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1450 loss=2.572 BLEU=5.86 ChrF=25.12


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1475 loss=2.572 BLEU=5.86 ChrF=25.10


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1500 loss=2.572 BLEU=5.86 ChrF=25.11


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1525 loss=2.572 BLEU=5.85 ChrF=25.09


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1550 loss=2.572 BLEU=5.86 ChrF=25.10


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1575 loss=2.572 BLEU=5.88 ChrF=25.09


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1600 loss=2.573 BLEU=5.88 ChrF=25.10


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1625 loss=2.573 BLEU=5.87 ChrF=25.10


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1650 loss=2.572 BLEU=5.87 ChrF=25.10


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1675 loss=2.572 BLEU=5.88 ChrF=25.11


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1700 loss=2.572 BLEU=5.88 ChrF=25.11


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1725 loss=2.572 BLEU=5.87 ChrF=25.10


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1750 loss=2.572 BLEU=5.88 ChrF=25.11


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1775 loss=2.572 BLEU=5.87 ChrF=25.10


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1800 loss=2.572 BLEU=5.87 ChrF=25.10


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1825 loss=2.572 BLEU=5.87 ChrF=25.11


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1850 loss=2.572 BLEU=5.87 ChrF=25.11


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1875 loss=2.572 BLEU=5.87 ChrF=25.11


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1900 loss=2.572 BLEU=5.87 ChrF=25.11


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1925 loss=2.572 BLEU=5.87 ChrF=25.11


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1950 loss=2.572 BLEU=5.87 ChrF=25.11


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=1975 loss=2.571 BLEU=5.87 ChrF=25.11


Train batches:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_50000] step=2000 loss=2.572 BLEU=5.88 ChrF=25.11
Training complete for BiLSTMTranslator @ 50000 in 0:37:25.565089
Best BLEU: 5.92, ChrF: 25.11

Dataset size: 75000


[I 2025-07-30 05:37:15,192] A new study created in RDB with name: BiLSTMTranslator_75000


  0%|          | 0/20 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 05:40:28,634] Trial 0 finished with value: 9.445463167722947 and parameters: {'emb_size': 256, 'hidden_size': 1024, 'num_layers': 1, 'dropout': 0.24091394220921847, 'lr': 0.004722290732386166, 'weight_decay': 3.5041755783393595e-06, 'clip_norm': 0.10307286306520758, 'eps': 7.783663182427742e-07, 'beta1': 0.93, 'beta2': 0.914, 'scheduler_step_size': 39, 'scheduler_gamma': 0.7611366060708542}. Best is trial 0 with value: 9.445463167722947.


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 05:44:04,782] Trial 1 finished with value: 1.7174859284360775 and parameters: {'emb_size': 320, 'hidden_size': 896, 'num_layers': 3, 'dropout': 0.1186541633025393, 'lr': 0.0029183275976708313, 'weight_decay': 9.708813801914399e-06, 'clip_norm': 0.3976880461482678, 'eps': 1.8339320664478778e-07, 'beta1': 0.9, 'beta2': 0.994, 'scheduler_step_size': 29, 'scheduler_gamma': 0.8242641327111822}. Best is trial 0 with value: 9.445463167722947.


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 05:47:38,484] Trial 2 finished with value: 0.49375764947070444 and parameters: {'emb_size': 512, 'hidden_size': 1024, 'num_layers': 2, 'dropout': 0.11569205987129767, 'lr': 0.00031535118609259924, 'weight_decay': 4.8380646010385906e-05, 'clip_norm': 0.24467303898569973, 'eps': 3.342034041311524e-07, 'beta1': 0.88, 'beta2': 0.9420000000000001, 'scheduler_step_size': 32, 'scheduler_gamma': 0.7797952229045644}. Best is trial 0 with value: 9.445463167722947.


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 05:49:45,771] Trial 3 finished with value: 0.968149623674386 and parameters: {'emb_size': 448, 'hidden_size': 384, 'num_layers': 2, 'dropout': 0.2506131059065789, 'lr': 0.004707766804344857, 'weight_decay': 1.1834144789421576e-05, 'clip_norm': 0.2250374479245117, 'eps': 2.3302061046575985e-07, 'beta1': 0.99, 'beta2': 0.909, 'scheduler_step_size': 36, 'scheduler_gamma': 0.7577750127792158}. Best is trial 0 with value: 9.445463167722947.


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 05:53:39,503] Trial 4 finished with value: 2.457964811076908 and parameters: {'emb_size': 512, 'hidden_size': 1024, 'num_layers': 3, 'dropout': 0.23700889950926374, 'lr': 0.001801725897645311, 'weight_decay': 5.410527836021289e-06, 'clip_norm': 0.41685988803924157, 'eps': 6.277247011283579e-08, 'beta1': 0.8, 'beta2': 0.976, 'scheduler_step_size': 38, 'scheduler_gamma': 0.8766030432623118}. Best is trial 0 with value: 9.445463167722947.




Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 05:56:46,996] Trial 5 finished with value: 3.0601461723225483 and parameters: {'emb_size': 448, 'hidden_size': 896, 'num_layers': 1, 'dropout': 0.22836538176356638, 'lr': 0.000988535224726006, 'weight_decay': 2.7141251693823122e-05, 'clip_norm': 0.49710021472391375, 'eps': 3.459216135768257e-08, 'beta1': 0.8500000000000001, 'beta2': 0.912, 'scheduler_step_size': 26, 'scheduler_gamma': 0.8296562452313248}. Best is trial 0 with value: 9.445463167722947.


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 05:57:41,984] Trial 6 pruned. 


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 05:58:49,476] Trial 7 pruned. 


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 05:59:39,338] Trial 8 pruned. 


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 06:00:41,919] Trial 9 pruned. 




Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 06:01:49,536] Trial 10 pruned. 




Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 06:03:06,561] Trial 11 pruned. 




Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 06:05:57,521] Trial 12 finished with value: 3.305922260818568 and parameters: {'emb_size': 256, 'hidden_size': 768, 'num_layers': 1, 'dropout': 0.16930365873895697, 'lr': 0.002309484727124264, 'weight_decay': 3.591758298790829e-06, 'clip_norm': 0.1086933168458383, 'eps': 4.0480521173822826e-09, 'beta1': 0.92, 'beta2': 0.925, 'scheduler_step_size': 23, 'scheduler_gamma': 0.8990634540771807}. Best is trial 0 with value: 9.445463167722947.




Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 06:08:23,670] Trial 13 finished with value: 4.4014312190813465 and parameters: {'emb_size': 192, 'hidden_size': 640, 'num_layers': 1, 'dropout': 0.15487114086917597, 'lr': 0.002384310362199346, 'weight_decay': 3.4803165238559254e-06, 'clip_norm': 0.10919110663853146, 'eps': 2.892475732457924e-09, 'beta1': 0.92, 'beta2': 0.928, 'scheduler_step_size': 19, 'scheduler_gamma': 0.8937116095805645}. Best is trial 0 with value: 9.445463167722947.




Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 06:09:26,008] Trial 14 pruned. 




Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 06:11:56,951] Trial 15 finished with value: 4.659196770559395 and parameters: {'emb_size': 128, 'hidden_size': 640, 'num_layers': 1, 'dropout': 0.14822084283429457, 'lr': 0.004848507073706407, 'weight_decay': 2.2280441466702304e-06, 'clip_norm': 0.2676172207910267, 'eps': 7.520207895693356e-09, 'beta1': 0.91, 'beta2': 0.9470000000000001, 'scheduler_step_size': 18, 'scheduler_gamma': 0.7998771615881806}. Best is trial 0 with value: 9.445463167722947.


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 06:12:55,340] Trial 16 pruned. 




Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 06:14:33,799] Trial 17 pruned. 


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 06:15:33,024] Trial 18 pruned. 




Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 06:18:50,963] Trial 19 finished with value: 5.910872048387518 and parameters: {'emb_size': 128, 'hidden_size': 1024, 'num_layers': 1, 'dropout': 0.20342582016383498, 'lr': 0.0019567091611883826, 'weight_decay': 2.194047145828502e-06, 'clip_norm': 0.36595425448991264, 'eps': 7.374353634941687e-07, 'beta1': 0.89, 'beta2': 0.9650000000000001, 'scheduler_step_size': 26, 'scheduler_gamma': 0.7619472446088366}. Best is trial 0 with value: 9.445463167722947.
Tuning done for BiLSTMTranslator @ 75000!
Now training best model


Training:   0%|          | 0/2000 [00:00<?, ?it/s]

Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=37 loss=5.409 BLEU=0.42 ChrF=11.03


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=74 loss=4.360 BLEU=1.14 ChrF=15.97


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=111 loss=3.915 BLEU=1.46 ChrF=16.50


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=148 loss=3.653 BLEU=2.55 ChrF=18.46


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=185 loss=3.478 BLEU=2.98 ChrF=19.52


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=222 loss=3.339 BLEU=3.35 ChrF=20.46


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=259 loss=3.224 BLEU=3.93 ChrF=21.24


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=296 loss=3.130 BLEU=4.36 ChrF=22.63


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=333 loss=3.055 BLEU=4.77 ChrF=23.05


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=370 loss=2.995 BLEU=5.25 ChrF=23.46


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=407 loss=2.948 BLEU=5.49 ChrF=23.94


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=444 loss=2.912 BLEU=5.83 ChrF=24.48


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=481 loss=2.883 BLEU=6.03 ChrF=24.64


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=518 loss=2.860 BLEU=6.22 ChrF=24.90


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=555 loss=2.844 BLEU=6.29 ChrF=24.70


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=592 loss=2.830 BLEU=6.33 ChrF=24.96


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=629 loss=2.820 BLEU=6.60 ChrF=25.16


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=666 loss=2.811 BLEU=6.57 ChrF=25.15


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=703 loss=2.805 BLEU=6.45 ChrF=25.09


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=740 loss=2.801 BLEU=6.60 ChrF=25.24


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=777 loss=2.796 BLEU=6.44 ChrF=25.21


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=814 loss=2.794 BLEU=6.56 ChrF=25.27


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=851 loss=2.791 BLEU=6.65 ChrF=25.39


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=888 loss=2.789 BLEU=6.65 ChrF=25.28


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=925 loss=2.788 BLEU=6.66 ChrF=25.28


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=962 loss=2.787 BLEU=6.68 ChrF=25.36


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=999 loss=2.787 BLEU=6.62 ChrF=25.31


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=1036 loss=2.785 BLEU=6.71 ChrF=25.37


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=1073 loss=2.785 BLEU=6.59 ChrF=25.27


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=1110 loss=2.785 BLEU=6.70 ChrF=25.30


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=1147 loss=2.785 BLEU=6.61 ChrF=25.28


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=1184 loss=2.784 BLEU=6.69 ChrF=25.32


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=1221 loss=2.784 BLEU=6.71 ChrF=25.35


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=1258 loss=2.784 BLEU=6.73 ChrF=25.32


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=1295 loss=2.784 BLEU=6.75 ChrF=25.32


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=1332 loss=2.784 BLEU=6.76 ChrF=25.34


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=1369 loss=2.784 BLEU=6.77 ChrF=25.34


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=1406 loss=2.784 BLEU=6.77 ChrF=25.32


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=1443 loss=2.784 BLEU=6.75 ChrF=25.34


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=1480 loss=2.784 BLEU=6.77 ChrF=25.34


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=1517 loss=2.784 BLEU=6.77 ChrF=25.33


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=1554 loss=2.784 BLEU=6.77 ChrF=25.33


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=1591 loss=2.784 BLEU=6.74 ChrF=25.32


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=1628 loss=2.784 BLEU=6.75 ChrF=25.33


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=1665 loss=2.784 BLEU=6.75 ChrF=25.33


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=1702 loss=2.784 BLEU=6.74 ChrF=25.33


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=1739 loss=2.783 BLEU=6.74 ChrF=25.33


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=1776 loss=2.784 BLEU=6.75 ChrF=25.33


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=1813 loss=2.784 BLEU=6.75 ChrF=25.33


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=1850 loss=2.784 BLEU=6.75 ChrF=25.34


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=1887 loss=2.784 BLEU=6.74 ChrF=25.35


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=1924 loss=2.784 BLEU=6.75 ChrF=25.33


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=1961 loss=2.783 BLEU=6.75 ChrF=25.33


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=1998 loss=2.784 BLEU=6.74 ChrF=25.35


Train batches:   0%|          | 0/37 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[BiLSTMTranslator_75000] step=2035 loss=2.784 BLEU=6.75 ChrF=25.33
Training complete for BiLSTMTranslator @ 75000 in 0:34:59.406296
Best BLEU: 6.77, ChrF: 25.33

Dataset size: 100000


[I 2025-07-30 06:53:54,786] A new study created in RDB with name: BiLSTMTranslator_100000


  0%|          | 0/25 [00:00<?, ?it/s]

Train batches:   0%|          | 0/49 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/49 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/49 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/49 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/49 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/49 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/49 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/49 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-07-30 07:00:26,523] Trial 0 finished with value: 11.557094977497274 and parameters: {'emb_size': 256, 'hidden_size': 1024, 'num_layers': 1, 'dropout': 0.24091394220921847, 'lr': 0.004722290732386166, 'weight_decay': 3.5041755783393595e-06, 'clip_norm': 0.10307286306520758, 'eps': 7.783663182427742e-07, 'beta1': 0.93, 'beta2': 0.914, 'scheduler_step_size': 39, 'scheduler_gamma': 0.7611366060708542}. Best is trial 0 with value: 11.557094977497274.


Train batches:   0%|          | 0/49 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/49 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/49 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/49 [00:00<?, ?it/s]

Evaluate:   0%|          | 0/1 [00:00<?, ?it/s]

Train batches:   0%|          | 0/49 [00:00<?, ?it/s]

In [None]:
# # Welch T test
# for size in sizes:
#     bilstm_path = train_dir / f"BiLSTMTranslator_{size}_best.pt"
#     transformer_path = train_dir / f"TransformerTranslator_{size}_best.pt"
#     if bilstm_path.exists() and transformer_path.exists():
#         bilstm_model = BiLSTMTranslator(8000, **best_params[size])
#         bilstm_model.load_state_dict(torch.load(bilstm_path, map_location=device))
#         bilstm_model.to(device)

#         transformer_model = TransformerTranslator(8000, **best_params[size])
#         transformer_model.load_state_dict(torch.load(transformer_path, map_location=device))
#         transformer_model.to(device)

#         bilstm_bleu, _ = evaluate(bilstm_model, valid_iter, sp, device)
#         transformer_bleu, _ = evaluate(transformer_model, valid_iter, sp, device)

#         t_stat, p_value = stats.ttest_ind_from_stats(
#             mean1=bilstm_bleu, std1=0.0, nobs1=1,
#             mean2=transformer_bleu, std2=0.0, nobs2=1
#         )
#         print(f"Size {size}: BiLSTM BLEU={bilstm_bleu:.2f}, Transformer BLEU={transformer_bleu:.2f}, T-stat={t_stat:.3f}, p-value={p_value:.3e}")
#     else:
#         print(f"Skipping size {size} due to missing model files.")
#         continue