### Dependencies

In [None]:
!pip install mamba-ssm
!pip install causal-conv1d>=1.4.0

In [None]:
!pip install numpy tqdm transformers datasets

In [None]:
!pip install wandb

### Dataset

In [15]:
import torch
from datasets import load_dataset
from transformers import GPT2Tokenizer

def load_and_preprocess_data(max_length=128, stride=64):
    dataset = load_dataset("wikitext", "wikitext-2-v1")
    
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token
    
    def tokenize_function(examples):
        tokenized_inputs = tokenizer(
            examples["text"],
            truncation=True,
            max_length=max_length,
            return_overflowing_tokens=True,
            return_length=True,
            stride=stride,
        )
        
        input_batch = []
        for length, input_ids in zip(tokenized_inputs["length"], tokenized_inputs["input_ids"]):
            if length == max_length:
                input_batch.append(input_ids)
        
        return {"input_ids": input_batch}
    
    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    tokenized_dataset.set_format(type="torch")
    
    return tokenized_dataset, tokenizer

def create_dataloaders(dataset, batch_size=32):
    train_dataloader = torch.utils.data.DataLoader(dataset["train"], batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(dataset["validation"], batch_size=batch_size)
    test_dataloader = torch.utils.data.DataLoader(dataset["test"], batch_size=batch_size)
    
    return train_dataloader, val_dataloader, test_dataloader

In [16]:
dataset, tokenizer = load_and_preprocess_data()

In [17]:
train_dataloader, val_dataloader, test_dataloader = create_dataloaders(dataset)

print(f"Vocabulary size: {len(tokenizer)}")
print(f"Train samples: {len(dataset['train'])}")
print(f"Validation samples: {len(dataset['validation'])}")
print(f"Test samples: {len(dataset['test'])}")

Vocabulary size: 50257
Train samples: 8624
Validation samples: 922
Test samples: 1017


In [20]:
# Check a sample batch
for batch in train_dataloader:
    print("Sample batch shape:", batch["input_ids"].shape)
    print("Sample input:", tokenizer.decode(batch["input_ids"][0]))
    break

Sample batch shape: torch.Size([32, 128])
Sample input:  Several terrestrial starlings, including those in the genus Sturnus, have adaptations of the skull and muscles that help with feeding by probing. This adaptation is most strongly developed in the common starling ( along with the spotless and white @-@ <unk> starlings ), where the <unk> muscles responsible for opening the jaw are enlarged and the skull is narrow, allowing the eye to be moved forward to peer down the length of the bill. This technique involves inserting the bill into the ground and opening it as a way of searching for hidden food items. Common starlings have the physical traits that enable them to use this feeding


### Mamba Implementation

In [21]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from mamba_ssm import Mamba

class MambaBlock(nn.Module):
    def __init__(self, d_model, d_state, d_conv, expand):
        super().__init__()
        self.mamba = Mamba(
            d_model=d_model,
            d_state=d_state,
            d_conv=d_conv,
            expand=expand
        )
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x):
        return self.norm(x + self.mamba(x))

class MambaLM(nn.Module):
    def __init__(self, vocab_size, d_model, n_layer, d_state, d_conv, expand):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.layers = nn.ModuleList([
            MambaBlock(d_model, d_state, d_conv, expand)
            for _ in range(n_layer)
        ])
        self.lm_head = nn.Linear(d_model, vocab_size, bias=False)

    def forward(self, x):
        x = self.embedding(x)
        for layer in self.layers:
            x = layer(x)
        return self.lm_head(x)

def create_mamba_model(vocab_size, d_model=256, n_layer=4, d_state=16, d_conv=4, expand=2):
    return MambaLM(vocab_size, d_model, n_layer, d_state, d_conv, expand)

### Training

In [27]:
import wandb
wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /teamspace/studios/this_studio/.netrc


True

In [45]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import wandb
import math
# from data_preparation import load_and_preprocess_data, create_dataloaders
# from mamba_model import create_mamba_model

def compute_metrics(logits, targets):
    loss = nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
    perplexity = math.exp(loss.item())
    predictions = logits.argmax(dim=-1)
    accuracy = (predictions == targets).float().mean().item()
    return {
        'loss': loss,  # Return the tensor, not the item
        'loss_value': loss.item(),  # Add this for logging
        'perplexity': perplexity,
        'accuracy': accuracy
    }

def train(model, train_dataloader, val_dataloader, num_epochs, lr, device):
    model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)

    global_step = 0
    for epoch in range(num_epochs):
        model.train()
        epoch_metrics = {'train_loss': 0, 'train_perplexity': 0, 'train_accuracy': 0}
        for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            input_ids = batch['input_ids'].to(device)
            targets = input_ids.clone()
            targets[:, :-1] = input_ids[:, 1:]
            targets[:, -1] = input_ids[:, 0]

            optimizer.zero_grad()
            outputs = model(input_ids)
            metrics = compute_metrics(outputs, targets)
            loss = metrics['loss']  # This is now a tensor
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            global_step += 1

            # Update epoch metrics
            epoch_metrics['train_loss'] += metrics['loss_value']
            epoch_metrics['train_perplexity'] += metrics['perplexity']
            epoch_metrics['train_accuracy'] += metrics['accuracy']

            if global_step % 50 == 0:
                wandb.log({
                    "train_loss": metrics['loss_value'],
                    "train_perplexity": metrics['perplexity'],
                    "train_accuracy": metrics['accuracy']
                }, step=global_step)

            if global_step % 250 == 0:
                val_metrics = evaluate(model, val_dataloader, device)
                wandb.log(val_metrics, step=global_step)
                model.train()  # Switch back to train mode after evaluation

        # Log epoch-level metrics
        epoch_metrics = {k: v / len(train_dataloader) for k, v in epoch_metrics.items()}
        wandb.log(epoch_metrics, step=global_step)
        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {epoch_metrics['train_loss']:.4f}, "
              f"Train Perplexity: {epoch_metrics['train_perplexity']:.4f}, "
              f"Train Accuracy: {epoch_metrics['train_accuracy']:.4f}")

        scheduler.step()

    wandb.finish()
    return model

def evaluate(model, dataloader, device):
    model.eval()
    total_metrics = {'val_loss': 0, 'val_perplexity': 0, 'val_accuracy': 0}
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            targets = input_ids.clone()
            targets[:, :-1] = input_ids[:, 1:]
            targets[:, -1] = input_ids[:, 0]

            outputs = model(input_ids)
            metrics = compute_metrics(outputs, targets)
            total_metrics['val_loss'] += metrics['loss_value']
            total_metrics['val_perplexity'] += metrics['perplexity']
            total_metrics['val_accuracy'] += metrics['accuracy']

    avg_metrics = {k: v / len(dataloader) for k, v in total_metrics.items()}
    print(f"Validation Loss: {avg_metrics['val_loss']:.4f}, "
          f"Validation Perplexity: {avg_metrics['val_perplexity']:.4f}, "
          f"Validation Accuracy: {avg_metrics['val_accuracy']:.4f}")
    return avg_metrics

In [50]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

num_epochs = 100
lr = 1e-4
batch_size = 32
d_model=512
n_layer=8
d_state=64
d_conv=4
expand=2

wandb.init(project="mamba-next-word-prediction", config={
    "d_model": d_model,
    "n_layer": n_layer,
    "d_state": d_state,
    "d_conv": d_conv,
    "expand": expand,
    "learning_rate": lr,
    "epochs": num_epochs,
    "batch_size": batch_size,
})

Using device: cuda


In [51]:
dataset, tokenizer = load_and_preprocess_data()
train_dataloader, val_dataloader, _ = create_dataloaders(dataset, batch_size=batch_size)

In [52]:
vocab_size = len(tokenizer)
model = create_mamba_model(
    vocab_size,
    d_model,
    n_layer,
    d_state,
    d_conv,
    expand
)

print(f"Created Mamba model with parameters:")
print(f"d_model: {d_model}")
print(f"n_layer: {n_layer}")
print(f"d_state: {d_state}")
print(f"d_conv: {d_conv}")
print(f"expand: {expand}")

Created Mamba model with parameters:
d_model: 512
n_layer: 8
d_state: 64
d_conv: 4
expand: 2


In [53]:
trained_model = train(model, train_dataloader, val_dataloader, num_epochs, lr, device)

# Save the trained model
torch.save(trained_model.state_dict(), "mamba_lm.pth")

Epoch 1/100:  93%|█████████▎| 250/270 [00:54<00:16,  1.19it/s]

Validation Loss: 5.5899, Validation Perplexity: 272.8824, Validation Accuracy: 0.2610


Epoch 1/100: 100%|██████████| 270/270 [00:58<00:00,  4.58it/s]


Epoch 1/100, Train Loss: 7.0460, Train Perplexity: 4470.3839, Train Accuracy: 0.1622


Epoch 2/100:  85%|████████▌ | 230/270 [00:51<00:34,  1.15it/s]

Validation Loss: 5.2705, Validation Perplexity: 197.6050, Validation Accuracy: 0.2821


Epoch 2/100: 100%|██████████| 270/270 [01:00<00:00,  4.48it/s]


Epoch 2/100, Train Loss: 5.8220, Train Perplexity: 341.4100, Train Accuracy: 0.2263


Epoch 3/100:  78%|███████▊  | 210/270 [00:48<00:52,  1.14it/s]

Validation Loss: 5.1041, Validation Perplexity: 167.0536, Validation Accuracy: 0.2895


Epoch 3/100: 100%|██████████| 270/270 [01:01<00:00,  4.40it/s]


Epoch 3/100, Train Loss: 5.3803, Train Perplexity: 218.2080, Train Accuracy: 0.2567


Epoch 4/100:  70%|███████   | 190/270 [00:44<01:10,  1.13it/s]

Validation Loss: 5.0244, Validation Perplexity: 154.0396, Validation Accuracy: 0.2932


Epoch 4/100: 100%|██████████| 270/270 [01:01<00:00,  4.37it/s]


Epoch 4/100, Train Loss: 5.0263, Train Perplexity: 153.0196, Train Accuracy: 0.2841


Epoch 5/100:  63%|██████▎   | 170/270 [00:39<01:28,  1.14it/s]

Validation Loss: 4.9947, Validation Perplexity: 149.5249, Validation Accuracy: 0.2941


Epoch 5/100: 100%|██████████| 270/270 [01:01<00:00,  4.36it/s]


Epoch 5/100, Train Loss: 4.7232, Train Perplexity: 112.8929, Train Accuracy: 0.3118


Epoch 6/100:  56%|█████▌    | 150/270 [00:35<01:46,  1.13it/s]

Validation Loss: 5.0224, Validation Perplexity: 153.8260, Validation Accuracy: 0.2908


Epoch 6/100: 100%|██████████| 270/270 [01:02<00:00,  4.35it/s]


Epoch 6/100, Train Loss: 4.4420, Train Perplexity: 85.2747, Train Accuracy: 0.3411


Epoch 7/100:  48%|████▊     | 130/270 [00:31<02:04,  1.12it/s]

Validation Loss: 5.0808, Validation Perplexity: 163.1390, Validation Accuracy: 0.2872


Epoch 7/100: 100%|██████████| 270/270 [01:02<00:00,  4.34it/s]


Epoch 7/100, Train Loss: 4.1695, Train Perplexity: 64.8800, Train Accuracy: 0.3726


Epoch 8/100:  41%|████      | 110/270 [00:26<02:22,  1.13it/s]

Validation Loss: 5.1669, Validation Perplexity: 178.0275, Validation Accuracy: 0.2831


Epoch 8/100: 100%|██████████| 270/270 [01:02<00:00,  4.34it/s]


Epoch 8/100, Train Loss: 3.9057, Train Perplexity: 49.8433, Train Accuracy: 0.4059


Epoch 9/100:  33%|███▎      | 90/270 [00:22<02:40,  1.12it/s]

Validation Loss: 5.2955, Validation Perplexity: 202.6217, Validation Accuracy: 0.2748


Epoch 9/100: 100%|██████████| 270/270 [01:02<00:00,  4.34it/s]


Epoch 9/100, Train Loss: 3.6476, Train Perplexity: 38.5139, Train Accuracy: 0.4391


Epoch 10/100:  26%|██▌       | 70/270 [00:17<02:58,  1.12it/s]

Validation Loss: 5.4369, Validation Perplexity: 233.7061, Validation Accuracy: 0.2708


Epoch 10/100: 100%|██████████| 270/270 [01:02<00:00,  4.34it/s]


Epoch 10/100, Train Loss: 3.3992, Train Perplexity: 30.0428, Train Accuracy: 0.4715


Epoch 11/100:  19%|█▊        | 50/270 [00:13<03:15,  1.12it/s]

Validation Loss: 5.5934, Validation Perplexity: 273.4164, Validation Accuracy: 0.2637


Epoch 11/100: 100%|██████████| 270/270 [01:02<00:00,  4.34it/s]


Epoch 11/100, Train Loss: 3.1579, Train Perplexity: 23.5806, Train Accuracy: 0.5021


Epoch 12/100:  11%|█         | 30/270 [00:08<03:33,  1.13it/s]

Validation Loss: 5.7546, Validation Perplexity: 321.9252, Validation Accuracy: 0.2601


Epoch 12/100: 100%|██████████| 270/270 [01:02<00:00,  4.34it/s]


Epoch 12/100, Train Loss: 2.9255, Train Perplexity: 18.6919, Train Accuracy: 0.5318


Epoch 13/100:   4%|▎         | 10/270 [00:04<03:55,  1.10it/s]

Validation Loss: 5.9191, Validation Perplexity: 379.8596, Validation Accuracy: 0.2557


Epoch 13/100:  96%|█████████▋| 260/270 [01:02<00:08,  1.12it/s]

Validation Loss: 6.0376, Validation Perplexity: 428.2708, Validation Accuracy: 0.2527


Epoch 13/100: 100%|██████████| 270/270 [01:04<00:00,  4.20it/s]


Epoch 13/100, Train Loss: 2.7012, Train Perplexity: 14.9389, Train Accuracy: 0.5614


Epoch 14/100:  89%|████████▉ | 240/270 [00:55<00:26,  1.13it/s]

Validation Loss: 6.2348, Validation Perplexity: 521.8341, Validation Accuracy: 0.2491


Epoch 14/100: 100%|██████████| 270/270 [01:02<00:00,  4.34it/s]


Epoch 14/100, Train Loss: 2.4828, Train Perplexity: 12.0072, Train Accuracy: 0.5907


Epoch 15/100:  81%|████████▏ | 220/270 [00:51<00:44,  1.12it/s]

Validation Loss: 6.4407, Validation Perplexity: 642.6915, Validation Accuracy: 0.2454


Epoch 15/100: 100%|██████████| 270/270 [01:02<00:00,  4.34it/s]


Epoch 15/100, Train Loss: 2.2718, Train Perplexity: 9.7180, Train Accuracy: 0.6209


Epoch 16/100:  74%|███████▍  | 200/270 [00:46<01:02,  1.12it/s]

Validation Loss: 6.6535, Validation Perplexity: 795.9351, Validation Accuracy: 0.2419


Epoch 16/100: 100%|██████████| 270/270 [01:02<00:00,  4.32it/s]


Epoch 16/100, Train Loss: 2.0682, Train Perplexity: 7.9273, Train Accuracy: 0.6519


Epoch 17/100:  67%|██████▋   | 180/270 [00:42<01:20,  1.12it/s]

Validation Loss: 6.8666, Validation Perplexity: 985.9895, Validation Accuracy: 0.2366


Epoch 17/100: 100%|██████████| 270/270 [01:02<00:00,  4.32it/s]


Epoch 17/100, Train Loss: 1.8713, Train Perplexity: 6.5099, Train Accuracy: 0.6837


Epoch 18/100:  59%|█████▉    | 160/270 [00:38<01:38,  1.12it/s]

Validation Loss: 7.0425, Validation Perplexity: 1174.4399, Validation Accuracy: 0.2345


Epoch 18/100: 100%|██████████| 270/270 [01:02<00:00,  4.31it/s]


Epoch 18/100, Train Loss: 1.6792, Train Perplexity: 5.3711, Train Accuracy: 0.7176


Epoch 19/100:  52%|█████▏    | 140/270 [00:33<01:56,  1.12it/s]

Validation Loss: 7.2643, Validation Perplexity: 1467.9924, Validation Accuracy: 0.2338


Epoch 19/100: 100%|██████████| 270/270 [01:02<00:00,  4.31it/s]


Epoch 19/100, Train Loss: 1.4950, Train Perplexity: 4.4678, Train Accuracy: 0.7515


Epoch 20/100:  44%|████▍     | 120/270 [00:29<02:14,  1.12it/s]

Validation Loss: 7.4575, Validation Perplexity: 1787.4550, Validation Accuracy: 0.2311


Epoch 20/100: 100%|██████████| 270/270 [01:02<00:00,  4.31it/s]


Epoch 20/100, Train Loss: 1.3178, Train Perplexity: 3.7412, Train Accuracy: 0.7860


Epoch 21/100:  37%|███▋      | 100/270 [00:24<02:31,  1.12it/s]

Validation Loss: 7.6453, Validation Perplexity: 2160.0187, Validation Accuracy: 0.2286


Epoch 21/100: 100%|██████████| 270/270 [01:02<00:00,  4.32it/s]


Epoch 21/100, Train Loss: 1.1499, Train Perplexity: 3.1623, Train Accuracy: 0.8201


Epoch 22/100:  30%|██▉       | 80/270 [00:20<02:49,  1.12it/s]

Validation Loss: 7.8433, Validation Perplexity: 2636.1132, Validation Accuracy: 0.2277


Epoch 22/100: 100%|██████████| 270/270 [01:02<00:00,  4.33it/s]


Epoch 22/100, Train Loss: 0.9926, Train Perplexity: 2.7020, Train Accuracy: 0.8520


Epoch 23/100:  22%|██▏       | 60/270 [00:15<03:07,  1.12it/s]

Validation Loss: 8.0523, Validation Perplexity: 3254.3222, Validation Accuracy: 0.2258


Epoch 23/100: 100%|██████████| 270/270 [01:02<00:00,  4.33it/s]


Epoch 23/100, Train Loss: 0.8481, Train Perplexity: 2.3378, Train Accuracy: 0.8815


Epoch 24/100:  15%|█▍        | 40/270 [00:11<03:24,  1.13it/s]

Validation Loss: 8.2427, Validation Perplexity: 3941.8430, Validation Accuracy: 0.2247


Epoch 24/100: 100%|██████████| 270/270 [01:02<00:00,  4.32it/s]


Epoch 24/100, Train Loss: 0.7163, Train Perplexity: 2.0487, Train Accuracy: 0.9073


Epoch 25/100:   7%|▋         | 20/270 [00:06<03:45,  1.11it/s]

Validation Loss: 8.4134, Validation Perplexity: 4677.5918, Validation Accuracy: 0.2227


Epoch 25/100: 100%|██████████| 270/270 [01:03<00:00,  4.22it/s]


Validation Loss: 8.5586, Validation Perplexity: 5412.2009, Validation Accuracy: 0.2227
Epoch 25/100, Train Loss: 0.5988, Train Perplexity: 1.8213, Train Accuracy: 0.9290


Epoch 26/100:  93%|█████████▎| 250/270 [00:57<00:18,  1.11it/s]

Validation Loss: 8.7508, Validation Perplexity: 6593.1623, Validation Accuracy: 0.2228


Epoch 26/100: 100%|██████████| 270/270 [01:02<00:00,  4.34it/s]


Epoch 26/100, Train Loss: 0.4962, Train Perplexity: 1.6437, Train Accuracy: 0.9465


Epoch 27/100:  85%|████████▌ | 230/270 [00:53<00:35,  1.12it/s]

Validation Loss: 8.8977, Validation Perplexity: 7633.7242, Validation Accuracy: 0.2201


Epoch 27/100: 100%|██████████| 270/270 [01:02<00:00,  4.31it/s]


Epoch 27/100, Train Loss: 0.4078, Train Perplexity: 1.5043, Train Accuracy: 0.9598


Epoch 28/100:  78%|███████▊  | 210/270 [00:49<00:53,  1.13it/s]

Validation Loss: 9.0597, Validation Perplexity: 8993.4580, Validation Accuracy: 0.2226


Epoch 28/100: 100%|██████████| 270/270 [01:02<00:00,  4.32it/s]


Epoch 28/100, Train Loss: 0.3367, Train Perplexity: 1.4008, Train Accuracy: 0.9689


Epoch 29/100:  70%|███████   | 190/270 [00:45<01:12,  1.11it/s]

Validation Loss: 9.2377, Validation Perplexity: 10741.2312, Validation Accuracy: 0.2202


Epoch 29/100: 100%|██████████| 270/270 [01:02<00:00,  4.30it/s]


Epoch 29/100, Train Loss: 0.2802, Train Perplexity: 1.3239, Train Accuracy: 0.9750


Epoch 30/100:  63%|██████▎   | 170/270 [00:40<01:29,  1.12it/s]

Validation Loss: 9.3559, Validation Perplexity: 12122.3842, Validation Accuracy: 0.2206


Epoch 30/100: 100%|██████████| 270/270 [01:02<00:00,  4.32it/s]


Epoch 30/100, Train Loss: 0.2355, Train Perplexity: 1.2658, Train Accuracy: 0.9789


Epoch 31/100:  56%|█████▌    | 150/270 [00:35<01:46,  1.12it/s]

Validation Loss: 9.4557, Validation Perplexity: 13428.1301, Validation Accuracy: 0.2198


Epoch 31/100: 100%|██████████| 270/270 [01:02<00:00,  4.33it/s]


Epoch 31/100, Train Loss: 0.2025, Train Perplexity: 1.2246, Train Accuracy: 0.9813


Epoch 32/100:  48%|████▊     | 130/270 [00:31<02:03,  1.14it/s]

Validation Loss: 9.5750, Validation Perplexity: 15143.0144, Validation Accuracy: 0.2197


Epoch 32/100: 100%|██████████| 270/270 [01:01<00:00,  4.36it/s]


Epoch 32/100, Train Loss: 0.1783, Train Perplexity: 1.1954, Train Accuracy: 0.9824


Epoch 33/100:  41%|████      | 110/270 [00:26<02:20,  1.14it/s]

Validation Loss: 9.7341, Validation Perplexity: 17768.0918, Validation Accuracy: 0.2194


Epoch 33/100: 100%|██████████| 270/270 [01:01<00:00,  4.37it/s]


Epoch 33/100, Train Loss: 0.1585, Train Perplexity: 1.1719, Train Accuracy: 0.9834


Epoch 34/100:  33%|███▎      | 90/270 [00:22<02:42,  1.11it/s]

Validation Loss: 9.8036, Validation Perplexity: 19044.4191, Validation Accuracy: 0.2214


Epoch 34/100: 100%|██████████| 270/270 [01:02<00:00,  4.29it/s]


Epoch 34/100, Train Loss: 0.1427, Train Perplexity: 1.1535, Train Accuracy: 0.9842


Epoch 35/100:  26%|██▌       | 70/270 [00:17<02:58,  1.12it/s]

Validation Loss: 9.9029, Validation Perplexity: 21040.6980, Validation Accuracy: 0.2195


Epoch 35/100: 100%|██████████| 270/270 [01:02<00:00,  4.34it/s]


Epoch 35/100, Train Loss: 0.1308, Train Perplexity: 1.1398, Train Accuracy: 0.9845


Epoch 36/100:  19%|█▊        | 50/270 [00:13<03:17,  1.11it/s]

Validation Loss: 10.0193, Validation Perplexity: 23655.9087, Validation Accuracy: 0.2200


Epoch 36/100: 100%|██████████| 270/270 [01:02<00:00,  4.32it/s]


Epoch 36/100, Train Loss: 0.1208, Train Perplexity: 1.1285, Train Accuracy: 0.9852


Epoch 37/100:  11%|█         | 30/270 [00:08<03:35,  1.11it/s]

Validation Loss: 10.1107, Validation Perplexity: 25966.7595, Validation Accuracy: 0.2201


Epoch 37/100: 100%|██████████| 270/270 [01:00<00:00,  4.43it/s]


Epoch 37/100, Train Loss: 0.1122, Train Perplexity: 1.1188, Train Accuracy: 0.9854


Epoch 38/100:   4%|▎         | 10/270 [00:04<03:44,  1.16it/s]

Validation Loss: 10.1345, Validation Perplexity: 26556.9971, Validation Accuracy: 0.2203


Epoch 38/100:  96%|█████████▋| 260/270 [00:59<00:08,  1.18it/s]

Validation Loss: 10.2323, Validation Perplexity: 29321.7652, Validation Accuracy: 0.2203


Epoch 38/100: 100%|██████████| 270/270 [01:01<00:00,  4.37it/s]


Epoch 38/100, Train Loss: 0.1049, Train Perplexity: 1.1107, Train Accuracy: 0.9858


Epoch 39/100:  89%|████████▉ | 240/270 [00:54<00:26,  1.15it/s]

Validation Loss: 10.2964, Validation Perplexity: 31305.6758, Validation Accuracy: 0.2205


Epoch 39/100: 100%|██████████| 270/270 [01:00<00:00,  4.47it/s]


Epoch 39/100, Train Loss: 0.0992, Train Perplexity: 1.1043, Train Accuracy: 0.9860


Epoch 40/100:  81%|████████▏ | 220/270 [00:50<00:44,  1.11it/s]

Validation Loss: 10.3851, Validation Perplexity: 34281.5731, Validation Accuracy: 0.2206


Epoch 40/100: 100%|██████████| 270/270 [01:02<00:00,  4.35it/s]


Epoch 40/100, Train Loss: 0.0936, Train Perplexity: 1.0982, Train Accuracy: 0.9863


Epoch 41/100:  74%|███████▍  | 200/270 [00:47<01:03,  1.10it/s]

Validation Loss: 10.4322, Validation Perplexity: 35923.4659, Validation Accuracy: 0.2211


Epoch 41/100: 100%|██████████| 270/270 [01:03<00:00,  4.28it/s]


Epoch 41/100, Train Loss: 0.0889, Train Perplexity: 1.0930, Train Accuracy: 0.9866


Epoch 42/100:  67%|██████▋   | 180/270 [00:42<01:19,  1.14it/s]

Validation Loss: 10.5255, Validation Perplexity: 39549.4711, Validation Accuracy: 0.2220


Epoch 42/100: 100%|██████████| 270/270 [01:01<00:00,  4.38it/s]


Epoch 42/100, Train Loss: 0.0846, Train Perplexity: 1.0883, Train Accuracy: 0.9868


Epoch 43/100:  59%|█████▉    | 160/270 [00:37<01:38,  1.12it/s]

Validation Loss: 10.5686, Validation Perplexity: 41332.8713, Validation Accuracy: 0.2200


Epoch 43/100: 100%|██████████| 270/270 [01:02<00:00,  4.35it/s]


Epoch 43/100, Train Loss: 0.0803, Train Perplexity: 1.0836, Train Accuracy: 0.9872


Epoch 44/100:  52%|█████▏    | 140/270 [00:33<01:54,  1.14it/s]

Validation Loss: 10.6672, Validation Perplexity: 45657.7532, Validation Accuracy: 0.2206


Epoch 44/100: 100%|██████████| 270/270 [01:02<00:00,  4.34it/s]


Epoch 44/100, Train Loss: 0.0775, Train Perplexity: 1.0806, Train Accuracy: 0.9874


Epoch 45/100:  44%|████▍     | 120/270 [00:28<02:13,  1.12it/s]

Validation Loss: 10.7298, Validation Perplexity: 48554.3280, Validation Accuracy: 0.2213


Epoch 45/100: 100%|██████████| 270/270 [01:02<00:00,  4.31it/s]


Epoch 45/100, Train Loss: 0.0740, Train Perplexity: 1.0768, Train Accuracy: 0.9876


Epoch 46/100:  37%|███▋      | 100/270 [00:24<02:28,  1.15it/s]

Validation Loss: 10.7314, Validation Perplexity: 48758.8404, Validation Accuracy: 0.2204


Epoch 46/100: 100%|██████████| 270/270 [01:01<00:00,  4.41it/s]


Epoch 46/100, Train Loss: 0.0718, Train Perplexity: 1.0744, Train Accuracy: 0.9877


Epoch 47/100:  30%|██▉       | 80/270 [00:19<02:45,  1.15it/s]

Validation Loss: 10.8214, Validation Perplexity: 53280.4830, Validation Accuracy: 0.2202


Epoch 47/100: 100%|██████████| 270/270 [01:01<00:00,  4.38it/s]


Epoch 47/100, Train Loss: 0.0688, Train Perplexity: 1.0712, Train Accuracy: 0.9880


Epoch 48/100:  22%|██▏       | 60/270 [00:15<03:09,  1.11it/s]

Validation Loss: 10.8960, Validation Perplexity: 57327.1424, Validation Accuracy: 0.2203


Epoch 48/100: 100%|██████████| 270/270 [01:02<00:00,  4.32it/s]


Epoch 48/100, Train Loss: 0.0667, Train Perplexity: 1.0690, Train Accuracy: 0.9880


Epoch 49/100:  15%|█▍        | 40/270 [00:11<03:26,  1.11it/s]

Validation Loss: 10.9521, Validation Perplexity: 60636.7381, Validation Accuracy: 0.2197


Epoch 49/100: 100%|██████████| 270/270 [01:02<00:00,  4.30it/s]


Epoch 49/100, Train Loss: 0.0641, Train Perplexity: 1.0662, Train Accuracy: 0.9883


Epoch 50/100:   7%|▋         | 20/270 [00:06<03:44,  1.12it/s]

Validation Loss: 11.0153, Validation Perplexity: 64546.0574, Validation Accuracy: 0.2217


Epoch 50/100: 100%|██████████| 270/270 [01:05<00:00,  4.15it/s]


Validation Loss: 11.0686, Validation Perplexity: 68265.1588, Validation Accuracy: 0.2220
Epoch 50/100, Train Loss: 0.0620, Train Perplexity: 1.0640, Train Accuracy: 0.9884


Epoch 51/100:  93%|█████████▎| 250/270 [00:58<00:17,  1.12it/s]

Validation Loss: 11.0928, Validation Perplexity: 69963.8522, Validation Accuracy: 0.2200


Epoch 51/100: 100%|██████████| 270/270 [01:02<00:00,  4.33it/s]


Epoch 51/100, Train Loss: 0.0604, Train Perplexity: 1.0623, Train Accuracy: 0.9885


Epoch 52/100:  85%|████████▌ | 230/270 [00:53<00:35,  1.14it/s]

Validation Loss: 11.1163, Validation Perplexity: 71815.4840, Validation Accuracy: 0.2220


Epoch 52/100: 100%|██████████| 270/270 [01:02<00:00,  4.33it/s]


Epoch 52/100, Train Loss: 0.0583, Train Perplexity: 1.0601, Train Accuracy: 0.9887


Epoch 53/100:  78%|███████▊  | 210/270 [00:48<00:53,  1.13it/s]

Validation Loss: 11.1859, Validation Perplexity: 77258.9316, Validation Accuracy: 0.2229


Epoch 53/100: 100%|██████████| 270/270 [01:01<00:00,  4.38it/s]


Epoch 53/100, Train Loss: 0.0567, Train Perplexity: 1.0584, Train Accuracy: 0.9888


Epoch 54/100:  70%|███████   | 190/270 [00:44<01:11,  1.12it/s]

Validation Loss: 11.2170, Validation Perplexity: 79552.9142, Validation Accuracy: 0.2202


Epoch 54/100: 100%|██████████| 270/270 [01:02<00:00,  4.31it/s]


Epoch 54/100, Train Loss: 0.0549, Train Perplexity: 1.0565, Train Accuracy: 0.9889


Epoch 55/100:  63%|██████▎   | 170/270 [00:40<01:29,  1.11it/s]

Validation Loss: 11.2821, Validation Perplexity: 84854.5631, Validation Accuracy: 0.2216


Epoch 55/100: 100%|██████████| 270/270 [01:02<00:00,  4.29it/s]


Epoch 55/100, Train Loss: 0.0536, Train Perplexity: 1.0551, Train Accuracy: 0.9890


Epoch 56/100:  56%|█████▌    | 150/270 [00:35<01:46,  1.13it/s]

Validation Loss: 11.3157, Validation Perplexity: 87854.8588, Validation Accuracy: 0.2205


Epoch 56/100: 100%|██████████| 270/270 [01:02<00:00,  4.35it/s]


Epoch 56/100, Train Loss: 0.0523, Train Perplexity: 1.0537, Train Accuracy: 0.9890


Epoch 57/100:  48%|████▊     | 130/270 [00:31<02:07,  1.10it/s]

Validation Loss: 11.3273, Validation Perplexity: 88856.2155, Validation Accuracy: 0.2215


Epoch 57/100: 100%|██████████| 270/270 [01:02<00:00,  4.33it/s]


Epoch 57/100, Train Loss: 0.0499, Train Perplexity: 1.0512, Train Accuracy: 0.9893


Epoch 58/100:  41%|████      | 110/270 [00:26<02:17,  1.16it/s]

Validation Loss: 11.3797, Validation Perplexity: 93613.8868, Validation Accuracy: 0.2209


Epoch 58/100: 100%|██████████| 270/270 [01:00<00:00,  4.45it/s]


Epoch 58/100, Train Loss: 0.0483, Train Perplexity: 1.0495, Train Accuracy: 0.9894


Epoch 59/100:  33%|███▎      | 90/270 [00:21<02:37,  1.15it/s]

Validation Loss: 11.4104, Validation Perplexity: 96947.4809, Validation Accuracy: 0.2206


Epoch 59/100: 100%|██████████| 270/270 [01:01<00:00,  4.38it/s]


Epoch 59/100, Train Loss: 0.0472, Train Perplexity: 1.0484, Train Accuracy: 0.9895


Epoch 60/100:  26%|██▌       | 70/270 [00:18<03:01,  1.10it/s]

Validation Loss: 11.4558, Validation Perplexity: 101455.3651, Validation Accuracy: 0.2205


Epoch 60/100: 100%|██████████| 270/270 [01:02<00:00,  4.34it/s]


Epoch 60/100, Train Loss: 0.0461, Train Perplexity: 1.0472, Train Accuracy: 0.9896


Epoch 61/100:  19%|█▊        | 50/270 [00:13<03:12,  1.14it/s]

Validation Loss: 11.4645, Validation Perplexity: 102545.8758, Validation Accuracy: 0.2215


Epoch 61/100: 100%|██████████| 270/270 [01:01<00:00,  4.38it/s]


Epoch 61/100, Train Loss: 0.0454, Train Perplexity: 1.0464, Train Accuracy: 0.9896


Epoch 62/100:  11%|█         | 30/270 [00:08<03:33,  1.12it/s]

Validation Loss: 11.4887, Validation Perplexity: 104678.6894, Validation Accuracy: 0.2197


Epoch 62/100: 100%|██████████| 270/270 [01:02<00:00,  4.33it/s]


Epoch 62/100, Train Loss: 0.0442, Train Perplexity: 1.0452, Train Accuracy: 0.9897


Epoch 63/100:   4%|▎         | 10/270 [00:04<03:53,  1.11it/s]

Validation Loss: 11.5016, Validation Perplexity: 106333.3267, Validation Accuracy: 0.2208


Epoch 63/100:  96%|█████████▋| 260/270 [01:01<00:08,  1.12it/s]

Validation Loss: 11.5266, Validation Perplexity: 108746.0747, Validation Accuracy: 0.2216


Epoch 63/100: 100%|██████████| 270/270 [01:03<00:00,  4.24it/s]


Epoch 63/100, Train Loss: 0.0436, Train Perplexity: 1.0446, Train Accuracy: 0.9897


Epoch 64/100:  89%|████████▉ | 240/270 [00:56<00:26,  1.12it/s]

Validation Loss: 11.5202, Validation Perplexity: 108440.8836, Validation Accuracy: 0.2218


Epoch 64/100: 100%|██████████| 270/270 [01:02<00:00,  4.30it/s]


Epoch 64/100, Train Loss: 0.0430, Train Perplexity: 1.0440, Train Accuracy: 0.9897


Epoch 65/100:  81%|████████▏ | 220/270 [00:51<00:44,  1.11it/s]

Validation Loss: 11.5428, Validation Perplexity: 110595.3688, Validation Accuracy: 0.2211


Epoch 65/100: 100%|██████████| 270/270 [01:02<00:00,  4.32it/s]


Epoch 65/100, Train Loss: 0.0427, Train Perplexity: 1.0436, Train Accuracy: 0.9898


Epoch 66/100:  74%|███████▍  | 200/270 [00:46<01:00,  1.16it/s]

Validation Loss: 11.5834, Validation Perplexity: 115341.3548, Validation Accuracy: 0.2214


Epoch 66/100: 100%|██████████| 270/270 [01:01<00:00,  4.39it/s]


Epoch 66/100, Train Loss: 0.0425, Train Perplexity: 1.0434, Train Accuracy: 0.9898


Epoch 67/100:  67%|██████▋   | 180/270 [00:41<01:17,  1.16it/s]

Validation Loss: 11.5737, Validation Perplexity: 113977.1420, Validation Accuracy: 0.2203


Epoch 67/100: 100%|██████████| 270/270 [01:00<00:00,  4.45it/s]


Epoch 67/100, Train Loss: 0.0429, Train Perplexity: 1.0438, Train Accuracy: 0.9897


Epoch 68/100:  59%|█████▉    | 160/270 [00:37<01:38,  1.12it/s]

Validation Loss: 11.5853, Validation Perplexity: 115613.4561, Validation Accuracy: 0.2206


Epoch 68/100: 100%|██████████| 270/270 [01:02<00:00,  4.35it/s]


Epoch 68/100, Train Loss: 0.0424, Train Perplexity: 1.0434, Train Accuracy: 0.9897


Epoch 69/100:  52%|█████▏    | 140/270 [00:33<01:55,  1.13it/s]

Validation Loss: 11.5849, Validation Perplexity: 115595.6376, Validation Accuracy: 0.2204


Epoch 69/100: 100%|██████████| 270/270 [01:01<00:00,  4.36it/s]


Epoch 69/100, Train Loss: 0.0417, Train Perplexity: 1.0426, Train Accuracy: 0.9898


Epoch 70/100:  44%|████▍     | 120/270 [00:29<02:15,  1.11it/s]

Validation Loss: 11.6062, Validation Perplexity: 117872.1869, Validation Accuracy: 0.2206


Epoch 70/100: 100%|██████████| 270/270 [01:02<00:00,  4.30it/s]


Epoch 70/100, Train Loss: 0.0412, Train Perplexity: 1.0421, Train Accuracy: 0.9898


Epoch 71/100:  37%|███▋      | 100/270 [00:24<02:31,  1.12it/s]

Validation Loss: 11.6170, Validation Perplexity: 119366.2002, Validation Accuracy: 0.2208


Epoch 71/100: 100%|██████████| 270/270 [01:02<00:00,  4.32it/s]


Epoch 71/100, Train Loss: 0.0408, Train Perplexity: 1.0416, Train Accuracy: 0.9899


Epoch 72/100:  30%|██▉       | 80/270 [00:20<02:51,  1.11it/s]

Validation Loss: 11.6564, Validation Perplexity: 124112.9136, Validation Accuracy: 0.2214


Epoch 72/100: 100%|██████████| 270/270 [01:02<00:00,  4.30it/s]


Epoch 72/100, Train Loss: 0.0399, Train Perplexity: 1.0407, Train Accuracy: 0.9899


Epoch 73/100:  22%|██▏       | 60/270 [00:15<03:09,  1.11it/s]

Validation Loss: 11.6691, Validation Perplexity: 125841.5935, Validation Accuracy: 0.2207


Epoch 73/100: 100%|██████████| 270/270 [01:02<00:00,  4.35it/s]


Epoch 73/100, Train Loss: 0.0393, Train Perplexity: 1.0401, Train Accuracy: 0.9899


Epoch 74/100:  15%|█▍        | 40/270 [00:10<03:19,  1.15it/s]

Validation Loss: 11.6965, Validation Perplexity: 129262.0700, Validation Accuracy: 0.2213


Epoch 74/100: 100%|██████████| 270/270 [01:01<00:00,  4.42it/s]


Epoch 74/100, Train Loss: 0.0390, Train Perplexity: 1.0398, Train Accuracy: 0.9900


Epoch 75/100:   7%|▋         | 20/270 [00:06<03:39,  1.14it/s]

Validation Loss: 11.7056, Validation Perplexity: 130360.6576, Validation Accuracy: 0.2211


Epoch 75/100: 100%|██████████| 270/270 [01:04<00:00,  4.18it/s]


Validation Loss: 11.7121, Validation Perplexity: 131298.3748, Validation Accuracy: 0.2223
Epoch 75/100, Train Loss: 0.0387, Train Perplexity: 1.0395, Train Accuracy: 0.9900


Epoch 76/100:  93%|█████████▎| 250/270 [00:58<00:18,  1.11it/s]

Validation Loss: 11.7192, Validation Perplexity: 132264.1641, Validation Accuracy: 0.2209


Epoch 76/100: 100%|██████████| 270/270 [01:02<00:00,  4.33it/s]


Epoch 76/100, Train Loss: 0.0385, Train Perplexity: 1.0392, Train Accuracy: 0.9900


Epoch 77/100:  85%|████████▌ | 230/270 [00:54<00:36,  1.11it/s]

Validation Loss: 11.7326, Validation Perplexity: 134189.5801, Validation Accuracy: 0.2217


Epoch 77/100: 100%|██████████| 270/270 [01:03<00:00,  4.28it/s]


Epoch 77/100, Train Loss: 0.0382, Train Perplexity: 1.0389, Train Accuracy: 0.9900


Epoch 78/100:  78%|███████▊  | 210/270 [00:48<00:52,  1.14it/s]

Validation Loss: 11.7431, Validation Perplexity: 135488.3504, Validation Accuracy: 0.2215


Epoch 78/100: 100%|██████████| 270/270 [01:01<00:00,  4.37it/s]


Epoch 78/100, Train Loss: 0.0379, Train Perplexity: 1.0386, Train Accuracy: 0.9901


Epoch 79/100:  70%|███████   | 190/270 [00:44<01:11,  1.12it/s]

Validation Loss: 11.7436, Validation Perplexity: 135650.0601, Validation Accuracy: 0.2216


Epoch 79/100: 100%|██████████| 270/270 [01:01<00:00,  4.36it/s]


Epoch 79/100, Train Loss: 0.0377, Train Perplexity: 1.0384, Train Accuracy: 0.9901


Epoch 80/100:  63%|██████▎   | 170/270 [00:40<01:29,  1.12it/s]

Validation Loss: 11.7765, Validation Perplexity: 140148.9638, Validation Accuracy: 0.2213


Epoch 80/100: 100%|██████████| 270/270 [01:02<00:00,  4.30it/s]


Epoch 80/100, Train Loss: 0.0373, Train Perplexity: 1.0380, Train Accuracy: 0.9901


Epoch 81/100:  56%|█████▌    | 150/270 [00:35<01:46,  1.12it/s]

Validation Loss: 11.7913, Validation Perplexity: 142153.8237, Validation Accuracy: 0.2211


Epoch 81/100: 100%|██████████| 270/270 [01:02<00:00,  4.32it/s]


Epoch 81/100, Train Loss: 0.0371, Train Perplexity: 1.0378, Train Accuracy: 0.9901


Epoch 82/100:  48%|████▊     | 130/270 [00:31<02:07,  1.10it/s]

Validation Loss: 11.7957, Validation Perplexity: 142903.3324, Validation Accuracy: 0.2213


Epoch 82/100: 100%|██████████| 270/270 [01:02<00:00,  4.29it/s]


Epoch 82/100, Train Loss: 0.0368, Train Perplexity: 1.0375, Train Accuracy: 0.9902


Epoch 83/100:  41%|████      | 110/270 [00:26<02:24,  1.11it/s]

Validation Loss: 11.8040, Validation Perplexity: 144040.6863, Validation Accuracy: 0.2213


Epoch 83/100: 100%|██████████| 270/270 [01:02<00:00,  4.29it/s]


Epoch 83/100, Train Loss: 0.0365, Train Perplexity: 1.0372, Train Accuracy: 0.9902


Epoch 84/100:  33%|███▎      | 90/270 [00:22<02:42,  1.11it/s]

Validation Loss: 11.8209, Validation Perplexity: 146559.1499, Validation Accuracy: 0.2212


Epoch 84/100: 100%|██████████| 270/270 [01:03<00:00,  4.28it/s]


Epoch 84/100, Train Loss: 0.0363, Train Perplexity: 1.0369, Train Accuracy: 0.9902


Epoch 85/100:  26%|██▌       | 70/270 [00:17<02:58,  1.12it/s]

Validation Loss: 11.8253, Validation Perplexity: 147160.1859, Validation Accuracy: 0.2214


Epoch 85/100: 100%|██████████| 270/270 [01:02<00:00,  4.34it/s]


Epoch 85/100, Train Loss: 0.0360, Train Perplexity: 1.0367, Train Accuracy: 0.9903


Epoch 86/100:  19%|█▊        | 50/270 [00:13<03:19,  1.10it/s]

Validation Loss: 11.8263, Validation Perplexity: 147502.9613, Validation Accuracy: 0.2211


Epoch 86/100: 100%|██████████| 270/270 [01:02<00:00,  4.29it/s]


Epoch 86/100, Train Loss: 0.0358, Train Perplexity: 1.0364, Train Accuracy: 0.9903


Epoch 87/100:  11%|█         | 30/270 [00:08<03:35,  1.11it/s]

Validation Loss: 11.8354, Validation Perplexity: 148756.7946, Validation Accuracy: 0.2214


Epoch 87/100: 100%|██████████| 270/270 [01:02<00:00,  4.30it/s]


Epoch 87/100, Train Loss: 0.0355, Train Perplexity: 1.0362, Train Accuracy: 0.9903


Epoch 88/100:   4%|▎         | 10/270 [00:04<03:59,  1.08it/s]

Validation Loss: 11.8379, Validation Perplexity: 149025.3862, Validation Accuracy: 0.2212


Epoch 88/100:  96%|█████████▋| 260/270 [01:02<00:08,  1.12it/s]

Validation Loss: 11.8370, Validation Perplexity: 149060.9678, Validation Accuracy: 0.2212


Epoch 88/100: 100%|██████████| 270/270 [01:04<00:00,  4.20it/s]


Epoch 88/100, Train Loss: 0.0353, Train Perplexity: 1.0359, Train Accuracy: 0.9903


Epoch 89/100:  89%|████████▉ | 240/270 [00:56<00:26,  1.11it/s]

Validation Loss: 11.8395, Validation Perplexity: 149265.5824, Validation Accuracy: 0.2212


Epoch 89/100: 100%|██████████| 270/270 [01:02<00:00,  4.31it/s]


Epoch 89/100, Train Loss: 0.0351, Train Perplexity: 1.0357, Train Accuracy: 0.9904


Epoch 90/100:  81%|████████▏ | 220/270 [00:51<00:45,  1.10it/s]

Validation Loss: 11.8547, Validation Perplexity: 151625.0844, Validation Accuracy: 0.2213


Epoch 90/100: 100%|██████████| 270/270 [01:02<00:00,  4.29it/s]


Epoch 90/100, Train Loss: 0.0349, Train Perplexity: 1.0355, Train Accuracy: 0.9904


Epoch 91/100:  74%|███████▍  | 200/270 [00:47<01:03,  1.11it/s]

Validation Loss: 11.8576, Validation Perplexity: 152033.4095, Validation Accuracy: 0.2212


Epoch 91/100: 100%|██████████| 270/270 [01:02<00:00,  4.29it/s]


Epoch 91/100, Train Loss: 0.0347, Train Perplexity: 1.0353, Train Accuracy: 0.9904


Epoch 92/100:  67%|██████▋   | 180/270 [00:42<01:20,  1.12it/s]

Validation Loss: 11.8638, Validation Perplexity: 152991.2512, Validation Accuracy: 0.2212


Epoch 92/100: 100%|██████████| 270/270 [01:02<00:00,  4.32it/s]


Epoch 92/100, Train Loss: 0.0345, Train Perplexity: 1.0352, Train Accuracy: 0.9904


Epoch 93/100:  59%|█████▉    | 160/270 [00:38<01:38,  1.12it/s]

Validation Loss: 11.8716, Validation Perplexity: 154189.1885, Validation Accuracy: 0.2211


Epoch 93/100: 100%|██████████| 270/270 [01:02<00:00,  4.30it/s]


Epoch 93/100, Train Loss: 0.0344, Train Perplexity: 1.0350, Train Accuracy: 0.9904


Epoch 94/100:  52%|█████▏    | 140/270 [00:33<01:55,  1.12it/s]

Validation Loss: 11.8692, Validation Perplexity: 153877.5750, Validation Accuracy: 0.2213


Epoch 94/100: 100%|██████████| 270/270 [01:02<00:00,  4.35it/s]


Epoch 94/100, Train Loss: 0.0342, Train Perplexity: 1.0348, Train Accuracy: 0.9904


Epoch 95/100:  44%|████▍     | 120/270 [00:28<02:13,  1.12it/s]

Validation Loss: 11.8731, Validation Perplexity: 154485.9192, Validation Accuracy: 0.2212


Epoch 95/100: 100%|██████████| 270/270 [01:02<00:00,  4.33it/s]


Epoch 95/100, Train Loss: 0.0341, Train Perplexity: 1.0347, Train Accuracy: 0.9905


Epoch 96/100:  37%|███▋      | 100/270 [00:24<02:33,  1.11it/s]

Validation Loss: 11.8721, Validation Perplexity: 154311.6565, Validation Accuracy: 0.2211


Epoch 96/100: 100%|██████████| 270/270 [01:03<00:00,  4.28it/s]


Epoch 96/100, Train Loss: 0.0340, Train Perplexity: 1.0346, Train Accuracy: 0.9906


Epoch 97/100:  30%|██▉       | 80/270 [00:20<02:51,  1.11it/s]

Validation Loss: 11.8758, Validation Perplexity: 154893.1067, Validation Accuracy: 0.2212


Epoch 97/100: 100%|██████████| 270/270 [01:02<00:00,  4.29it/s]


Epoch 97/100, Train Loss: 0.0339, Train Perplexity: 1.0345, Train Accuracy: 0.9907


Epoch 98/100:  22%|██▏       | 60/270 [00:15<03:07,  1.12it/s]

Validation Loss: 11.8759, Validation Perplexity: 154899.4399, Validation Accuracy: 0.2211


Epoch 98/100: 100%|██████████| 270/270 [01:02<00:00,  4.31it/s]


Epoch 98/100, Train Loss: 0.0339, Train Perplexity: 1.0344, Train Accuracy: 0.9909


Epoch 99/100:  15%|█▍        | 40/270 [00:11<03:27,  1.11it/s]

Validation Loss: 11.8766, Validation Perplexity: 155004.1848, Validation Accuracy: 0.2211


Epoch 99/100: 100%|██████████| 270/270 [01:02<00:00,  4.33it/s]


Epoch 99/100, Train Loss: 0.0338, Train Perplexity: 1.0344, Train Accuracy: 0.9911


Epoch 100/100:   7%|▋         | 20/270 [00:06<03:38,  1.15it/s]

Validation Loss: 11.8771, Validation Perplexity: 155076.7694, Validation Accuracy: 0.2211


Epoch 100/100: 100%|██████████| 270/270 [01:03<00:00,  4.25it/s]

Validation Loss: 11.8772, Validation Perplexity: 155089.2476, Validation Accuracy: 0.2211
Epoch 100/100, Train Loss: 0.0338, Train Perplexity: 1.0344, Train Accuracy: 0.9912





VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train_accuracy,▁▁▂▃▃▄▅▆▆▇▇█████████████████████████████
train_loss,█▇▆▆▅▄▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_perplexity,█▅▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_accuracy,▅██▆▅▄▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,▂▁▁▁▂▂▃▃▄▄▅▅▆▆▆▆▇▇▇▇▇▇▇█████████████████
val_perplexity,▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▃▃▃▄▄▅▅▆▆▆▆▆▇▇▇▇▇███████

0,1
train_accuracy,0.99121
train_loss,0.03378
train_perplexity,1.03436
val_accuracy,0.22109
val_loss,11.87717
val_perplexity,155089.24755
