# Dataset

In [1]:
class Dataset:
    """Base class for batch providers that hand out torch tensors.

    Subclasses supply raw numpy batches through ``get_batch_np``;
    ``get_batch`` converts them to tensors on ``self.device`` and keeps a
    running count of batches served since the last ``start_epoch`` call.
    """

    def __init__(self, device='cpu'):
        self._ind = 0  # number of batches served since start_epoch()
        self.device = device

    def start_epoch(self):
        """Reset the per-epoch batch counter to zero."""
        self._ind = 0

    def get_batch_np(self, batch_size, train):
        """Return an ``(x, y)`` pair of numpy arrays; subclasses must override."""
        raise NotImplementedError

    def get_batch(self, batch_size, train=True):
        """Fetch one batch as tensors on ``self.device``.

        Inputs are cast to ``torch.float32`` and targets to ``torch.long``.
        The batch counter is advanced only after a successful conversion.
        """
        inputs_np, targets_np = self.get_batch_np(batch_size, train=train)
        inputs = torch.from_numpy(inputs_np).to(device=self.device, dtype=torch.float32)
        targets = torch.from_numpy(targets_np).to(device=self.device, dtype=torch.long)
        self._ind += 1
        return inputs, targets

In [2]:
!pip install einops



In [3]:
from einops import rearrange
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as transforms


In [4]:
class CIFAR10GrayDataset(Dataset):
    """CIFAR-10 converted to one-channel images and served in fixed-size batches.

    Both splits are wrapped in shuffling ``DataLoader`` objects that drop the
    last incomplete batch. Data is stored under ``data/cifar`` and downloaded
    on demand. When ``patch_size`` is given, every image is cut into a grid of
    ``patch_size x patch_size`` patches that are flattened into a sequence.
    """

    def __init__(self, batch_size, patch_size=None, data_aug=True, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # The batch size is fixed at construction time so a DataLoader can be used.
        self.batch_size = batch_size
        self.patch_size = patch_size

        def build_transform(augment):
            # Tensor conversion and grayscale first, optional augmentation in
            # the middle, normalisation last.
            steps = [
                transforms.ToTensor(),
                transforms.Grayscale(num_output_channels=1),
            ]
            if augment:
                steps.extend([
                    transforms.RandomCrop(32, padding=4),
                    transforms.RandomHorizontalFlip(),
                    transforms.RandomRotation(20),
                ])
            steps.append(transforms.Normalize((0.5,), (0.5,)))
            return transforms.Compose(steps)

        train_transform = build_transform(data_aug)
        eval_transform = build_transform(False)  # evaluation is never augmented

        train_set = torchvision.datasets.CIFAR10(
            'data/cifar', download=True, train=True, transform=train_transform)
        self.d_train = DataLoader(train_set, batch_size=batch_size, drop_last=True, shuffle=True)

        test_set = torchvision.datasets.CIFAR10(
            'data/cifar', download=True, train=False, transform=eval_transform)
        self.d_test = DataLoader(test_set, batch_size=batch_size, drop_last=True, shuffle=True)

        self.train_enum = enumerate(self.d_train)
        self.test_enum = enumerate(self.d_test)

    def get_batch(self, batch_size=None, train=True):
        """Return the next ``(x, y)`` batch of the requested split on ``self.device``.

        ``batch_size`` is accepted for interface compatibility but is not read;
        the size chosen at construction time applies. When the current pass
        over a split is exhausted, a new pass is started transparently.
        """
        if train:
            enum_attr, loader = 'train_enum', self.d_train
        else:
            enum_attr, loader = 'test_enum', self.d_test

        # A (None, (None, None)) sentinel marks an exhausted iterator.
        _, (x, y) = next(getattr(self, enum_attr), (None, (None, None)))
        if x is None:
            restarted = enumerate(loader)
            setattr(self, enum_attr, restarted)
            _, (x, y) = next(restarted)

        if self.patch_size is not None:
            side = self.patch_size
            x = rearrange(x, 'b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=side, p2=side)

        x, y = x.to(device=self.device), y.to(device=self.device)
        self._ind += 1
        return x, y

# I - FPT - Universal Computation

In [5]:
import torch
import torch.nn as nn

In [6]:
class FPT(nn.Module):
    """GPT-2 backbone wrapped between a trainable input and output network.

    The input network maps raw features (or token ids, when
    ``use_embeddings_for_in`` is set) to the transformer's hidden width, and
    the output network maps hidden states to ``output_dim`` values. Selected
    groups of transformer parameters can be frozen.

    Args:
        input_dim: Feature size expected by the input network, or the
            vocabulary size when ``use_embeddings_for_in`` is true.
        output_dim: Size of the final linear layer's output.
        model_name: One of ``'gpt2'``, ``'gpt2-medium'``, ``'gpt2-large'``,
            ``'gpt2-xl'``.
        pretrained: Load pretrained weights if true; otherwise build a
            randomly initialised model with the same architecture.
        return_last_only: If true, only the trailing position(s) of the
            transformer output are passed to the output network.
        use_embeddings_for_in: Use ``nn.Embedding`` as the input network.
        in_layer_sizes: Optional hidden sizes for an MLP input network.
        out_layer_sizes: Optional hidden sizes for an MLP output network.
        freeze_trans: If true, apply the per-group freeze flags below to the
            transformer; if false, transformer parameters are left untouched.
        freeze_in: Freeze the input network.
        freeze_pos: Freeze positional embeddings (parameters named ``wpe``).
        freeze_ln: Freeze layer norms (parameters named ``ln``).
        freeze_attn: Freeze attention parameters (named ``attn``).
        freeze_ff: Freeze feed-forward parameters (named ``mlp``).
        freeze_out: Freeze the output network.
        dropout: Dropout probability used in the input/output networks.
        orth_gain: Gain for orthogonal init of input-network linear layers;
            ``None`` keeps PyTorch's default weight init.

    Raises:
        NotImplementedError: If ``model_name`` does not contain ``'gpt'``.
    """

    def __init__(
            self,
            input_dim,
            output_dim,
            model_name='gpt2',
            pretrained=False,
            return_last_only=True,
            use_embeddings_for_in=False,
            in_layer_sizes=None,
            out_layer_sizes=None,
            freeze_trans=True,
            freeze_in=False,
            freeze_pos=False,
            freeze_ln=False,
            freeze_attn=True,
            freeze_ff=True,
            freeze_out=False,
            dropout=0.1,
            orth_gain=1.41,
    ):
        super().__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.model_name = model_name
        self.return_last_only = return_last_only
        self.use_embeddings_for_in = use_embeddings_for_in

        self.in_layer_sizes = [] if in_layer_sizes is None else in_layer_sizes
        self.out_layer_sizes = [] if out_layer_sizes is None else out_layer_sizes
        self.dropout = dropout

        if 'gpt' in model_name:
            assert model_name in ['gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl']

            from transformers import GPT2Config, GPT2Model

            if pretrained:
                self.transformer = GPT2Model.from_pretrained(model_name)
            else:
                # Only the architecture description is needed here, so fetch
                # the config alone instead of loading (and then discarding)
                # the full pretrained checkpoint.
                self.transformer = GPT2Model(GPT2Config.from_pretrained(model_name))

            # Hidden width comes from the model config rather than a
            # hard-coded per-name table.
            embedding_size = self.transformer.config.n_embd

        else:
            raise NotImplementedError('model_name not implemented')

        if use_embeddings_for_in:
            self.in_net = nn.Embedding(input_dim, embedding_size)
        else:
            in_layers = []
            last_output_size = input_dim
            for size in self.in_layer_sizes:
                layer = nn.Linear(last_output_size, size)
                if orth_gain is not None:
                    torch.nn.init.orthogonal_(layer.weight, gain=orth_gain)
                layer.bias.data.zero_()

                in_layers.append(layer)
                in_layers.append(nn.ReLU())
                in_layers.append(nn.Dropout(dropout))
                last_output_size = size

            final_linear = nn.Linear(last_output_size, embedding_size)
            if orth_gain is not None:
                torch.nn.init.orthogonal_(final_linear.weight, gain=orth_gain)
            final_linear.bias.data.zero_()

            in_layers.append(final_linear)
            in_layers.append(nn.Dropout(dropout))

            self.in_net = nn.Sequential(*in_layers)

        out_layers = []
        last_output_size = embedding_size
        for size in self.out_layer_sizes:
            out_layers.append(nn.Linear(last_output_size, size))
            out_layers.append(nn.ReLU())
            out_layers.append(nn.Dropout(dropout))
            last_output_size = size
        out_layers.append(nn.Linear(last_output_size, output_dim))
        self.out_net = nn.Sequential(*out_layers)

        if freeze_trans:
            # Parameter groups are identified by substrings of their names;
            # anything that matches no group is always frozen.
            for name, p in self.transformer.named_parameters():
                name = name.lower()
                if 'ln' in name:
                    p.requires_grad = not freeze_ln
                elif 'wpe' in name:
                    p.requires_grad = not freeze_pos
                elif 'mlp' in name:
                    p.requires_grad = not freeze_ff
                elif 'attn' in name:
                    p.requires_grad = not freeze_attn
                else:
                    p.requires_grad = False
        if freeze_in:
            for p in self.in_net.parameters():
                p.requires_grad = False
        if freeze_out:
            for p in self.out_net.parameters():
                p.requires_grad = False

    def forward(self, x, output_attentions=False):
        """Run the input network, transformer, and output network on ``x``.

        If the last dimension of ``x`` is a multiple of ``input_dim`` (and
        embeddings are not used), each position is split into
        ``ratio = x.shape[-1] // input_dim`` consecutive positions before the
        input network; with ``return_last_only`` the last ``ratio`` outputs
        are then folded back into a single position.

        Returns:
            The output tensor, or ``(output, attentions)`` when
            ``output_attentions`` is true.

        Raises:
            ValueError: If the last dimension of ``x`` differs from
                ``input_dim`` and is not divisible by it.
        """

        orig_dim = x.shape[-1]
        if orig_dim != self.input_dim and not self.use_embeddings_for_in:
            if orig_dim % self.input_dim != 0:
                raise ValueError('dimension of x must be divisible by patch size')
            ratio = orig_dim // self.input_dim
            x = x.reshape(x.shape[0], x.shape[1] * ratio, self.input_dim)
        else:
            ratio = 1

        x = self.in_net(x)

        transformer_outputs = self.transformer(
            inputs_embeds=x,
            return_dict=True,
            output_attentions=output_attentions,
        )
        x = transformer_outputs.last_hidden_state

        if self.return_last_only:
            # Keep only the positions that belong to the final original token.
            x = x[:, -ratio:]

        x = self.out_net(x)
        if self.return_last_only and ratio > 1:
            # Merge the `ratio` split positions back into one position.
            x = x.reshape(x.shape[0], x.shape[1] // ratio, ratio * self.output_dim)

        if output_attentions:
            return x, transformer_outputs.attentions
        else:
            return x

# II - Trainer - Universal Computation

In [7]:
import torch
from tqdm import tqdm

import time

In [8]:
class Trainer:
    """Runs alternating training and evaluation passes for a model.

    An Adam optimiser is built over ``model.parameters()``. Results of the
    latest ``train_epoch`` call are stored in ``self.diagnostics``, which
    also keeps a cumulative ``'Gradient Steps'`` counter.
    """

    def __init__(
            self,
            model,
            dataset,
            loss_fn,
            accuracy_fn=None,
            steps_per_epoch=100,
            test_steps_per_epoch=20,
            learning_rate=1e-3,
            batch_size=2,
            eval_batch_size=8,
            grad_accumulate=1,
    ):
        self.model, self.dataset = model, dataset
        self.loss_fn, self.acc_fn = loss_fn, accuracy_fn

        self.steps_per_epoch = steps_per_epoch
        self.test_steps_per_epoch = test_steps_per_epoch

        self.batch_size = batch_size
        self.eval_batch_size = eval_batch_size
        self.grad_accumulate = grad_accumulate

        self.optim = torch.optim.Adam(model.parameters(), lr=learning_rate)
        self.diagnostics = {'Gradient Steps': 0}

    def get_loss(self, x, y, return_acc=False):
        """Compute the loss on ``(x, y)``, optionally together with accuracy.

        The accuracy function receives numpy copies of the model output,
        the targets, and the inputs (as keyword ``x``).

        Raises:
            NotImplementedError: If accuracy is requested but no accuracy
                function was supplied.
        """
        prediction = self.model(x)
        loss = self.loss_fn(prediction, y, x=x)
        if not return_acc:
            return loss

        if self.acc_fn is None:
            raise NotImplementedError('accuracy function not specified')

        def as_numpy(tensor):
            return tensor.detach().cpu().numpy()

        accs = self.acc_fn(as_numpy(prediction), as_numpy(y), x=as_numpy(x))
        return loss, accs

    def train_epoch(self, test_steps=None):
        """Run one training pass followed by one evaluation pass.

        Each optimiser step accumulates gradients over ``grad_accumulate``
        batches and clips the gradient norm to 1. ``test_steps`` overrides
        ``test_steps_per_epoch`` for this call when given.
        """
        self.dataset.start_epoch()

        # ---- training ----
        self.model.train()
        per_step_losses = []
        train_acc_total = 0.
        train_began = time.time()
        for _ in tqdm(range(self.steps_per_epoch)):
            accumulated = 0
            for _ in range(self.grad_accumulate):
                x, y = self.dataset.get_batch(self.batch_size, train=True)
                loss, acc = self.get_loss(x, y, return_acc=True)
                # Scale so the accumulated gradient matches one large batch.
                loss = loss / self.grad_accumulate
                loss.backward()
                accumulated += loss.detach().cpu().item()
                train_acc_total += acc

            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.)
            self.optim.step()
            self.optim.zero_grad()

            self.diagnostics['Gradient Steps'] += 1
            per_step_losses.append(accumulated)
        train_ended = time.time()

        # ---- evaluation ----
        if test_steps is None:
            test_steps = self.test_steps_per_epoch

        self.model.eval()
        eval_loss = 0.
        eval_acc = 0.
        eval_began = time.time()
        with torch.no_grad():
            for _ in range(test_steps):
                x, y = self.dataset.get_batch(self.eval_batch_size, train=False)
                loss, acc = self.get_loss(x, y, return_acc=True)
                eval_loss += loss.detach().cpu().item() / test_steps
                eval_acc += acc / test_steps
        eval_ended = time.time()

        self.diagnostics.update({
            'Average Train Loss': sum(per_step_losses) / self.steps_per_epoch,
            'Start Train Loss': per_step_losses[0],
            'Final Train Loss': per_step_losses[-1],
            'Test Loss': eval_loss,
            'Test Accuracy': eval_acc,
            'Train Accuracy': train_acc_total / (self.steps_per_epoch * self.grad_accumulate),
            'Time Training': train_ended - train_began,
            'Time Testing': eval_ended - eval_began,
        })

# III - Experiment. run_experiment 

In [9]:
!pip install wandb




In [10]:
import numpy as np
import torch
import wandb

import argparse
from datetime import datetime
import random
import sys



In [11]:
def experiment(
        exp_name,
        exp_args,
        **kwargs
):
    """Build the dataset, model and trainer for one task and run training.

    Args:
        exp_name: Prefix used for the run name and the W&B run/group names.
        exp_args: Dict of run-level settings. Keys read here:
            ``gpu_batch_size``, ``device``, ``steps_per_iter``,
            ``test_steps_per_iter``, ``num_iters``, ``log_to_wandb``,
            ``wandb_project``, ``save_models``, ``save_models_every``.
        **kwargs: Task and model hyperparameters. ``task``, ``batch_size``,
            ``patch_size``, ``learning_rate``, ``dropout`` and ``orth_gain``
            are required; ``n`` and ``num_patterns`` are required for the
            bit tasks; the remaining model flags are optional.

    Raises:
        NotImplementedError: If ``task`` is not one of the supported names.
    """

    # --- Preliminary checks ---

    # Must be able to accumulate gradient if batch size is large
    assert 'batch_size' in kwargs
    assert kwargs['batch_size'] <= exp_args['gpu_batch_size'] or \
           kwargs['batch_size'] % exp_args['gpu_batch_size'] == 0

    # --- Create dataset, model, and trainer ---

    task = kwargs['task']
    batch_size = kwargs['batch_size']
    patch_size = kwargs['patch_size']
    device = exp_args['device']

    return_last_only = True

    if task == 'bit-memory':
        from universal_computation.datasets.bit_memory import BitMemoryDataset
        dataset = BitMemoryDataset(n=kwargs['n'], num_patterns=kwargs['num_patterns'], device=device)
        input_dim = kwargs['n'] if patch_size is None else patch_size
        output_dim = 2*kwargs['n'] if patch_size is None else 2 * patch_size
        use_embeddings = False
        experiment_type = 'classification'

    elif task == 'bit-xor':
        from universal_computation.datasets.bit_xor import BitXORDataset
        dataset = BitXORDataset(n=kwargs['n'], num_patterns=kwargs['num_patterns'], device=device)
        input_dim = kwargs['n'] if patch_size is None else patch_size
        output_dim = 2 * kwargs['n'] if patch_size is None else 2 * patch_size
        use_embeddings = False
        experiment_type = 'classification'

    elif task == 'mnist':
        # Imported here like the other package-provided datasets; without the
        # import this branch fails with a NameError.
        from universal_computation.datasets.mnist import MNISTDataset
        dataset = MNISTDataset(batch_size=batch_size, patch_size=patch_size, device=device)
        input_dim, output_dim = patch_size ** 2, 10
        use_embeddings = False
        experiment_type = 'classification'

    elif task == 'cifar10':
        # Imported here like the other package-provided datasets; without the
        # import this branch fails with a NameError.
        from universal_computation.datasets.cifar10 import CIFAR10Dataset
        dataset = CIFAR10Dataset(batch_size=batch_size, patch_size=patch_size, device=device)
        input_dim, output_dim = 3 * patch_size**2, 10
        use_embeddings = False
        experiment_type = 'classification'

    elif task == 'cifar10-gray':
        # CIFAR10GrayDataset is defined earlier in this file, so no import.
        dataset = CIFAR10GrayDataset(batch_size=batch_size, patch_size=patch_size, device=device)
        input_dim, output_dim = patch_size**2, 10
        use_embeddings = False
        experiment_type = 'classification'

    elif task == 'listops':
        from universal_computation.datasets.listops import ListopsDataset
        dataset = ListopsDataset(batch_size=batch_size, device=device)
        input_dim, output_dim = 15, 10
        use_embeddings = True
        experiment_type = 'classification'
    else:
        raise NotImplementedError('dataset not implemented')

    if 'bit' in task:

        ce_loss = torch.nn.CrossEntropyLoss()

        def loss_fn(out, y, x=None):
            # Two logits per bit; targets are mapped to class ids 0/1.
            out = torch.reshape(out, (-1, kwargs['n'], 2))
            ids = torch.zeros(y.shape).to(device=y.device).long()
            if task == 'bit-memory':
                ids[y < 0], ids[y > 0] = 0, 1
            else:
                ids[y < 0.5], ids[y > 0.5] = 0, 1
            out, ids = torch.reshape(out, (-1, 2)), torch.reshape(ids, (-1,))
            return ce_loss(out, ids)

        def accuracy_fn(preds, true, x=None):
            if task == 'bit-memory':
                preds = preds.reshape(-1, kwargs['n'], 2).argmax(-1) * 2 - 1
            else:
                preds = preds.reshape(-1, kwargs['n'], 2).argmax(-1)
            if task == 'bit-memory':
                return (np.sign(preds) == np.sign(true)).mean()
            else:
                return ((preds > 0.5) == (true > 0.5)).mean()

    elif experiment_type == 'classification':

        ce_loss = torch.nn.CrossEntropyLoss()

        def loss_fn(out, y, x=None):
            # Position 0 along dim 1 of the model output carries the logits.
            out = out[:, 0]
            return ce_loss(out, y)

        def accuracy_fn(preds, true, x=None):
            preds = preds[:, 0].argmax(-1)
            return (preds == true).mean()

    else:
        raise NotImplementedError('experiment_type not recognized')

    model = FPT(
        input_dim=input_dim,
        output_dim=output_dim,
        model_name=kwargs.get('model_name', 'gpt2'),
        pretrained=kwargs.get('pretrained', True),
        return_last_only=return_last_only,
        use_embeddings_for_in=use_embeddings,
        in_layer_sizes=kwargs.get('in_layer_sizes', None),
        out_layer_sizes=kwargs.get('out_layer_sizes', None),
        freeze_trans=kwargs.get('freeze_trans', True),
        freeze_in=kwargs.get('freeze_in', False),
        freeze_pos=kwargs.get('freeze_pos', False),
        freeze_ln=kwargs.get('freeze_ln', False),
        freeze_attn=kwargs.get('freeze_attn', True),
        freeze_ff=kwargs.get('freeze_ff', True),
        freeze_out=kwargs.get('freeze_out', False),
        dropout=kwargs['dropout'],
        orth_gain=kwargs['orth_gain'],
    )
    model.to(device)

    # Batches larger than gpu_batch_size are split into several
    # gpu_batch_size-sized chunks whose gradients are accumulated.
    gpu_batch_size = exp_args['gpu_batch_size']
    trainer = Trainer(
        model,
        dataset,
        loss_fn=loss_fn,
        accuracy_fn=accuracy_fn,
        steps_per_epoch=exp_args['steps_per_iter'],
        test_steps_per_epoch=exp_args['test_steps_per_iter'],
        learning_rate=kwargs['learning_rate'],
        batch_size=gpu_batch_size if batch_size > gpu_batch_size else batch_size,
        eval_batch_size=batch_size,
        grad_accumulate=batch_size // gpu_batch_size if batch_size > gpu_batch_size else 1,
    )

    # --- Set up logging ---

    log_to_wandb = exp_args['log_to_wandb']
    save_models = exp_args['save_models']
    wandb_project = exp_args['wandb_project']

    # Random six-digit suffix to tell runs of the same experiment apart.
    short_name = str(random.randint(int(1e5), int(1e6) - 1))
    run_name = f'{exp_name}-{task}-{short_name}'

    if log_to_wandb:
        config = dict(
            short_name=short_name,
            run_name=run_name,
            **exp_args,
            **kwargs,
        )
        wandb.init(
            name=f'{exp_name}-{short_name}',
            group=f'{exp_name}-{task}',
            project=wandb_project,
            config=config,
        )
        wandb.watch(model)

    for t in range(exp_args['num_iters']):
        trainer.train_epoch()

        print('=' * 57)
        print(f'| Iteration {" " * 15} | {t+1:25} |')
        for k, v in trainer.diagnostics.items():
            print(f'| {k:25} | {v:25} |')

        if log_to_wandb:
            wandb.log(trainer.diagnostics)

        if save_models and ((t+1) % exp_args['save_models_every'] == 0 or
                            (t+1) == exp_args['num_iters']):
            import os

            # The checkpoint directory may not exist yet on a fresh checkout.
            os.makedirs('models', exist_ok=True)
            with open(f'models/{run_name}.pt', 'wb') as f:
                state_dict = dict(model=model.state_dict(), optim=trainer.optim.state_dict())
                torch.save(state_dict, f)
            print(f'Saved model at {t+1} iters: {run_name}')


In [12]:
def _str2bool(value):
    """Parse a command-line boolean such as ``true``/``false``/``1``/``0``."""
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')


def run_experiment(
        exp_name,
        experiment_params,
):
    """Parse run-level options from ``sys.argv`` and launch ``experiment``.

    Args:
        exp_name: Base experiment name; prefixed with the current month-day
            when ``--include_date`` is true.
        experiment_params: Dict of task/model hyperparameters forwarded to
            ``experiment`` as keyword arguments. It is not modified.

    Unrecognised command-line arguments are ignored, so arguments meant for
    the hosting process do not cause a parse error.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('--num_iters', '-it', type=int, default=110,
                        help='Number of iterations for trainer')
    parser.add_argument('--steps_per_iter', type=int, default=100,
                        help='Number of gradient steps per iteration')
    parser.add_argument('--test_steps_per_iter', type=int, default=25,
                        help='Number of test gradient steps per iteration')

    # NOTE: argparse's ``type=bool`` turns every non-empty string (including
    # "False") into True, so boolean flags use an explicit parser instead.
    parser.add_argument('--log_to_wandb', '-w', type=_str2bool, default=False,
                        help='Whether or not to log to Weights and Biases')
    parser.add_argument('--note', '-n', type=str, default='',
                        help='An optional note to be logged to W&B')
    parser.add_argument('--wandb_project', type=str, default='my_project',
                        help='Project name for W&B')
    parser.add_argument('--include_date', type=_str2bool, default=True,
                        help='Whether to include date in run name')

    parser.add_argument('--save_models', '-s', type=_str2bool, default=False,
                        help='Whether or not to save the model files locally')
    parser.add_argument('--save_models_every', '-int', type=int, default=25,
                        help='How often to save models locally')

    parser.add_argument('--device', '-d', type=str, default='cuda',
                        help='Which device for Pytorch to use')
    parser.add_argument('--gpu_batch_size', '-gbs', type=int, default=16,
                        help='Max batch size to put on GPU (used for gradient accumulation)')

    exp_args, _unknown = parser.parse_known_args(sys.argv[1:])

    if exp_args.include_date:
        timestamp = datetime.now().strftime('%m-%d')
        exp_name = f'{timestamp}-{exp_name}'

    # Build a fresh dict so the caller's parameters are left untouched, and
    # pass the name only once (the former stray ``xp_name`` keyword merely
    # ended up inside ``experiment``'s **kwargs).
    call_params = {
        **experiment_params,
        'exp_name': exp_name,
        'exp_args': vars(exp_args),
    }

    experiment(**call_params)

# Run.

In [13]:
# Hyperparameters for the grayscale CIFAR-10 run.
experiment_params0 = {
    # Task selection
    'task': 'cifar10-gray',
    'n': 1000,               # only used by the bit tasks
    'num_patterns': 5,       # only used by the bit tasks
    'patch_size': 32,

    # Backbone
    'model_name': 'gpt2',
    'pretrained': True,

    # Freezing: when freeze_trans is False only the in/out flags are checked
    'freeze_trans': True,
    'freeze_in': False,
    'freeze_pos': False,
    'freeze_ln': False,
    'freeze_attn': True,
    'freeze_ff': True,
    'freeze_out': False,

    # Optional MLP layer sizes (not in the paper),
    # e.g. [32, 32] creates a 2-layer MLP with dimension 32
    'in_layer_sizes': None,
    'out_layer_sizes': None,

    # Optimisation
    'learning_rate': 1e-3,
    'batch_size': 2,
    'dropout': 0.1,
    'orth_gain': 1.41,
}

In [14]:
# Base name of the experiment, used as the run-name prefix.
experiment_name0 = 'fpt'


In [15]:
!pip install transformers



In [16]:
# Launch training with the experiment name and hyperparameters defined above.
run_experiment(experiment_name0, experiment_params0)

Files already downloaded and verified
Files already downloaded and verified


Some weights of GPT2Model were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 100/100 [00:02<00:00, 35.04it/s]
  4%|▍         | 4/100 [00:00<00:02, 32.06it/s]

| Iteration                 |                         1 |
| Gradient Steps            |                       100 |
| Average Train Loss        |         2.645882935523987 |
| Start Train Loss          |         4.812505722045898 |
| Final Train Loss          |        1.8105145692825317 |
| Test Loss                 |        2.4886891603469845 |
| Test Accuracy             |       0.12000000000000001 |
| Train Accuracy            |                     0.135 |
| Time Training             |         2.857307195663452 |
| Time Testing              |        0.2382643222808838 |


100%|██████████| 100/100 [00:02<00:00, 35.54it/s]
  4%|▍         | 4/100 [00:00<00:02, 35.33it/s]

| Iteration                 |                         2 |
| Gradient Steps            |                       200 |
| Average Train Loss        |        2.3739624547958376 |
| Start Train Loss          |         2.503535270690918 |
| Final Train Loss          |        3.0753324031829834 |
| Test Loss                 |        2.6741058707237246 |
| Test Accuracy             |       0.23999999999999996 |
| Train Accuracy            |                     0.125 |
| Time Training             |           2.8166663646698 |
| Time Testing              |       0.24466729164123535 |


100%|██████████| 100/100 [00:02<00:00, 36.44it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.63it/s]

| Iteration                 |                         3 |
| Gradient Steps            |                       300 |
| Average Train Loss        |         2.338146630525589 |
| Start Train Loss          |        1.9995319843292236 |
| Final Train Loss          |         2.454047679901123 |
| Test Loss                 |        2.5621062254905698 |
| Test Accuracy             |       0.12000000000000001 |
| Train Accuracy            |                      0.17 |
| Time Training             |        2.7557642459869385 |
| Time Testing              |       0.23114633560180664 |


100%|██████████| 100/100 [00:02<00:00, 36.18it/s]
  4%|▍         | 4/100 [00:00<00:02, 37.25it/s]

| Iteration                 |                         4 |
| Gradient Steps            |                       400 |
| Average Train Loss        |        2.3096116292476654 |
| Start Train Loss          |        1.8446614742279053 |
| Final Train Loss          |        3.4982450008392334 |
| Test Loss                 |         2.802972335815429 |
| Test Accuracy             |                      0.14 |
| Train Accuracy            |                     0.175 |
| Time Training             |        2.7713119983673096 |
| Time Testing              |       0.23065876960754395 |


100%|██████████| 100/100 [00:02<00:00, 36.68it/s]
  4%|▍         | 4/100 [00:00<00:02, 35.85it/s]

| Iteration                 |                         5 |
| Gradient Steps            |                       500 |
| Average Train Loss        |        2.2971023499965666 |
| Start Train Loss          |         2.829151153564453 |
| Final Train Loss          |         2.495321035385132 |
| Test Loss                 |        2.3111687374114993 |
| Test Accuracy             |                      0.16 |
| Train Accuracy            |                     0.175 |
| Time Training             |        2.7316884994506836 |
| Time Testing              |       0.24458646774291992 |


100%|██████████| 100/100 [00:02<00:00, 36.35it/s]
  4%|▍         | 4/100 [00:00<00:02, 38.00it/s]

| Iteration                 |                         6 |
| Gradient Steps            |                       600 |
| Average Train Loss        |        2.2687993413209915 |
| Start Train Loss          |        2.0229687690734863 |
| Final Train Loss          |        1.6498713493347168 |
| Test Loss                 |        2.5513503527641292 |
| Test Accuracy             |       0.21999999999999997 |
| Train Accuracy            |                     0.185 |
| Time Training             |         2.756284475326538 |
| Time Testing              |       0.23298215866088867 |


100%|██████████| 100/100 [00:02<00:00, 36.29it/s]
  4%|▍         | 4/100 [00:00<00:02, 35.77it/s]

| Iteration                 |                         7 |
| Gradient Steps            |                       700 |
| Average Train Loss        |        2.3011453664302826 |
| Start Train Loss          |        1.8925561904907227 |
| Final Train Loss          |         2.114800214767456 |
| Test Loss                 |        2.2290212535858154 |
| Test Accuracy             |       0.21999999999999997 |
| Train Accuracy            |                      0.16 |
| Time Training             |        2.7602014541625977 |
| Time Testing              |        0.2405543327331543 |


100%|██████████| 100/100 [00:02<00:00, 36.17it/s]
  4%|▍         | 4/100 [00:00<00:02, 37.95it/s]

| Iteration                 |                         8 |
| Gradient Steps            |                       800 |
| Average Train Loss        |          2.25887863278389 |
| Start Train Loss          |        2.9236743450164795 |
| Final Train Loss          |        1.9945186376571655 |
| Test Loss                 |        2.0612112307548522 |
| Test Accuracy             |       0.27999999999999997 |
| Train Accuracy            |                      0.18 |
| Time Training             |         2.770341157913208 |
| Time Testing              |        0.2369225025177002 |


100%|██████████| 100/100 [00:02<00:00, 36.11it/s]
  4%|▍         | 4/100 [00:00<00:02, 35.90it/s]

| Iteration                 |                         9 |
| Gradient Steps            |                       900 |
| Average Train Loss        |        2.3117300987243654 |
| Start Train Loss          |         2.229954242706299 |
| Final Train Loss          |        2.4421093463897705 |
| Test Loss                 |        2.3568111467361446 |
| Test Accuracy             |       0.18000000000000002 |
| Train Accuracy            |                     0.145 |
| Time Training             |         2.774366855621338 |
| Time Testing              |        0.2370316982269287 |


100%|██████████| 100/100 [00:02<00:00, 36.04it/s]
  4%|▍         | 4/100 [00:00<00:02, 35.61it/s]

| Iteration                 |                        10 |
| Gradient Steps            |                      1000 |
| Average Train Loss        |         2.322510107755661 |
| Start Train Loss          |        2.4362998008728027 |
| Final Train Loss          |        1.9046717882156372 |
| Test Loss                 |         2.306186513900757 |
| Test Accuracy             |                      0.08 |
| Train Accuracy            |                     0.155 |
| Time Training             |        2.7802648544311523 |
| Time Testing              |       0.23766875267028809 |


100%|██████████| 100/100 [00:02<00:00, 35.80it/s]
  4%|▍         | 4/100 [00:00<00:02, 35.61it/s]

| Iteration                 |                        11 |
| Gradient Steps            |                      1100 |
| Average Train Loss        |        2.2059460520744323 |
| Start Train Loss          |        1.6635091304779053 |
| Final Train Loss          |        1.3710278272628784 |
| Test Loss                 |        1.9796078991889952 |
| Test Accuracy             |       0.25999999999999995 |
| Train Accuracy            |                     0.175 |
| Time Training             |        2.7992136478424072 |
| Time Testing              |       0.23285913467407227 |


100%|██████████| 100/100 [00:02<00:00, 35.83it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.22it/s]

| Iteration                 |                        12 |
| Gradient Steps            |                      1200 |
| Average Train Loss        |         2.260838078260422 |
| Start Train Loss          |        2.4690041542053223 |
| Final Train Loss          |          2.48429012298584 |
| Test Loss                 |        2.1162473535537716 |
| Test Accuracy             |       0.30000000000000004 |
| Train Accuracy            |                      0.17 |
| Time Training             |        2.7977869510650635 |
| Time Testing              |         0.232924222946167 |


100%|██████████| 100/100 [00:02<00:00, 35.95it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.08it/s]

| Iteration                 |                        13 |
| Gradient Steps            |                      1300 |
| Average Train Loss        |        2.2910974740982057 |
| Start Train Loss          |        1.8362667560577393 |
| Final Train Loss          |        2.1535606384277344 |
| Test Loss                 |        2.2779451012611385 |
| Test Accuracy             |                      0.22 |
| Train Accuracy            |                      0.14 |
| Time Training             |        2.7915303707122803 |
| Time Testing              |        0.2413187026977539 |


100%|██████████| 100/100 [00:02<00:00, 35.74it/s]
  4%|▍         | 4/100 [00:00<00:02, 33.18it/s]

| Iteration                 |                        14 |
| Gradient Steps            |                      1400 |
| Average Train Loss        |         2.167046141028404 |
| Start Train Loss          |         2.194967746734619 |
| Final Train Loss          |        1.8205729722976685 |
| Test Loss                 |         2.534727132320404 |
| Test Accuracy             |       0.25999999999999995 |
| Train Accuracy            |                     0.195 |
| Time Training             |         2.802382469177246 |
| Time Testing              |       0.23257946968078613 |


100%|██████████| 100/100 [00:02<00:00, 35.24it/s]
  4%|▍         | 4/100 [00:00<00:02, 35.41it/s]

| Iteration                 |                        15 |
| Gradient Steps            |                      1500 |
| Average Train Loss        |         2.271037598848343 |
| Start Train Loss          |         2.747692584991455 |
| Final Train Loss          |        2.2114856243133545 |
| Test Loss                 |         2.135786590576172 |
| Test Accuracy             |                       0.3 |
| Train Accuracy            |                     0.145 |
| Time Training             |         2.843503952026367 |
| Time Testing              |       0.24110937118530273 |


100%|██████████| 100/100 [00:02<00:00, 35.96it/s]
  4%|▍         | 4/100 [00:00<00:02, 34.92it/s]

| Iteration                 |                        16 |
| Gradient Steps            |                      1600 |
| Average Train Loss        |        2.2741131806373596 |
| Start Train Loss          |        2.8870058059692383 |
| Final Train Loss          |        2.4176506996154785 |
| Test Loss                 |         2.290003182888031 |
| Test Accuracy             |       0.21999999999999997 |
| Train Accuracy            |                      0.17 |
| Time Training             |         2.786557197570801 |
| Time Testing              |       0.23874497413635254 |


100%|██████████| 100/100 [00:02<00:00, 35.69it/s]
  4%|▍         | 4/100 [00:00<00:02, 35.95it/s]

| Iteration                 |                        17 |
| Gradient Steps            |                      1700 |
| Average Train Loss        |         2.269492814540863 |
| Start Train Loss          |        1.9768863916397095 |
| Final Train Loss          |        2.3906025886535645 |
| Test Loss                 |        2.2670818901062004 |
| Test Accuracy             |       0.18000000000000002 |
| Train Accuracy            |                      0.19 |
| Time Training             |        2.8074758052825928 |
| Time Testing              |        0.2550320625305176 |


100%|██████████| 100/100 [00:02<00:00, 36.14it/s]
  4%|▍         | 4/100 [00:00<00:02, 35.17it/s]

| Iteration                 |                        18 |
| Gradient Steps            |                      1800 |
| Average Train Loss        |         2.203081418275833 |
| Start Train Loss          |        2.0371031761169434 |
| Final Train Loss          |         1.788914442062378 |
| Test Loss                 |         2.072101205587387 |
| Test Accuracy             |                       0.3 |
| Train Accuracy            |                     0.165 |
| Time Training             |        2.7740838527679443 |
| Time Testing              |       0.23258662223815918 |


100%|██████████| 100/100 [00:02<00:00, 36.19it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.18it/s]

| Iteration                 |                        19 |
| Gradient Steps            |                      1900 |
| Average Train Loss        |        2.1936324191093446 |
| Start Train Loss          |         2.096357822418213 |
| Final Train Loss          |        1.9741966724395752 |
| Test Loss                 |        2.4030790519714347 |
| Test Accuracy             |       0.21999999999999997 |
| Train Accuracy            |                      0.23 |
| Time Training             |         2.768664836883545 |
| Time Testing              |       0.24535918235778809 |


100%|██████████| 100/100 [00:02<00:00, 36.10it/s]
  3%|▎         | 3/100 [00:00<00:03, 27.33it/s]

| Iteration                 |                        20 |
| Gradient Steps            |                      2000 |
| Average Train Loss        |         2.238876931667328 |
| Start Train Loss          |           1.7469482421875 |
| Final Train Loss          |        2.3893625736236572 |
| Test Loss                 |        2.2344256877899173 |
| Test Accuracy             |                      0.24 |
| Train Accuracy            |                     0.175 |
| Time Training             |         2.775583505630493 |
| Time Testing              |        0.2543027400970459 |


100%|██████████| 100/100 [00:02<00:00, 35.58it/s]
  4%|▍         | 4/100 [00:00<00:02, 35.33it/s]

| Iteration                 |                        21 |
| Gradient Steps            |                      2100 |
| Average Train Loss        |         2.254288744926453 |
| Start Train Loss          |        2.2939319610595703 |
| Final Train Loss          |         2.005244255065918 |
| Test Loss                 |        2.6563669729232786 |
| Test Accuracy             |                      0.16 |
| Train Accuracy            |                     0.175 |
| Time Training             |          2.81596040725708 |
| Time Testing              |        0.2412879467010498 |


100%|██████████| 100/100 [00:02<00:00, 36.39it/s]
  4%|▍         | 4/100 [00:00<00:02, 35.75it/s]

| Iteration                 |                        22 |
| Gradient Steps            |                      2200 |
| Average Train Loss        |        2.1691240429878236 |
| Start Train Loss          |        1.9315630197525024 |
| Final Train Loss          |        2.1603994369506836 |
| Test Loss                 |        2.7898154115676874 |
| Test Accuracy             |                      0.16 |
| Train Accuracy            |                     0.215 |
| Time Training             |         2.755169630050659 |
| Time Testing              |        0.2438664436340332 |


100%|██████████| 100/100 [00:02<00:00, 35.93it/s]
  4%|▍         | 4/100 [00:00<00:02, 35.35it/s]

| Iteration                 |                        23 |
| Gradient Steps            |                      2300 |
| Average Train Loss        |          2.22134064078331 |
| Start Train Loss          |         2.373073101043701 |
| Final Train Loss          |        1.6073038578033447 |
| Test Loss                 |        2.3773392438888554 |
| Test Accuracy             |                      0.08 |
| Train Accuracy            |                     0.205 |
| Time Training             |        2.7882516384124756 |
| Time Testing              |       0.24565625190734863 |


100%|██████████| 100/100 [00:02<00:00, 36.14it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.47it/s]

| Iteration                 |                        24 |
| Gradient Steps            |                      2400 |
| Average Train Loss        |        2.2583391219377518 |
| Start Train Loss          |        2.3704662322998047 |
| Final Train Loss          |         1.991047739982605 |
| Test Loss                 |        2.6689360666275017 |
| Test Accuracy             |                      0.16 |
| Train Accuracy            |                     0.165 |
| Time Training             |        2.7744967937469482 |
| Time Testing              |       0.23984527587890625 |


100%|██████████| 100/100 [00:02<00:00, 36.13it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.92it/s]

| Iteration                 |                        25 |
| Gradient Steps            |                      2500 |
| Average Train Loss        |         2.162347584962845 |
| Start Train Loss          |        2.1604442596435547 |
| Final Train Loss          |         2.453040599822998 |
| Test Loss                 |        2.2305891418457033 |
| Test Accuracy             |       0.21999999999999997 |
| Train Accuracy            |                      0.22 |
| Time Training             |        2.7731499671936035 |
| Time Testing              |          0.24798583984375 |


100%|██████████| 100/100 [00:02<00:00, 36.06it/s]
  4%|▍         | 4/100 [00:00<00:02, 34.72it/s]

| Iteration                 |                        26 |
| Gradient Steps            |                      2600 |
| Average Train Loss        |        2.1996207571029665 |
| Start Train Loss          |          1.95452082157135 |
| Final Train Loss          |        2.6939921379089355 |
| Test Loss                 |         2.167660064697266 |
| Test Accuracy             |       0.25999999999999995 |
| Train Accuracy            |                      0.16 |
| Time Training             |         2.781365394592285 |
| Time Testing              |        0.2413468360900879 |


100%|██████████| 100/100 [00:02<00:00, 35.62it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.92it/s]

| Iteration                 |                        27 |
| Gradient Steps            |                      2700 |
| Average Train Loss        |        2.2682075238227846 |
| Start Train Loss          |        1.6735289096832275 |
| Final Train Loss          |         2.498828887939453 |
| Test Loss                 |        2.5203600597381595 |
| Test Accuracy             |                      0.18 |
| Train Accuracy            |                     0.175 |
| Time Training             |         2.817067861557007 |
| Time Testing              |       0.24135661125183105 |


100%|██████████| 100/100 [00:02<00:00, 35.69it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.49it/s]

| Iteration                 |                        28 |
| Gradient Steps            |                      2800 |
| Average Train Loss        |         2.182557212114334 |
| Start Train Loss          |         2.362682819366455 |
| Final Train Loss          |        1.3438562154769897 |
| Test Loss                 |        2.8584206223487856 |
| Test Accuracy             |       0.19999999999999998 |
| Train Accuracy            |                      0.26 |
| Time Training             |        2.8079304695129395 |
| Time Testing              |       0.23690509796142578 |


100%|██████████| 100/100 [00:02<00:00, 36.43it/s]
  4%|▍         | 4/100 [00:00<00:02, 37.72it/s]

| Iteration                 |                        29 |
| Gradient Steps            |                      2900 |
| Average Train Loss        |         2.189615523815155 |
| Start Train Loss          |        2.0972609519958496 |
| Final Train Loss          |         3.629573345184326 |
| Test Loss                 |         2.388915238380432 |
| Test Accuracy             |                      0.14 |
| Train Accuracy            |                     0.185 |
| Time Training             |        2.7493202686309814 |
| Time Testing              |        0.2396247386932373 |


100%|██████████| 100/100 [00:02<00:00, 36.93it/s]
  4%|▍         | 4/100 [00:00<00:02, 33.15it/s]

| Iteration                 |                        30 |
| Gradient Steps            |                      3000 |
| Average Train Loss        |        2.2359867668151856 |
| Start Train Loss          |        1.6785385608673096 |
| Final Train Loss          |         2.611752510070801 |
| Test Loss                 |        2.1792561483383177 |
| Test Accuracy             |       0.19999999999999998 |
| Train Accuracy            |                     0.205 |
| Time Training             |        2.7152934074401855 |
| Time Testing              |       0.24918580055236816 |


100%|██████████| 100/100 [00:02<00:00, 36.72it/s]
  4%|▍         | 4/100 [00:00<00:02, 37.37it/s]

| Iteration                 |                        31 |
| Gradient Steps            |                      3100 |
| Average Train Loss        |        2.1855787670612337 |
| Start Train Loss          |        1.8197771310806274 |
| Final Train Loss          |        2.6539554595947266 |
| Test Loss                 |         2.130982375144958 |
| Test Accuracy             |                       0.3 |
| Train Accuracy            |                     0.215 |
| Time Training             |         2.731339454650879 |
| Time Testing              |        0.2306833267211914 |


100%|██████████| 100/100 [00:02<00:00, 36.80it/s]
  4%|▍         | 4/100 [00:00<00:02, 35.35it/s]

| Iteration                 |                        32 |
| Gradient Steps            |                      3200 |
| Average Train Loss        |         2.283121178150177 |
| Start Train Loss          |         2.188366651535034 |
| Final Train Loss          |         2.156938314437866 |
| Test Loss                 |         2.479635510444641 |
| Test Accuracy             |                      0.18 |
| Train Accuracy            |                     0.165 |
| Time Training             |        2.7252111434936523 |
| Time Testing              |       0.23459696769714355 |


100%|██████████| 100/100 [00:02<00:00, 36.09it/s]
  4%|▍         | 4/100 [00:00<00:02, 37.04it/s]

| Iteration                 |                        33 |
| Gradient Steps            |                      3300 |
| Average Train Loss        |        2.1901206851005552 |
| Start Train Loss          |         2.275463581085205 |
| Final Train Loss          |        2.6207098960876465 |
| Test Loss                 |         2.636864650249481 |
| Test Accuracy             |                      0.16 |
| Train Accuracy            |                     0.245 |
| Time Training             |         2.776820421218872 |
| Time Testing              |       0.22707819938659668 |


100%|██████████| 100/100 [00:02<00:00, 36.16it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.88it/s]

| Iteration                 |                        34 |
| Gradient Steps            |                      3400 |
| Average Train Loss        |         2.212764856815338 |
| Start Train Loss          |         2.080061674118042 |
| Final Train Loss          |         1.546886682510376 |
| Test Loss                 |          2.10429826259613 |
| Test Accuracy             |                       0.3 |
| Train Accuracy            |                     0.225 |
| Time Training             |         2.769848585128784 |
| Time Testing              |       0.23479533195495605 |


100%|██████████| 100/100 [00:02<00:00, 36.45it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.08it/s]

| Iteration                 |                        35 |
| Gradient Steps            |                      3500 |
| Average Train Loss        |        2.2183572125434874 |
| Start Train Loss          |         2.813340663909912 |
| Final Train Loss          |        2.2576751708984375 |
| Test Loss                 |         2.430772096514703 |
| Test Accuracy             |                       0.2 |
| Train Accuracy            |                      0.19 |
| Time Training             |         2.749971389770508 |
| Time Testing              |        0.2542991638183594 |


100%|██████████| 100/100 [00:02<00:00, 36.45it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.79it/s]

| Iteration                 |                        36 |
| Gradient Steps            |                      3600 |
| Average Train Loss        |        2.1275227802991865 |
| Start Train Loss          |        1.6581158638000488 |
| Final Train Loss          |        1.2849290370941162 |
| Test Loss                 |        2.4462856721878055 |
| Test Accuracy             |       0.27999999999999997 |
| Train Accuracy            |                     0.275 |
| Time Training             |        2.7527995109558105 |
| Time Testing              |       0.24526500701904297 |


100%|██████████| 100/100 [00:02<00:00, 36.20it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.24it/s]

| Iteration                 |                        37 |
| Gradient Steps            |                      3700 |
| Average Train Loss        |        2.2508746671676634 |
| Start Train Loss          |         2.065647602081299 |
| Final Train Loss          |          2.98764967918396 |
| Test Loss                 |        2.2634175217151644 |
| Test Accuracy             |                      0.28 |
| Train Accuracy            |                      0.23 |
| Time Training             |         2.768244981765747 |
| Time Testing              |        0.2619915008544922 |


100%|██████████| 100/100 [00:02<00:00, 36.14it/s]
  4%|▍         | 4/100 [00:00<00:02, 32.08it/s]

| Iteration                 |                        38 |
| Gradient Steps            |                      3800 |
| Average Train Loss        |        2.2006780207157135 |
| Start Train Loss          |         2.715872287750244 |
| Final Train Loss          |        2.2717909812927246 |
| Test Loss                 |         2.109160668849945 |
| Test Accuracy             |       0.27999999999999997 |
| Train Accuracy            |                      0.18 |
| Time Training             |         2.771510124206543 |
| Time Testing              |       0.24451518058776855 |


100%|██████████| 100/100 [00:02<00:00, 35.43it/s]
  4%|▍         | 4/100 [00:00<00:02, 37.50it/s]

| Iteration                 |                        39 |
| Gradient Steps            |                      3900 |
| Average Train Loss        |         2.196721712350845 |
| Start Train Loss          |        2.1999144554138184 |
| Final Train Loss          |         2.406341552734375 |
| Test Loss                 |        1.9394382858276367 |
| Test Accuracy             |       0.30000000000000004 |
| Train Accuracy            |                     0.165 |
| Time Training             |         2.827911853790283 |
| Time Testing              |        0.2398977279663086 |


100%|██████████| 100/100 [00:02<00:00, 35.91it/s]
  4%|▍         | 4/100 [00:00<00:02, 32.17it/s]

| Iteration                 |                        40 |
| Gradient Steps            |                      4000 |
| Average Train Loss        |        2.1304596209526063 |
| Start Train Loss          |        2.3954622745513916 |
| Final Train Loss          |         2.557487964630127 |
| Test Loss                 |        2.1700922846794124 |
| Test Accuracy             |       0.38000000000000006 |
| Train Accuracy            |                      0.22 |
| Time Training             |          2.79433012008667 |
| Time Testing              |       0.25708889961242676 |


100%|██████████| 100/100 [00:02<00:00, 35.82it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.37it/s]

| Iteration                 |                        41 |
| Gradient Steps            |                      4100 |
| Average Train Loss        |        2.2082726550102234 |
| Start Train Loss          |         2.215074300765991 |
| Final Train Loss          |         2.541057825088501 |
| Test Loss                 |        2.3750239801406856 |
| Test Accuracy             |       0.23999999999999996 |
| Train Accuracy            |                      0.22 |
| Time Training             |        2.7977333068847656 |
| Time Testing              |       0.24617815017700195 |


100%|██████████| 100/100 [00:02<00:00, 35.83it/s]
  4%|▍         | 4/100 [00:00<00:02, 35.32it/s]

| Iteration                 |                        42 |
| Gradient Steps            |                      4200 |
| Average Train Loss        |        2.2190084075927734 |
| Start Train Loss          |        2.7863643169403076 |
| Final Train Loss          |        2.2624905109405518 |
| Test Loss                 |        2.4296737694740296 |
| Test Accuracy             |       0.12000000000000001 |
| Train Accuracy            |                      0.21 |
| Time Training             |        2.8045496940612793 |
| Time Testing              |       0.23267412185668945 |


100%|██████████| 100/100 [00:02<00:00, 35.29it/s]
  4%|▍         | 4/100 [00:00<00:02, 37.16it/s]

| Iteration                 |                        43 |
| Gradient Steps            |                      4300 |
| Average Train Loss        |        2.1971159780025484 |
| Start Train Loss          |        2.4548535346984863 |
| Final Train Loss          |        2.1345629692077637 |
| Test Loss                 |        2.7646272897720334 |
| Test Accuracy             |       0.19999999999999998 |
| Train Accuracy            |                     0.195 |
| Time Training             |        2.8387186527252197 |
| Time Testing              |       0.23954033851623535 |


100%|██████████| 100/100 [00:02<00:00, 35.43it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.51it/s]

| Iteration                 |                        44 |
| Gradient Steps            |                      4400 |
| Average Train Loss        |        2.2234770894050597 |
| Start Train Loss          |        2.0994908809661865 |
| Final Train Loss          |        1.9920432567596436 |
| Test Loss                 |         2.737583446502686 |
| Test Accuracy             |       0.21999999999999997 |
| Train Accuracy            |                      0.16 |
| Time Training             |         2.829803228378296 |
| Time Testing              |       0.24743890762329102 |


100%|██████████| 100/100 [00:02<00:00, 35.58it/s]
  4%|▍         | 4/100 [00:00<00:02, 35.87it/s]

| Iteration                 |                        45 |
| Gradient Steps            |                      4500 |
| Average Train Loss        |         2.162864862680435 |
| Start Train Loss          |        2.3950765132904053 |
| Final Train Loss          |        1.3917291164398193 |
| Test Loss                 |         2.189793524742126 |
| Test Accuracy             |       0.19999999999999998 |
| Train Accuracy            |                     0.195 |
| Time Training             |        2.8146088123321533 |
| Time Testing              |       0.24054503440856934 |


100%|██████████| 100/100 [00:02<00:00, 36.12it/s]
  4%|▍         | 4/100 [00:00<00:02, 34.42it/s]

| Iteration                 |                        46 |
| Gradient Steps            |                      4600 |
| Average Train Loss        |        2.1323746448755263 |
| Start Train Loss          |         1.457202434539795 |
| Final Train Loss          |        1.8557775020599365 |
| Test Loss                 |         2.536188831329346 |
| Test Accuracy             |       0.25999999999999995 |
| Train Accuracy            |                      0.26 |
| Time Training             |         2.778595209121704 |
| Time Testing              |       0.24315810203552246 |


100%|██████████| 100/100 [00:02<00:00, 35.66it/s]
  4%|▍         | 4/100 [00:00<00:02, 34.88it/s]

| Iteration                 |                        47 |
| Gradient Steps            |                      4700 |
| Average Train Loss        |         2.219093246459961 |
| Start Train Loss          |        1.9058396816253662 |
| Final Train Loss          |        2.5742220878601074 |
| Test Loss                 |        2.3501688575744626 |
| Test Accuracy             |                      0.22 |
| Train Accuracy            |                     0.185 |
| Time Training             |         2.813446044921875 |
| Time Testing              |       0.23690128326416016 |


100%|██████████| 100/100 [00:02<00:00, 35.66it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.74it/s]

| Iteration                 |                        48 |
| Gradient Steps            |                      4800 |
| Average Train Loss        |         2.211244661808014 |
| Start Train Loss          |        1.4602179527282715 |
| Final Train Loss          |         2.397953510284424 |
| Test Loss                 |         2.418232927322388 |
| Test Accuracy             |       0.23999999999999996 |
| Train Accuracy            |                     0.185 |
| Time Training             |         2.816075325012207 |
| Time Testing              |        0.2305905818939209 |


100%|██████████| 100/100 [00:02<00:00, 35.93it/s]
  4%|▍         | 4/100 [00:00<00:02, 33.06it/s]

| Iteration                 |                        49 |
| Gradient Steps            |                      4900 |
| Average Train Loss        |        2.1376241379976273 |
| Start Train Loss          |         2.065903902053833 |
| Final Train Loss          |        2.8789665699005127 |
| Test Loss                 |          2.84266592502594 |
| Test Accuracy             |       0.21999999999999997 |
| Train Accuracy            |                     0.195 |
| Time Training             |         2.793421745300293 |
| Time Testing              |       0.23368310928344727 |


100%|██████████| 100/100 [00:02<00:00, 35.64it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.25it/s]

| Iteration                 |                        50 |
| Gradient Steps            |                      5000 |
| Average Train Loss        |         2.199597226381302 |
| Start Train Loss          |        1.2022924423217773 |
| Final Train Loss          |        2.9338245391845703 |
| Test Loss                 |          2.28547212600708 |
| Test Accuracy             |                      0.38 |
| Train Accuracy            |                     0.215 |
| Time Training             |        2.8109657764434814 |
| Time Testing              |       0.23531389236450195 |


100%|██████████| 100/100 [00:02<00:00, 35.73it/s]
  4%|▍         | 4/100 [00:00<00:02, 34.99it/s]

| Iteration                 |                        51 |
| Gradient Steps            |                      5100 |
| Average Train Loss        |        2.1909935426712037 |
| Start Train Loss          |        2.6835741996765137 |
| Final Train Loss          |         1.371513843536377 |
| Test Loss                 |         2.548904476165772 |
| Test Accuracy             |                      0.28 |
| Train Accuracy            |                     0.205 |
| Time Training             |        2.8031599521636963 |
| Time Testing              |       0.23929047584533691 |


100%|██████████| 100/100 [00:02<00:00, 35.57it/s]
  4%|▍         | 4/100 [00:00<00:02, 35.58it/s]

| Iteration                 |                        52 |
| Gradient Steps            |                      5200 |
| Average Train Loss        |        2.1761698710918425 |
| Start Train Loss          |         2.051152467727661 |
| Final Train Loss          |        1.9337663650512695 |
| Test Loss                 |        2.3566620540618897 |
| Test Accuracy             |       0.21999999999999997 |
| Train Accuracy            |                      0.26 |
| Time Training             |        2.8208954334259033 |
| Time Testing              |       0.23969650268554688 |


100%|██████████| 100/100 [00:02<00:00, 35.78it/s]
  4%|▍         | 4/100 [00:00<00:02, 35.10it/s]

| Iteration                 |                        53 |
| Gradient Steps            |                      5300 |
| Average Train Loss        |         2.262581794261932 |
| Start Train Loss          |         2.682070255279541 |
| Final Train Loss          |        2.4672107696533203 |
| Test Loss                 |         2.212363679409027 |
| Test Accuracy             |                       0.3 |
| Train Accuracy            |                      0.17 |
| Time Training             |         2.799553871154785 |
| Time Testing              |       0.23827266693115234 |


100%|██████████| 100/100 [00:02<00:00, 35.69it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.04it/s]

| Iteration                 |                        54 |
| Gradient Steps            |                      5400 |
| Average Train Loss        |        2.1407048082351685 |
| Start Train Loss          |        1.9819800853729248 |
| Final Train Loss          |         2.137488842010498 |
| Test Loss                 |        2.6083231544494634 |
| Test Accuracy             |       0.21999999999999997 |
| Train Accuracy            |                      0.19 |
| Time Training             |        2.8111119270324707 |
| Time Testing              |       0.22902393341064453 |


100%|██████████| 100/100 [00:02<00:00, 35.58it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.96it/s]

| Iteration                 |                        55 |
| Gradient Steps            |                      5500 |
| Average Train Loss        |        2.1829128748178483 |
| Start Train Loss          |         2.243098497390747 |
| Final Train Loss          |         3.759328842163086 |
| Test Loss                 |         2.432276356220245 |
| Test Accuracy             |                      0.24 |
| Train Accuracy            |                     0.195 |
| Time Training             |        2.8163516521453857 |
| Time Testing              |       0.24916934967041016 |


100%|██████████| 100/100 [00:02<00:00, 36.11it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.18it/s]

| Iteration                 |                        56 |
| Gradient Steps            |                      5600 |
| Average Train Loss        |        2.1685845017433167 |
| Start Train Loss          |        1.8733563423156738 |
| Final Train Loss          |        2.5492186546325684 |
| Test Loss                 |         2.683977091312409 |
| Test Accuracy             |       0.19999999999999998 |
| Train Accuracy            |                      0.22 |
| Time Training             |        2.7765724658966064 |
| Time Testing              |       0.23512578010559082 |


100%|██████████| 100/100 [00:02<00:00, 35.94it/s]
  4%|▍         | 4/100 [00:00<00:02, 37.02it/s]

| Iteration                 |                        57 |
| Gradient Steps            |                      5700 |
| Average Train Loss        |         2.195691388845444 |
| Start Train Loss          |         2.308884620666504 |
| Final Train Loss          |        2.4637222290039062 |
| Test Loss                 |         2.247919833660126 |
| Test Accuracy             |        0.3400000000000001 |
| Train Accuracy            |                      0.17 |
| Time Training             |        2.7892093658447266 |
| Time Testing              |       0.23566031455993652 |


100%|██████████| 100/100 [00:02<00:00, 36.45it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.03it/s]

| Iteration                 |                        58 |
| Gradient Steps            |                      5800 |
| Average Train Loss        |        2.0998019474744796 |
| Start Train Loss          |        1.9263026714324951 |
| Final Train Loss          |         2.875389575958252 |
| Test Loss                 |        2.4148949432373046 |
| Test Accuracy             |       0.23999999999999996 |
| Train Accuracy            |                      0.24 |
| Time Training             |        2.7463154792785645 |
| Time Testing              |       0.24546074867248535 |


100%|██████████| 100/100 [00:02<00:00, 35.78it/s]
  4%|▍         | 4/100 [00:00<00:02, 35.04it/s]

| Iteration                 |                        59 |
| Gradient Steps            |                      5900 |
| Average Train Loss        |        2.2241573190689086 |
| Start Train Loss          |         2.404897689819336 |
| Final Train Loss          |        2.2676467895507812 |
| Test Loss                 |         2.389392898082733 |
| Test Accuracy             |                       0.3 |
| Train Accuracy            |                      0.17 |
| Time Training             |        2.8040268421173096 |
| Time Testing              |       0.24378323554992676 |


100%|██████████| 100/100 [00:02<00:00, 35.83it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.47it/s]

| Iteration                 |                        60 |
| Gradient Steps            |                      6000 |
| Average Train Loss        |         2.202934814095497 |
| Start Train Loss          |        2.3014960289001465 |
| Final Train Loss          |        1.8745167255401611 |
| Test Loss                 |        2.6128573155403134 |
| Test Accuracy             |       0.12000000000000001 |
| Train Accuracy            |                     0.185 |
| Time Training             |        2.8005335330963135 |
| Time Testing              |        0.2451467514038086 |


100%|██████████| 100/100 [00:02<00:00, 36.14it/s]
  4%|▍         | 4/100 [00:00<00:02, 34.83it/s]

| Iteration                 |                        61 |
| Gradient Steps            |                      6100 |
| Average Train Loss        |        2.1555035465955736 |
| Start Train Loss          |         2.273037910461426 |
| Final Train Loss          |        1.9564204216003418 |
| Test Loss                 |        2.4963204610347747 |
| Test Accuracy             |       0.21999999999999997 |
| Train Accuracy            |                     0.245 |
| Time Training             |         2.772491455078125 |
| Time Testing              |       0.23744916915893555 |


100%|██████████| 100/100 [00:02<00:00, 35.61it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.31it/s]

| Iteration                 |                        62 |
| Gradient Steps            |                      6200 |
| Average Train Loss        |         2.174736411571503 |
| Start Train Loss          |        2.5737459659576416 |
| Final Train Loss          |         2.607745885848999 |
| Test Loss                 |         2.503615937232971 |
| Test Accuracy             |                      0.24 |
| Train Accuracy            |                       0.2 |
| Time Training             |         2.811516046524048 |
| Time Testing              |        0.2388298511505127 |


100%|██████████| 100/100 [00:02<00:00, 35.70it/s]
  4%|▍         | 4/100 [00:00<00:02, 35.72it/s]

| Iteration                 |                        63 |
| Gradient Steps            |                      6300 |
| Average Train Loss        |        2.1001059198379517 |
| Start Train Loss          |        2.5878396034240723 |
| Final Train Loss          |        1.8543423414230347 |
| Test Loss                 |        2.6734821915626528 |
| Test Accuracy             |       0.25999999999999995 |
| Train Accuracy            |                     0.255 |
| Time Training             |        2.8063933849334717 |
| Time Testing              |       0.24248456954956055 |


100%|██████████| 100/100 [00:02<00:00, 35.53it/s]
  4%|▍         | 4/100 [00:00<00:02, 35.64it/s]

| Iteration                 |                        64 |
| Gradient Steps            |                      6400 |
| Average Train Loss        |        2.1575864601135253 |
| Start Train Loss          |         2.207064628601074 |
| Final Train Loss          |          1.90606689453125 |
| Test Loss                 |         2.325276486873627 |
| Test Accuracy             |        0.3400000000000001 |
| Train Accuracy            |                     0.215 |
| Time Training             |         2.824432611465454 |
| Time Testing              |       0.23199939727783203 |


100%|██████████| 100/100 [00:02<00:00, 36.03it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.13it/s]

| Iteration                 |                        65 |
| Gradient Steps            |                      6500 |
| Average Train Loss        |        2.1051373755931855 |
| Start Train Loss          |        1.6084034442901611 |
| Final Train Loss          |        2.3578052520751953 |
| Test Loss                 |        2.8078825306892394 |
| Test Accuracy             |                      0.16 |
| Train Accuracy            |                      0.25 |
| Time Training             |         2.780060291290283 |
| Time Testing              |       0.25988292694091797 |


100%|██████████| 100/100 [00:02<00:00, 35.93it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.00it/s]

| Iteration                 |                        66 |
| Gradient Steps            |                      6600 |
| Average Train Loss        |         2.119601411819458 |
| Start Train Loss          |        1.9619426727294922 |
| Final Train Loss          |        1.6952662467956543 |
| Test Loss                 |        2.3266695880889894 |
| Test Accuracy             |                      0.34 |
| Train Accuracy            |                      0.25 |
| Time Training             |        2.7937397956848145 |
| Time Testing              |        0.2510530948638916 |


100%|██████████| 100/100 [00:02<00:00, 35.59it/s]
  4%|▍         | 4/100 [00:00<00:02, 38.34it/s]

| Iteration                 |                        67 |
| Gradient Steps            |                      6700 |
| Average Train Loss        |        2.1322435039281844 |
| Start Train Loss          |         2.058985710144043 |
| Final Train Loss          |         1.851833701133728 |
| Test Loss                 |         2.227235851287842 |
| Test Accuracy             |                      0.24 |
| Train Accuracy            |                     0.235 |
| Time Training             |         2.813215494155884 |
| Time Testing              |       0.24437975883483887 |


100%|██████████| 100/100 [00:02<00:00, 35.65it/s]
  4%|▍         | 4/100 [00:00<00:02, 33.76it/s]

| Iteration                 |                        68 |
| Gradient Steps            |                      6800 |
| Average Train Loss        |        2.1882353937625885 |
| Start Train Loss          |         2.114030122756958 |
| Final Train Loss          |        2.7788643836975098 |
| Test Loss                 |        2.4059400796890262 |
| Test Accuracy             |       0.19999999999999998 |
| Train Accuracy            |                      0.22 |
| Time Training             |        2.8158116340637207 |
| Time Testing              |       0.24566364288330078 |


100%|██████████| 100/100 [00:02<00:00, 35.24it/s]
  4%|▍         | 4/100 [00:00<00:02, 35.59it/s]

| Iteration                 |                        69 |
| Gradient Steps            |                      6900 |
| Average Train Loss        |         2.008237315416336 |
| Start Train Loss          |         1.900207281112671 |
| Final Train Loss          |        2.1592791080474854 |
| Test Loss                 |         2.509457905292512 |
| Test Accuracy             |       0.25999999999999995 |
| Train Accuracy            |                      0.26 |
| Time Training             |        2.8410542011260986 |
| Time Testing              |       0.23951292037963867 |


100%|██████████| 100/100 [00:02<00:00, 35.51it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.40it/s]

| Iteration                 |                        70 |
| Gradient Steps            |                      7000 |
| Average Train Loss        |        2.2000901597738265 |
| Start Train Loss          |        1.9913058280944824 |
| Final Train Loss          |        2.1394691467285156 |
| Test Loss                 |         2.398206329345703 |
| Test Accuracy             |                      0.18 |
| Train Accuracy            |                     0.225 |
| Time Training             |        2.8202435970306396 |
| Time Testing              |       0.24513459205627441 |


100%|██████████| 100/100 [00:02<00:00, 35.48it/s]
  4%|▍         | 4/100 [00:00<00:02, 35.00it/s]

| Iteration                 |                        71 |
| Gradient Steps            |                      7100 |
| Average Train Loss        |         2.179785622358322 |
| Start Train Loss          |        1.9052221775054932 |
| Final Train Loss          |        2.5916290283203125 |
| Test Loss                 |         2.373664684295654 |
| Test Accuracy             |                      0.26 |
| Train Accuracy            |                      0.18 |
| Time Training             |         2.825857162475586 |
| Time Testing              |        0.2337334156036377 |


100%|██████████| 100/100 [00:02<00:00, 35.18it/s]
  4%|▍         | 4/100 [00:00<00:02, 34.79it/s]

| Iteration                 |                        72 |
| Gradient Steps            |                      7200 |
| Average Train Loss        |        2.1168078446388243 |
| Start Train Loss          |         1.402785301208496 |
| Final Train Loss          |        2.4826250076293945 |
| Test Loss                 |         2.314706926345825 |
| Test Accuracy             |       0.23999999999999996 |
| Train Accuracy            |                     0.235 |
| Time Training             |         2.846168279647827 |
| Time Testing              |       0.23997807502746582 |


100%|██████████| 100/100 [00:02<00:00, 35.40it/s]
  4%|▍         | 4/100 [00:00<00:02, 33.78it/s]

| Iteration                 |                        73 |
| Gradient Steps            |                      7300 |
| Average Train Loss        |         2.111396734714508 |
| Start Train Loss          |        2.2939674854278564 |
| Final Train Loss          |        2.4095680713653564 |
| Test Loss                 |        2.8601467561721803 |
| Test Accuracy             |                      0.18 |
| Train Accuracy            |                      0.23 |
| Time Training             |        2.8305344581604004 |
| Time Testing              |       0.24770498275756836 |


100%|██████████| 100/100 [00:02<00:00, 35.24it/s]
  4%|▍         | 4/100 [00:00<00:02, 33.97it/s]

| Iteration                 |                        74 |
| Gradient Steps            |                      7400 |
| Average Train Loss        |        2.1299198949337006 |
| Start Train Loss          |        1.9966003894805908 |
| Final Train Loss          |        2.5335474014282227 |
| Test Loss                 |         2.686529272794723 |
| Test Accuracy             |       0.25999999999999995 |
| Train Accuracy            |                      0.23 |
| Time Training             |        2.8397111892700195 |
| Time Testing              |       0.24036812782287598 |


100%|██████████| 100/100 [00:02<00:00, 35.66it/s]
  4%|▍         | 4/100 [00:00<00:02, 32.69it/s]

| Iteration                 |                        75 |
| Gradient Steps            |                      7500 |
| Average Train Loss        |         2.172837427854538 |
| Start Train Loss          |        2.8923676013946533 |
| Final Train Loss          |        1.8022761344909668 |
| Test Loss                 |         2.643463065624237 |
| Test Accuracy             |                      0.18 |
| Train Accuracy            |                     0.215 |
| Time Training             |         2.810549736022949 |
| Time Testing              |        0.2421705722808838 |


100%|██████████| 100/100 [00:02<00:00, 35.72it/s]
  4%|▍         | 4/100 [00:00<00:02, 34.81it/s]

| Iteration                 |                        76 |
| Gradient Steps            |                      7600 |
| Average Train Loss        |        2.2122940111160276 |
| Start Train Loss          |         2.503080129623413 |
| Final Train Loss          |        2.1663191318511963 |
| Test Loss                 |        2.2065637183189395 |
| Test Accuracy             |       0.25999999999999995 |
| Train Accuracy            |                     0.205 |
| Time Training             |         2.809408187866211 |
| Time Testing              |       0.23714113235473633 |


100%|██████████| 100/100 [00:02<00:00, 36.01it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.05it/s]

| Iteration                 |                        77 |
| Gradient Steps            |                      7700 |
| Average Train Loss        |        2.2312893760204315 |
| Start Train Loss          |         2.316753387451172 |
| Final Train Loss          |        1.8297263383865356 |
| Test Loss                 |        2.5126018118858333 |
| Test Accuracy             |                      0.14 |
| Train Accuracy            |                      0.17 |
| Time Training             |        2.7825658321380615 |
| Time Testing              |       0.24324345588684082 |


100%|██████████| 100/100 [00:02<00:00, 35.62it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.94it/s]

| Iteration                 |                        78 |
| Gradient Steps            |                      7800 |
| Average Train Loss        |        2.0858201867341997 |
| Start Train Loss          |        1.3624743223190308 |
| Final Train Loss          |        0.8624216914176941 |
| Test Loss                 |         2.542958676815033 |
| Test Accuracy             |                       0.3 |
| Train Accuracy            |                     0.225 |
| Time Training             |         2.816554307937622 |
| Time Testing              |       0.23579144477844238 |


100%|██████████| 100/100 [00:02<00:00, 35.80it/s]
  4%|▍         | 4/100 [00:00<00:02, 34.44it/s]

| Iteration                 |                        79 |
| Gradient Steps            |                      7900 |
| Average Train Loss        |        2.2120411157608033 |
| Start Train Loss          |        1.7705678939819336 |
| Final Train Loss          |        2.1115708351135254 |
| Test Loss                 |         2.615715103149414 |
| Test Accuracy             |                      0.26 |
| Train Accuracy            |                       0.2 |
| Time Training             |        2.8032984733581543 |
| Time Testing              |       0.23781752586364746 |


100%|██████████| 100/100 [00:02<00:00, 35.61it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.17it/s]

| Iteration                 |                        80 |
| Gradient Steps            |                      8000 |
| Average Train Loss        |        2.1603540563583374 |
| Start Train Loss          |        1.6426693201065063 |
| Final Train Loss          |        2.6489505767822266 |
| Test Loss                 |        2.1006441712379456 |
| Test Accuracy             |                      0.32 |
| Train Accuracy            |                       0.2 |
| Time Training             |         2.812765598297119 |
| Time Testing              |       0.24549341201782227 |


100%|██████████| 100/100 [00:02<00:00, 35.34it/s]
  4%|▍         | 4/100 [00:00<00:02, 35.86it/s]

| Iteration                 |                        81 |
| Gradient Steps            |                      8100 |
| Average Train Loss        |        2.1215005934238436 |
| Start Train Loss          |        1.6315038204193115 |
| Final Train Loss          |        1.7447595596313477 |
| Test Loss                 |         2.430278372764587 |
| Test Accuracy             |       0.25999999999999995 |
| Train Accuracy            |                     0.195 |
| Time Training             |        2.8391904830932617 |
| Time Testing              |        0.2558436393737793 |


100%|██████████| 100/100 [00:02<00:00, 35.60it/s]
  4%|▍         | 4/100 [00:00<00:02, 34.97it/s]

| Iteration                 |                        82 |
| Gradient Steps            |                      8200 |
| Average Train Loss        |         2.119558072090149 |
| Start Train Loss          |        1.7102997303009033 |
| Final Train Loss          |        1.8919049501419067 |
| Test Loss                 |        2.5403637886047368 |
| Test Accuracy             |       0.19999999999999998 |
| Train Accuracy            |                      0.23 |
| Time Training             |        2.8129684925079346 |
| Time Testing              |       0.24795913696289062 |


100%|██████████| 100/100 [00:02<00:00, 35.29it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.20it/s]

| Iteration                 |                        83 |
| Gradient Steps            |                      8300 |
| Average Train Loss        |        2.1080944073200225 |
| Start Train Loss          |        1.2295424938201904 |
| Final Train Loss          |         2.519840717315674 |
| Test Loss                 |        2.5480384635925293 |
| Test Accuracy             |                      0.14 |
| Train Accuracy            |                     0.245 |
| Time Training             |         2.836573362350464 |
| Time Testing              |        0.2401268482208252 |


100%|██████████| 100/100 [00:02<00:00, 35.54it/s]
  4%|▍         | 4/100 [00:00<00:02, 34.39it/s]

| Iteration                 |                        84 |
| Gradient Steps            |                      8400 |
| Average Train Loss        |         2.125350435376167 |
| Start Train Loss          |        2.0827553272247314 |
| Final Train Loss          |        1.1859779357910156 |
| Test Loss                 |        2.5690013504028326 |
| Test Accuracy             |                      0.22 |
| Train Accuracy            |                      0.26 |
| Time Training             |         2.819669246673584 |
| Time Testing              |       0.23553967475891113 |


100%|██████████| 100/100 [00:02<00:00, 35.90it/s]
  4%|▍         | 4/100 [00:00<00:02, 34.04it/s]

| Iteration                 |                        85 |
| Gradient Steps            |                      8500 |
| Average Train Loss        |        2.1971763277053835 |
| Start Train Loss          |        2.3320252895355225 |
| Final Train Loss          |        2.2987194061279297 |
| Test Loss                 |         2.239783430099487 |
| Test Accuracy             |                       0.3 |
| Train Accuracy            |                     0.205 |
| Time Training             |        2.7945995330810547 |
| Time Testing              |        0.2420825958251953 |


100%|██████████| 100/100 [00:02<00:00, 35.58it/s]
  4%|▍         | 4/100 [00:00<00:02, 33.13it/s]

| Iteration                 |                        86 |
| Gradient Steps            |                      8600 |
| Average Train Loss        |         2.124688855409622 |
| Start Train Loss          |         1.915977954864502 |
| Final Train Loss          |        1.5896661281585693 |
| Test Loss                 |        2.4355559253692634 |
| Test Accuracy             |                      0.18 |
| Train Accuracy            |                     0.195 |
| Time Training             |        2.8140976428985596 |
| Time Testing              |        0.2424318790435791 |


100%|██████████| 100/100 [00:02<00:00, 35.68it/s]
  4%|▍         | 4/100 [00:00<00:02, 34.04it/s]

| Iteration                 |                        87 |
| Gradient Steps            |                      8700 |
| Average Train Loss        |        2.2084632778167723 |
| Start Train Loss          |        2.9968159198760986 |
| Final Train Loss          |        1.6710610389709473 |
| Test Loss                 |        2.5317699956893924 |
| Test Accuracy             |       0.23999999999999996 |
| Train Accuracy            |                     0.215 |
| Time Training             |         2.810957670211792 |
| Time Testing              |       0.23022770881652832 |


100%|██████████| 100/100 [00:02<00:00, 35.68it/s]
  4%|▍         | 4/100 [00:00<00:02, 33.06it/s]

| Iteration                 |                        88 |
| Gradient Steps            |                      8800 |
| Average Train Loss        |         2.095934418439865 |
| Start Train Loss          |         1.815148115158081 |
| Final Train Loss          |        1.9285533428192139 |
| Test Loss                 |         2.125902466773987 |
| Test Accuracy             |       0.27999999999999997 |
| Train Accuracy            |                     0.255 |
| Time Training             |        2.8116512298583984 |
| Time Testing              |       0.23826026916503906 |


100%|██████████| 100/100 [00:02<00:00, 35.63it/s]
  4%|▍         | 4/100 [00:00<00:02, 33.68it/s]

| Iteration                 |                        89 |
| Gradient Steps            |                      8900 |
| Average Train Loss        |        2.1596566569805145 |
| Start Train Loss          |        1.9988560676574707 |
| Final Train Loss          |         3.060967445373535 |
| Test Loss                 |         2.323751390576362 |
| Test Accuracy             |                      0.24 |
| Train Accuracy            |                      0.22 |
| Time Training             |        2.8115272521972656 |
| Time Testing              |       0.24282431602478027 |


100%|██████████| 100/100 [00:02<00:00, 35.26it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.65it/s]

| Iteration                 |                        90 |
| Gradient Steps            |                      9000 |
| Average Train Loss        |         2.070956683754921 |
| Start Train Loss          |        1.8326339721679688 |
| Final Train Loss          |        1.6146767139434814 |
| Test Loss                 |         2.320193369388581 |
| Test Accuracy             |       0.21999999999999997 |
| Train Accuracy            |                     0.285 |
| Time Training             |         2.842226982116699 |
| Time Testing              |       0.24674773216247559 |


100%|██████████| 100/100 [00:02<00:00, 35.61it/s]
  4%|▍         | 4/100 [00:00<00:02, 33.58it/s]

| Iteration                 |                        91 |
| Gradient Steps            |                      9100 |
| Average Train Loss        |        2.0365747466683386 |
| Start Train Loss          |        1.5149340629577637 |
| Final Train Loss          |         2.293818950653076 |
| Test Loss                 |        2.1572756791114807 |
| Test Accuracy             |        0.3400000000000001 |
| Train Accuracy            |                      0.29 |
| Time Training             |        2.8137826919555664 |
| Time Testing              |        0.2423093318939209 |


100%|██████████| 100/100 [00:02<00:00, 35.70it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.37it/s]

| Iteration                 |                        92 |
| Gradient Steps            |                      9200 |
| Average Train Loss        |        2.0546149051189424 |
| Start Train Loss          |         2.097479820251465 |
| Final Train Loss          |        1.5865147113800049 |
| Test Loss                 |         2.432087540626526 |
| Test Accuracy             |       0.19999999999999998 |
| Train Accuracy            |                      0.23 |
| Time Training             |        2.8049285411834717 |
| Time Testing              |        0.2405872344970703 |


100%|██████████| 100/100 [00:02<00:00, 35.34it/s]
  4%|▍         | 4/100 [00:00<00:02, 35.04it/s]

| Iteration                 |                        93 |
| Gradient Steps            |                      9300 |
| Average Train Loss        |        2.2523505783081053 |
| Start Train Loss          |         2.825763702392578 |
| Final Train Loss          |         2.073662281036377 |
| Test Loss                 |         2.142465536594391 |
| Test Accuracy             |       0.19999999999999996 |
| Train Accuracy            |                       0.2 |
| Time Training             |         2.832864999771118 |
| Time Testing              |        0.2467813491821289 |


100%|██████████| 100/100 [00:02<00:00, 35.39it/s]
  4%|▍         | 4/100 [00:00<00:02, 35.47it/s]

| Iteration                 |                        94 |
| Gradient Steps            |                      9400 |
| Average Train Loss        |        2.1271733796596526 |
| Start Train Loss          |        2.2758188247680664 |
| Final Train Loss          |        2.0566391944885254 |
| Test Loss                 |        2.1218831849098208 |
| Test Accuracy             |                       0.3 |
| Train Accuracy            |                      0.27 |
| Time Training             |        2.8321890830993652 |
| Time Testing              |       0.24795770645141602 |


100%|██████████| 100/100 [00:02<00:00, 35.25it/s]
  4%|▍         | 4/100 [00:00<00:02, 34.43it/s]

| Iteration                 |                        95 |
| Gradient Steps            |                      9500 |
| Average Train Loss        |         2.139357957839966 |
| Start Train Loss          |        1.8123257160186768 |
| Final Train Loss          |         2.191851854324341 |
| Test Loss                 |        2.3340144157409672 |
| Test Accuracy             |       0.19999999999999998 |
| Train Accuracy            |                     0.215 |
| Time Training             |        2.8449180126190186 |
| Time Testing              |        0.2426469326019287 |


100%|██████████| 100/100 [00:02<00:00, 35.29it/s]
  4%|▍         | 4/100 [00:00<00:02, 34.67it/s]

| Iteration                 |                        96 |
| Gradient Steps            |                      9600 |
| Average Train Loss        |        2.0702919483184816 |
| Start Train Loss          |        1.6838301420211792 |
| Final Train Loss          |         1.956982135772705 |
| Test Loss                 |        2.3516430783271782 |
| Test Accuracy             |                       0.2 |
| Train Accuracy            |                       0.2 |
| Time Training             |        2.8475992679595947 |
| Time Testing              |        0.2433311939239502 |


100%|██████████| 100/100 [00:02<00:00, 35.53it/s]
  4%|▍         | 4/100 [00:00<00:02, 37.28it/s]

| Iteration                 |                        97 |
| Gradient Steps            |                      9700 |
| Average Train Loss        |         2.185906674861908 |
| Start Train Loss          |        1.2355823516845703 |
| Final Train Loss          |        2.4035003185272217 |
| Test Loss                 |        2.3427897572517393 |
| Test Accuracy             |                      0.18 |
| Train Accuracy            |                      0.21 |
| Time Training             |        2.8203234672546387 |
| Time Testing              |       0.23686838150024414 |


100%|██████████| 100/100 [00:02<00:00, 34.81it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.17it/s]

| Iteration                 |                        98 |
| Gradient Steps            |                      9800 |
| Average Train Loss        |         2.127063576579094 |
| Start Train Loss          |         2.602604389190674 |
| Final Train Loss          |        2.0949909687042236 |
| Test Loss                 |          2.50555433511734 |
| Test Accuracy             |                       0.2 |
| Train Accuracy            |                      0.21 |
| Time Training             |        2.8813512325286865 |
| Time Testing              |       0.24173808097839355 |


100%|██████████| 100/100 [00:02<00:00, 35.32it/s]
  4%|▍         | 4/100 [00:00<00:02, 35.89it/s]

| Iteration                 |                        99 |
| Gradient Steps            |                      9900 |
| Average Train Loss        |         2.152519099116325 |
| Start Train Loss          |        2.0629689693450928 |
| Final Train Loss          |        1.8181618452072144 |
| Test Loss                 |        2.3947090196609495 |
| Test Accuracy             |       0.25999999999999995 |
| Train Accuracy            |                       0.2 |
| Time Training             |         2.837556838989258 |
| Time Testing              |       0.25353121757507324 |


100%|██████████| 100/100 [00:02<00:00, 33.66it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.52it/s]

| Iteration                 |                       100 |
| Gradient Steps            |                     10000 |
| Average Train Loss        |        2.0663076788187027 |
| Start Train Loss          |        1.6601836681365967 |
| Final Train Loss          |        2.7378578186035156 |
| Test Loss                 |        2.4828372788429256 |
| Test Accuracy             |                      0.26 |
| Train Accuracy            |                     0.265 |
| Time Training             |        2.9743664264678955 |
| Time Testing              |       0.23135805130004883 |


100%|██████████| 100/100 [00:02<00:00, 35.18it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.08it/s]

| Iteration                 |                       101 |
| Gradient Steps            |                     10100 |
| Average Train Loss        |         2.134927181005478 |
| Start Train Loss          |        1.8487646579742432 |
| Final Train Loss          |        2.4069275856018066 |
| Test Loss                 |        2.7210183548927307 |
| Test Accuracy             |       0.19999999999999998 |
| Train Accuracy            |                     0.225 |
| Time Training             |          2.84822678565979 |
| Time Testing              |       0.24851536750793457 |


100%|██████████| 100/100 [00:02<00:00, 34.96it/s]
  4%|▍         | 4/100 [00:00<00:02, 35.20it/s]

| Iteration                 |                       102 |
| Gradient Steps            |                     10200 |
| Average Train Loss        |         2.102856104373932 |
| Start Train Loss          |         2.267443895339966 |
| Final Train Loss          |         2.638615608215332 |
| Test Loss                 |        2.2693365335464475 |
| Test Accuracy             |       0.30000000000000004 |
| Train Accuracy            |                     0.225 |
| Time Training             |         2.866159439086914 |
| Time Testing              |       0.23967695236206055 |


100%|██████████| 100/100 [00:02<00:00, 35.05it/s]
  4%|▍         | 4/100 [00:00<00:02, 35.53it/s]

| Iteration                 |                       103 |
| Gradient Steps            |                     10300 |
| Average Train Loss        |        2.1067797183990478 |
| Start Train Loss          |        2.2559757232666016 |
| Final Train Loss          |         1.878947377204895 |
| Test Loss                 |        2.1752682590484613 |
| Test Accuracy             |       0.27999999999999997 |
| Train Accuracy            |                      0.23 |
| Time Training             |        2.8560876846313477 |
| Time Testing              |       0.23903322219848633 |


100%|██████████| 100/100 [00:02<00:00, 35.63it/s]
  4%|▍         | 4/100 [00:00<00:02, 35.30it/s]

| Iteration                 |                       104 |
| Gradient Steps            |                     10400 |
| Average Train Loss        |        2.1619964873790742 |
| Start Train Loss          |        2.2973742485046387 |
| Final Train Loss          |        2.6310877799987793 |
| Test Loss                 |          2.41020779132843 |
| Test Accuracy             |       0.23999999999999996 |
| Train Accuracy            |                     0.235 |
| Time Training             |        2.8151092529296875 |
| Time Testing              |       0.24792027473449707 |


100%|██████████| 100/100 [00:02<00:00, 36.06it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.31it/s]

| Iteration                 |                       105 |
| Gradient Steps            |                     10500 |
| Average Train Loss        |        2.0657375264167785 |
| Start Train Loss          |        1.7524871826171875 |
| Final Train Loss          |        1.9018101692199707 |
| Test Loss                 |         2.690364789962769 |
| Test Accuracy             |       0.19999999999999998 |
| Train Accuracy            |                      0.23 |
| Time Training             |        2.7791848182678223 |
| Time Testing              |       0.24426817893981934 |


100%|██████████| 100/100 [00:02<00:00, 35.39it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.66it/s]

| Iteration                 |                       106 |
| Gradient Steps            |                     10600 |
| Average Train Loss        |         2.189466616511345 |
| Start Train Loss          |        1.8379994630813599 |
| Final Train Loss          |        3.4918158054351807 |
| Test Loss                 |          2.22297969698906 |
| Test Accuracy             |                       0.3 |
| Train Accuracy            |                     0.195 |
| Time Training             |        2.8303115367889404 |
| Time Testing              |       0.25512146949768066 |


100%|██████████| 100/100 [00:02<00:00, 35.96it/s]
  4%|▍         | 4/100 [00:00<00:02, 35.23it/s]

| Iteration                 |                       107 |
| Gradient Steps            |                     10700 |
| Average Train Loss        |        2.1178605973720552 |
| Start Train Loss          |        1.4703404903411865 |
| Final Train Loss          |         2.376796007156372 |
| Test Loss                 |         2.855514998435974 |
| Test Accuracy             |       0.21999999999999997 |
| Train Accuracy            |                      0.26 |
| Time Training             |        2.7913904190063477 |
| Time Testing              |         0.242218017578125 |


100%|██████████| 100/100 [00:02<00:00, 35.86it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.14it/s]

| Iteration                 |                       108 |
| Gradient Steps            |                     10800 |
| Average Train Loss        |         2.170573031306267 |
| Start Train Loss          |         3.153123378753662 |
| Final Train Loss          |         1.413857340812683 |
| Test Loss                 |        2.3199185585975646 |
| Test Accuracy             |                      0.22 |
| Train Accuracy            |                     0.255 |
| Time Training             |         2.791383743286133 |
| Time Testing              |         0.245711088180542 |


100%|██████████| 100/100 [00:02<00:00, 36.12it/s]
  4%|▍         | 4/100 [00:00<00:02, 36.19it/s]

| Iteration                 |                       109 |
| Gradient Steps            |                     10900 |
| Average Train Loss        |         2.124574544429779 |
| Start Train Loss          |        1.7332072257995605 |
| Final Train Loss          |        1.5217409133911133 |
| Test Loss                 |         2.214373526573181 |
| Test Accuracy             |       0.27999999999999997 |
| Train Accuracy            |                     0.225 |
| Time Training             |         2.772923707962036 |
| Time Testing              |       0.24040460586547852 |


100%|██████████| 100/100 [00:02<00:00, 35.68it/s]


| Iteration                 |                       110 |
| Gradient Steps            |                     11000 |
| Average Train Loss        |        2.1546115827560426 |
| Start Train Loss          |         2.210190773010254 |
| Final Train Loss          |        0.8391932249069214 |
| Test Loss                 |        2.5116723954677584 |
| Test Accuracy             |                      0.26 |
| Train Accuracy            |                      0.23 |
| Time Training             |        2.8117868900299072 |
| Time Testing              |       0.23453354835510254 |
