# Dataset

In [3]:
class Dataset:
    """Base class for datasets that produce batches as numpy arrays.

    Subclasses implement ``get_batch_np``; ``get_batch`` converts its result
    to torch tensors on ``self.device``. ``_ind`` counts the batches served
    since the last call to ``start_epoch``.
    """

    def __init__(self, device='cpu'):
        self.device = device
        self._ind = 0

    def get_batch(self, batch_size, train=True):
        """Return one ``(x, y)`` batch as torch tensors on ``self.device``.

        ``x`` is cast to float32 and ``y`` to long (int64).
        """
        inputs_np, targets_np = self.get_batch_np(batch_size, train=train)

        inputs = torch.from_numpy(inputs_np)
        inputs = inputs.to(device=self.device, dtype=torch.float32)

        targets = torch.from_numpy(targets_np)
        targets = targets.to(device=self.device, dtype=torch.long)

        self._ind = self._ind + 1
        return inputs, targets

    def get_batch_np(self, batch_size, train):
        """Return one ``(x, y)`` batch as numpy arrays; subclasses must override."""
        raise NotImplementedError

    def start_epoch(self):
        """Reset the served-batch counter at the beginning of an epoch."""
        self._ind = 0

In [7]:
#pip install einops

In [49]:
from einops import rearrange
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as transforms


In [9]:
class MNISTDataset(Dataset):
    """MNIST batches served from torchvision through fixed-size DataLoaders.

    The batch size is fixed at construction time because the underlying
    ``DataLoader`` objects are built once; the ``batch_size`` argument of
    ``get_batch`` is accepted for interface compatibility but not used.
    When ``patch_size`` is given, every image is cut into a sequence of
    flattened ``patch_size x patch_size`` patches.
    """

    def __init__(self, batch_size, patch_size=None, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.batch_size = batch_size  # fixed here so a DataLoader can be used
        self.patch_size = patch_size  # side length of each square patch

        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=0., std=1.),
        ])

        # Build the train loader first, then the test loader.
        loaders = {}
        for is_train in (True, False):
            split = torchvision.datasets.MNIST(
                'data/mnist', download=True, train=is_train, transform=transform,
            )
            loaders[is_train] = DataLoader(
                split, batch_size=batch_size, drop_last=True, shuffle=True,
            )
        self.d_train = loaders[True]
        self.d_test = loaders[False]

        # Long-lived iterators; get_batch restarts one when it runs dry.
        self.train_enum = enumerate(self.d_train)
        self.test_enum = enumerate(self.d_test)

    def get_batch(self, batch_size=None, train=True):
        """Return the next ``(x, y)`` batch from the train or test split.

        When the chosen split's iterator is exhausted it is recreated from
        its DataLoader and the first batch of the new pass is returned.
        Both tensors are moved to ``self.device``.
        """
        if train:
            enum_attr, loader = 'train_enum', self.d_train
        else:
            enum_attr, loader = 'test_enum', self.d_test

        try:
            _, (x, y) = next(getattr(self, enum_attr))
        except StopIteration:
            # Split exhausted: start a fresh pass over the loader.
            setattr(self, enum_attr, enumerate(loader))
            _, (x, y) = next(getattr(self, enum_attr))

        if self.patch_size is not None:
            # (b, c, H, W) -> (b, number of patches, flattened patch)
            x = rearrange(
                x,
                'b c (h p1) (w p2) -> b (h w) (p1 p2 c)',
                p1=self.patch_size,
                p2=self.patch_size,
            )

        x = x.to(device=self.device)
        y = y.to(device=self.device)

        self._ind += 1

        return x, y

# I - FPT - Universal Computation

In [10]:
import torch
import torch.nn as nn

In [11]:
class FPT(nn.Module):
    """GPT-2 transformer sandwiched between an input and an output network.

    The input network maps raw features (or token ids) to the transformer's
    embedding width, the transformer processes the resulting sequence via
    ``inputs_embeds``, and the output network maps hidden states to
    ``output_dim`` values. Flags control which parameter groups are trained.

    Args:
        input_dim: Feature size consumed by the input network, or the number
            of embeddings when ``use_embeddings_for_in`` is True.
        output_dim: Size of the output network's final layer.
        model_name: One of 'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'.
        pretrained: Load pretrained weights if True; otherwise build a
            randomly initialised model with the same configuration.
        return_last_only: Keep only the trailing position(s) of the
            transformer output before applying the output network.
        use_embeddings_for_in: Use ``nn.Embedding`` as the input network
            instead of linear layers.
        in_layer_sizes: Optional hidden sizes for an MLP input network.
        out_layer_sizes: Optional hidden sizes for an MLP output network.
        freeze_trans: If True, apply the per-group freeze flags below to the
            transformer; if False, transformer parameters are left as built.
        freeze_in / freeze_out: Freeze the input / output network.
        freeze_pos / freeze_ln / freeze_attn / freeze_ff: Freeze transformer
            parameters whose names contain 'wpe' / 'ln' / 'attn' / 'mlp'.
        dropout: Dropout probability used in the input and output networks.
        orth_gain: Gain for orthogonal init of input linear layers, or None
            to keep PyTorch's default initialisation.

    Raises:
        NotImplementedError: If ``model_name`` does not contain 'gpt'.
    """

    def __init__(
            self,
            input_dim,
            output_dim,
            model_name='gpt2',
            pretrained=False,
            return_last_only=True,
            use_embeddings_for_in=False,
            in_layer_sizes=None,
            out_layer_sizes=None,
            freeze_trans=True,
            freeze_in=False,
            freeze_pos=False,
            freeze_ln=False,
            freeze_attn=True,
            freeze_ff=True,
            freeze_out=False,
            dropout=0.1,
            orth_gain=1.41,
    ):
        super().__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.model_name = model_name
        self.return_last_only = return_last_only
        self.use_embeddings_for_in = use_embeddings_for_in

        self.in_layer_sizes = [] if in_layer_sizes is None else in_layer_sizes
        self.out_layer_sizes = [] if out_layer_sizes is None else out_layer_sizes
        self.dropout = dropout

        if 'gpt' in model_name:
            assert model_name in ['gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl']

            from transformers import GPT2Config, GPT2Model

            if pretrained:
                self.transformer = GPT2Model.from_pretrained(model_name)
            else:
                # Only the architecture is needed for a randomly initialised
                # model, so fetch the config alone rather than loading (and
                # then discarding) the full pretrained weights.
                self.transformer = GPT2Model(GPT2Config.from_pretrained(model_name))

            # Read the width from the config instead of a per-name table
            # (768 / 1024 / 1280 / 1600 for the four supported checkpoints).
            embedding_size = self.transformer.config.n_embd

        else:
            raise NotImplementedError('model_name not implemented')

        if use_embeddings_for_in:
            self.in_net = nn.Embedding(input_dim, embedding_size)
        else:
            in_layers = []
            last_output_size = input_dim
            for size in self.in_layer_sizes:
                layer = nn.Linear(last_output_size, size)
                if orth_gain is not None:
                    torch.nn.init.orthogonal_(layer.weight, gain=orth_gain)
                layer.bias.data.zero_()

                in_layers.append(layer)
                in_layers.append(nn.ReLU())
                in_layers.append(nn.Dropout(dropout))
                last_output_size = size

            # Final projection to the transformer's embedding width.
            final_linear = nn.Linear(last_output_size, embedding_size)
            if orth_gain is not None:
                torch.nn.init.orthogonal_(final_linear.weight, gain=orth_gain)
            final_linear.bias.data.zero_()

            in_layers.append(final_linear)
            in_layers.append(nn.Dropout(dropout))

            self.in_net = nn.Sequential(*in_layers)

        out_layers = []
        last_output_size = embedding_size
        for size in self.out_layer_sizes:
            out_layers.append(nn.Linear(last_output_size, size))
            out_layers.append(nn.ReLU())
            out_layers.append(nn.Dropout(dropout))
            last_output_size = size
        out_layers.append(nn.Linear(last_output_size, output_dim))
        self.out_net = nn.Sequential(*out_layers)

        if freeze_trans:
            # The first matching name pattern decides; transformer parameters
            # matching none of the patterns are always frozen.
            for name, p in self.transformer.named_parameters():
                name = name.lower()
                if 'ln' in name:
                    p.requires_grad = not freeze_ln
                elif 'wpe' in name:
                    p.requires_grad = not freeze_pos
                elif 'mlp' in name:
                    p.requires_grad = not freeze_ff
                elif 'attn' in name:
                    p.requires_grad = not freeze_attn
                else:
                    p.requires_grad = False
        if freeze_in:
            for p in self.in_net.parameters():
                p.requires_grad = False
        if freeze_out:
            for p in self.out_net.parameters():
                p.requires_grad = False

    def forward(self, x, output_attentions=False):
        """Run the input network, the transformer and the output network.

        If the last dimension of ``x`` is a multiple ``ratio`` of
        ``input_dim`` (and embeddings are not used), each position is split
        into ``ratio`` consecutive tokens of width ``input_dim`` before the
        input network, and the corresponding outputs are merged back into
        one position of width ``ratio * output_dim`` afterwards.

        Args:
            x: Input tensor; the splitting branch indexes it as
                (batch, sequence, features).
            output_attentions: Also return the transformer's attentions.

        Returns:
            The output tensor, or ``(output, attentions)`` when
            ``output_attentions`` is True.

        Raises:
            ValueError: If the last dimension of ``x`` differs from
                ``input_dim`` and is not a multiple of it.
        """

        orig_dim = x.shape[-1]
        if orig_dim != self.input_dim and not self.use_embeddings_for_in:
            if orig_dim % self.input_dim != 0:
                raise ValueError('dimension of x must be divisible by patch size')
            ratio = orig_dim // self.input_dim
            x = x.reshape(x.shape[0], x.shape[1] * ratio, self.input_dim)
        else:
            ratio = 1

        x = self.in_net(x)

        transformer_outputs = self.transformer(
            inputs_embeds=x,
            return_dict=True,
            output_attentions=output_attentions,
        )
        x = transformer_outputs.last_hidden_state

        if self.return_last_only:
            # Keep the last `ratio` tokens, i.e. the final original position.
            x = x[:, -ratio:]

        x = self.out_net(x)
        if self.return_last_only and ratio > 1:
            # Merge the `ratio` split tokens back into a single position.
            x = x.reshape(x.shape[0], x.shape[1] // ratio, ratio * self.output_dim)

        if output_attentions:
            return x, transformer_outputs.attentions
        else:
            return x

# II - Trainer - Universal Computation

In [12]:
import torch
from tqdm import tqdm

import time

In [13]:
class Trainer:
    """Runs training/evaluation epochs for a model and records diagnostics.

    Args:
        model: Module being trained; called as ``model(x)``.
        dataset: Object providing ``start_epoch()`` and
            ``get_batch(batch_size, train=...)`` returning ``(x, y)``.
        loss_fn: Callable ``loss_fn(out, y, x=x)`` returning a scalar loss.
        accuracy_fn: Optional callable ``accuracy_fn(out_np, y_np, x=x_np)``
            on numpy arrays. When omitted, epochs still run but no accuracy
            entries are written to ``diagnostics``.
        steps_per_epoch: Optimizer steps per call to ``train_epoch``.
        test_steps_per_epoch: Default number of evaluation batches per epoch.
        learning_rate: Learning rate for the Adam optimizer.
        batch_size: Batch size requested from the dataset while training.
        eval_batch_size: Batch size requested from the dataset while testing.
        grad_accumulate: Number of batches accumulated per optimizer step.
    """

    def __init__(
            self,
            model,
            dataset,
            loss_fn,
            accuracy_fn=None,
            steps_per_epoch=100,
            test_steps_per_epoch=20,
            learning_rate=1e-3,
            batch_size=2,
            eval_batch_size=8,
            grad_accumulate=1,
    ):
        self.model = model
        self.dataset = dataset
        self.loss_fn = loss_fn
        self.acc_fn = accuracy_fn
        self.steps_per_epoch = steps_per_epoch
        self.test_steps_per_epoch = test_steps_per_epoch
        self.batch_size = batch_size
        self.eval_batch_size = eval_batch_size
        self.grad_accumulate = grad_accumulate

        self.optim = torch.optim.Adam(model.parameters(), lr=learning_rate)

        self.diagnostics = {'Gradient Steps': 0}

    def get_loss(self, x, y, return_acc=False):
        """Compute the loss for one batch, optionally with accuracy.

        Returns:
            ``loss``, or ``(loss, accuracy)`` when ``return_acc`` is True.

        Raises:
            NotImplementedError: If ``return_acc`` is True but no accuracy
                function was supplied.
        """
        out = self.model(x)
        loss = self.loss_fn(out, y, x=x)
        if return_acc:
            if self.acc_fn is None:
                raise NotImplementedError('accuracy function not specified')
            accs = self.acc_fn(
                out.detach().cpu().numpy(),
                y.detach().cpu().numpy(),
                x=x.detach().cpu().numpy(),
            )
            return loss, accs
        return loss

    def train_epoch(self, test_steps=None):
        """Run one training epoch followed by evaluation; update diagnostics.

        Args:
            test_steps: Number of evaluation batches; defaults to
                ``test_steps_per_epoch``.
        """
        self.dataset.start_epoch()

        # accuracy_fn is optional: only ask get_loss for accuracy when one
        # exists, so a Trainer built with the defaults can still train.
        track_acc = self.acc_fn is not None

        train_losses, tr_accuracy = [], 0.
        self.model.train()
        start_train_time = time.time()
        for _ in tqdm(range(self.steps_per_epoch)):
            step_loss = 0
            for _ in range(self.grad_accumulate):
                x, y = self.dataset.get_batch(self.batch_size, train=True)
                if track_acc:
                    loss, acc = self.get_loss(x, y, return_acc=True)
                    tr_accuracy += acc
                else:
                    loss = self.get_loss(x, y)
                # Scale so the accumulated gradient is the mean over batches.
                loss = loss / self.grad_accumulate
                loss.backward()
                step_loss += loss.detach().cpu().item()

            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.)
            self.optim.step()
            self.optim.zero_grad()

            self.diagnostics['Gradient Steps'] += 1

            train_losses.append(step_loss)
        end_train_time = time.time()

        test_steps = self.test_steps_per_epoch if test_steps is None else test_steps

        test_loss, accuracy = 0., 0.
        self.model.eval()
        start_test_time = time.time()
        with torch.no_grad():
            for _ in range(test_steps):
                x, y = self.dataset.get_batch(self.eval_batch_size, train=False)
                if track_acc:
                    loss, acc = self.get_loss(x, y, return_acc=True)
                    accuracy += acc / test_steps
                else:
                    loss = self.get_loss(x, y)
                test_loss += loss.detach().cpu().item() / test_steps
        end_test_time = time.time()

        self.diagnostics['Average Train Loss'] = sum(train_losses) / self.steps_per_epoch
        self.diagnostics['Start Train Loss'] = train_losses[0]
        self.diagnostics['Final Train Loss'] = train_losses[-1]
        self.diagnostics['Test Loss'] = test_loss
        if track_acc:
            self.diagnostics['Test Accuracy'] = accuracy
            self.diagnostics['Train Accuracy'] = tr_accuracy / (self.steps_per_epoch * self.grad_accumulate)
        self.diagnostics['Time Training'] = end_train_time - start_train_time
        self.diagnostics['Time Testing'] = end_test_time - start_test_time

# III - Experiment: experiment and run_experiment

In [16]:
#!pip install wandb


In [50]:
import numpy as np
import torch
import wandb

import argparse
from datetime import datetime
import random
import sys



In [18]:
def experiment(
        exp_name,
        exp_args,
        **kwargs
):
    """Build the dataset, model and trainer for one task and run training.

    Args:
        exp_name: Prefix used for the run name and the W&B run/group names.
        exp_args: Dict of run-level settings. Keys read here: 'device',
            'gpu_batch_size', 'steps_per_iter', 'test_steps_per_iter',
            'num_iters', 'log_to_wandb', 'wandb_project', 'save_models',
            and (when saving) 'save_models_every'.
        **kwargs: Task and model settings. 'task', 'batch_size',
            'patch_size', 'learning_rate', 'dropout' and 'orth_gain' are
            required; bit tasks also need 'n' and 'num_patterns'. Remaining
            model options fall back to defaults via ``kwargs.get``.

    Raises:
        NotImplementedError: If the task is not recognised.
    """
    import os

    # ---- Preliminary checks ----

    # Must be able to accumulate gradient if batch size is large
    assert 'batch_size' in kwargs
    assert kwargs['batch_size'] <= exp_args['gpu_batch_size'] or \
           kwargs['batch_size'] % exp_args['gpu_batch_size'] == 0

    # ---- Create dataset, model, and trainer ----

    task = kwargs['task']
    batch_size = kwargs['batch_size']
    patch_size = kwargs['patch_size']
    device = exp_args['device']

    return_last_only = True

    if task == 'bit-memory':
        from universal_computation.datasets.bit_memory import BitMemoryDataset
        dataset = BitMemoryDataset(n=kwargs['n'], num_patterns=kwargs['num_patterns'], device=device)
        input_dim = kwargs['n'] if patch_size is None else patch_size
        output_dim = 2*kwargs['n'] if patch_size is None else 2 * patch_size
        use_embeddings = False
        experiment_type = 'classification'

    elif task == 'bit-xor':
        from universal_computation.datasets.bit_xor import BitXORDataset
        dataset = BitXORDataset(n=kwargs['n'], num_patterns=kwargs['num_patterns'], device=device)
        input_dim = kwargs['n'] if patch_size is None else patch_size
        output_dim = 2 * kwargs['n'] if patch_size is None else 2 * patch_size
        use_embeddings = False
        experiment_type = 'classification'

    elif task == 'mnist':
        #from universal_computation.datasets.mnist import MNISTDataset
        # NOTE(review): the MNISTDataset defined in this file ignores the
        # batch size the trainer passes to get_batch, so with
        # batch_size > gpu_batch_size every accumulation micro-step still
        # loads `batch_size` samples - confirm this is intended.
        dataset = MNISTDataset(batch_size=batch_size, patch_size=patch_size, device=device)
        input_dim, output_dim = patch_size ** 2, 10
        use_embeddings = False
        experiment_type = 'classification'

    elif task == 'cifar10':
        from universal_computation.datasets.cifar10 import CIFAR10Dataset
        dataset = CIFAR10Dataset(batch_size=batch_size, patch_size=patch_size, device=device)
        input_dim, output_dim = 3 * patch_size**2, 10
        use_embeddings = False
        experiment_type = 'classification'

    elif task == 'cifar10-gray':
        from universal_computation.datasets.cifar10_gray import CIFAR10GrayDataset
        dataset = CIFAR10GrayDataset(batch_size=batch_size, patch_size=patch_size, device=device)
        input_dim, output_dim = patch_size**2, 10
        use_embeddings = False
        experiment_type = 'classification'

    elif task == 'listops':
        from universal_computation.datasets.listops import ListopsDataset
        dataset = ListopsDataset(batch_size=batch_size, device=device)
        input_dim, output_dim = 15, 10
        use_embeddings = True
        experiment_type = 'classification'
    else:
        raise NotImplementedError('dataset not implemented')

    if 'bit' in task:

        ce_loss = torch.nn.CrossEntropyLoss()

        def loss_fn(out, y, x=None):
            # Two logits per bit; targets are mapped to class ids 0/1.
            out = torch.reshape(out, (-1, kwargs['n'], 2))
            ids = torch.zeros(y.shape).to(device=y.device).long()
            if task == 'bit-memory':
                ids[y < 0], ids[y > 0] = 0, 1
            else:
                ids[y < 0.5], ids[y > 0.5] = 0, 1
            out, ids = torch.reshape(out, (-1, 2)), torch.reshape(ids, (-1,))
            return ce_loss(out, ids)

        def accuracy_fn(preds, true, x=None):
            if task == 'bit-memory':
                # Map predicted class ids {0, 1} to signs {-1, +1}.
                preds = preds.reshape(-1, kwargs['n'], 2).argmax(-1) * 2 - 1
            else:
                preds = preds.reshape(-1, kwargs['n'], 2).argmax(-1)
            if task == 'bit-memory':
                return (np.sign(preds) == np.sign(true)).mean()
            else:
                return ((preds > 0.5) == (true > 0.5)).mean()

    elif experiment_type == 'classification':

        ce_loss = torch.nn.CrossEntropyLoss()

        def loss_fn(out, y, x=None):
            # Use the first (only, with return_last_only) returned position.
            out = out[:, 0]
            return ce_loss(out, y)

        def accuracy_fn(preds, true, x=None):
            preds = preds[:, 0].argmax(-1)
            return (preds == true).mean()

    else:
        raise NotImplementedError('experiment_type not recognized')

    model = FPT(
        input_dim=input_dim,
        output_dim=output_dim,
        model_name=kwargs.get('model_name', 'gpt2'),
        pretrained=kwargs.get('pretrained', True),
        return_last_only=return_last_only,
        use_embeddings_for_in=use_embeddings,
        in_layer_sizes=kwargs.get('in_layer_sizes', None),
        out_layer_sizes=kwargs.get('out_layer_sizes', None),
        freeze_trans=kwargs.get('freeze_trans', True),
        freeze_in=kwargs.get('freeze_in', False),
        freeze_pos=kwargs.get('freeze_pos', False),
        freeze_ln=kwargs.get('freeze_ln', False),
        freeze_attn=kwargs.get('freeze_attn', True),
        freeze_ff=kwargs.get('freeze_ff', True),
        freeze_out=kwargs.get('freeze_out', False),
        dropout=kwargs['dropout'],
        orth_gain=kwargs['orth_gain'],
    )
    model.to(device)

    # Batches larger than gpu_batch_size are split into gpu-sized
    # micro-batches whose gradients are accumulated.
    gpu_batch_size = exp_args['gpu_batch_size']
    trainer = Trainer(
        model,
        dataset,
        loss_fn=loss_fn,
        accuracy_fn=accuracy_fn,
        steps_per_epoch=exp_args['steps_per_iter'],
        test_steps_per_epoch=exp_args['test_steps_per_iter'],
        learning_rate=kwargs['learning_rate'],
        batch_size=gpu_batch_size if batch_size > gpu_batch_size else batch_size,
        eval_batch_size=batch_size,
        grad_accumulate=batch_size // gpu_batch_size if batch_size > gpu_batch_size else 1,
    )

    # ---- Set up logging ----

    log_to_wandb = exp_args['log_to_wandb']
    save_models = exp_args['save_models']
    wandb_project = exp_args['wandb_project']

    # Random six-digit suffix to distinguish runs sharing a name and task.
    short_name = str(random.randint(int(1e5), int(1e6) - 1))
    run_name = f'{exp_name}-{task}-{short_name}'

    if log_to_wandb:
        config = dict(
            short_name=short_name,
            run_name=run_name,
            **exp_args,
            **kwargs,
        )
        wandb.init(
            name=f'{exp_name}-{short_name}',
            group=f'{exp_name}-{task}',
            project=wandb_project,
            config=config,
        )
        wandb.watch(model)

    for t in range(exp_args['num_iters']):
        trainer.train_epoch()

        print('=' * 57)
        print(f'| Iteration {" " * 15} | {t+1:25} |')
        for k, v in trainer.diagnostics.items():
            print(f'| {k:25} | {v:25} |')

        if log_to_wandb:
            wandb.log(trainer.diagnostics)

        if save_models and ((t+1) % exp_args['save_models_every'] == 0 or
                            (t+1) == exp_args['num_iters']):
            # Create the checkpoint directory on first use so saving does
            # not fail with FileNotFoundError on a fresh checkout.
            os.makedirs('models', exist_ok=True)
            with open(f'models/{run_name}.pt', 'wb') as f:
                state_dict = dict(model=model.state_dict(), optim=trainer.optim.state_dict())
                torch.save(state_dict, f)
            print(f'Saved model at {t+1} iters: {run_name}')


In [42]:
def run_experiment(
        exp_name,
        experiment_params,
):
    """Parse run-level command-line options and launch ``experiment``.

    Unknown command-line arguments are ignored (``parse_known_args``), so
    the function can be called from environments that add their own
    arguments to ``sys.argv``.

    Args:
        exp_name: Base experiment name; prefixed with the current month-day
            when ``--include_date`` is true.
        experiment_params: Dict of task/model settings forwarded to
            ``experiment`` as keyword arguments. It is not modified.
    """

    def str2bool(value):
        # argparse's type=bool treats every non-empty string (including
        # 'False') as True, so boolean flags are parsed explicitly.
        if isinstance(value, bool):
            return value
        lowered = value.strip().lower()
        if lowered in ('true', 't', 'yes', 'y', '1'):
            return True
        if lowered in ('false', 'f', 'no', 'n', '0', ''):
            return False
        raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')

    parser = argparse.ArgumentParser()

    parser.add_argument('--num_iters', '-it', type=int, default=110,
                        help='Number of iterations for trainer')
    parser.add_argument('--steps_per_iter', type=int, default=100,
                        help='Number of gradient steps per iteration')
    parser.add_argument('--test_steps_per_iter', type=int, default=25,
                        help='Number of test gradient steps per iteration')

    parser.add_argument('--log_to_wandb', '-w', type=str2bool, default=False,
                        help='Whether or not to log to Weights and Biases')
    parser.add_argument('--note', '-n', type=str, default='',
                        help='An optional note to be logged to W&B')
    parser.add_argument('--wandb_project', type=str, default='my_project',
                        help='Project name for W&B')
    parser.add_argument('--include_date', type=str2bool, default=True,
                        help='Whether to include date in run name')

    parser.add_argument('--save_models', '-s', type=str2bool, default=False,
                        help='Whether or not to save the model files locally')
    parser.add_argument('--save_models_every', '-int', type=int, default=25,
                        help='How often to save models locally')

    parser.add_argument('--device', '-d', type=str, default='cuda',
                        help='Which device for Pytorch to use')
    parser.add_argument('--gpu_batch_size', '-gbs', type=int, default=16,
                        help='Max batch size to put on GPU (used for gradient accumulation)')

    #exp_args = parser.parse_args(sys.argv[1:])
    exp_args, unknown = parser.parse_known_args(sys.argv[1:])

    if exp_args.include_date:
        timestamp = datetime.now().strftime('%m-%d')
        exp_name = f'{timestamp}-{exp_name}'

    # Work on a copy so the caller's dict is left untouched and can be
    # reused for further runs.
    params = dict(experiment_params)
    params['exp_name'] = exp_name
    params['exp_args'] = vars(exp_args)

    experiment(**params)

# Run.

In [43]:
# Task and model settings for the MNIST run; forwarded to `experiment`
# as keyword arguments by `run_experiment`.
experiment_params0 = {
    # --- task ---
    'task': 'mnist',
    'n': 1000,                  # ignored if not a bit task
    'num_patterns': 5,          # ignored if not a bit task
    'patch_size': 28,

    # --- transformer ---
    'model_name': 'gpt2',
    'pretrained': True,

    # --- which parameter groups are frozen ---
    'freeze_trans': True,       # if False, we don't check arguments other than in and out
    'freeze_in': False,
    'freeze_pos': False,
    'freeze_ln': False,
    'freeze_attn': True,
    'freeze_ff': True,
    'freeze_out': False,

    # --- optional MLP sizes for the input/output networks ---
    'in_layer_sizes': None,     # not in paper, but can specify layer sizes for an MLP,
    'out_layer_sizes': None,    # ex. [32, 32] creates a 2-layer MLP with dimension 32

    # --- optimisation ---
    'learning_rate': 1e-3,
    'batch_size': 2,
    'dropout': 0.1,
    'orth_gain': 1.41,
}

In [51]:
# Base name of the experiment; run_experiment may prefix it with the date.
experiment_name0 = 'fpt'


In [48]:
# Launch the run: parses run-level options from sys.argv and trains with
# the settings in experiment_params0.
run_experiment(experiment_name0, experiment_params0)

Some weights of GPT2Model were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 100/100 [00:04<00:00, 22.10it/s]
  3%|▎         | 3/100 [00:00<00:04, 23.85it/s]

| Iteration                 |                         1 |
| Gradient Steps            |                       100 |
| Average Train Loss        |         2.158504074215889 |
| Start Train Loss          |         4.139187812805176 |
| Final Train Loss          |        3.1829519271850586 |
| Test Loss                 |        1.7273107033967972 |
| Test Accuracy             |       0.42000000000000004 |
| Train Accuracy            |                      0.26 |
| Time Training             |        4.5273354053497314 |
| Time Testing              |        0.3831477165222168 |


100%|██████████| 100/100 [00:04<00:00, 22.98it/s]
  3%|▎         | 3/100 [00:00<00:04, 23.17it/s]

| Iteration                 |                         2 |
| Gradient Steps            |                       200 |
| Average Train Loss        |         1.363590446971357 |
| Start Train Loss          |         1.414872407913208 |
| Final Train Loss          |       0.36102357506752014 |
| Test Loss                 |        0.6793074418231846 |
| Test Accuracy             |        0.7600000000000002 |
| Train Accuracy            |                      0.53 |
| Time Training             |         4.359798431396484 |
| Time Testing              |        0.4001164436340332 |


100%|██████████| 100/100 [00:04<00:00, 23.01it/s]
  3%|▎         | 3/100 [00:00<00:04, 21.10it/s]

| Iteration                 |                         3 |
| Gradient Steps            |                       300 |
| Average Train Loss        |        0.9721134663466364 |
| Start Train Loss          |       0.25919705629348755 |
| Final Train Loss          |        0.5049428939819336 |
| Test Loss                 |        0.6107720010040795 |
| Test Accuracy             |        0.8200000000000002 |
| Train Accuracy            |                       0.7 |
| Time Training             |         4.350553750991821 |
| Time Testing              |        0.4009521007537842 |


100%|██████████| 100/100 [00:04<00:00, 22.94it/s]
  3%|▎         | 3/100 [00:00<00:04, 23.32it/s]

| Iteration                 |                         4 |
| Gradient Steps            |                       400 |
| Average Train Loss        |        0.8827099716383964 |
| Start Train Loss          |        1.4607793092727661 |
| Final Train Loss          |        1.7514389753341675 |
| Test Loss                 |        0.6092490207836819 |
| Test Accuracy             |        0.8400000000000002 |
| Train Accuracy            |                      0.79 |
| Time Training             |         4.363433599472046 |
| Time Testing              |       0.39037442207336426 |


100%|██████████| 100/100 [00:04<00:00, 22.78it/s]
  3%|▎         | 3/100 [00:00<00:04, 23.41it/s]

| Iteration                 |                         5 |
| Gradient Steps            |                       500 |
| Average Train Loss        |         1.071262854236229 |
| Start Train Loss          |       0.28652510046958923 |
| Final Train Loss          |        0.6767140030860901 |
| Test Loss                 |        0.7539098479470705 |
| Test Accuracy             |        0.8600000000000002 |
| Train Accuracy            |                      0.71 |
| Time Training             |         4.399672985076904 |
| Time Testing              |        0.3832705020904541 |


100%|██████████| 100/100 [00:04<00:00, 22.76it/s]
  3%|▎         | 3/100 [00:00<00:04, 21.99it/s]

| Iteration                 |                         6 |
| Gradient Steps            |                       600 |
| Average Train Loss        |        1.0676154598925496 |
| Start Train Loss          |         4.203566074371338 |
| Final Train Loss          |        0.2196483463048935 |
| Test Loss                 |       0.32467489176779046 |
| Test Accuracy             |        0.8800000000000002 |
| Train Accuracy            |                     0.735 |
| Time Training             |         4.399600028991699 |
| Time Testing              |        0.3976895809173584 |


100%|██████████| 100/100 [00:04<00:00, 22.86it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.98it/s]

| Iteration                 |                         7 |
| Gradient Steps            |                       700 |
| Average Train Loss        |        1.1187392731975705 |
| Start Train Loss          |     0.0026920337695628405 |
| Final Train Loss          |      0.010535411536693573 |
| Test Loss                 |        0.3870452529808972 |
| Test Accuracy             |        0.9200000000000003 |
| Train Accuracy            |                     0.745 |
| Time Training             |        4.3823230266571045 |
| Time Testing              |        0.3918476104736328 |


100%|██████████| 100/100 [00:04<00:00, 22.14it/s]
  3%|▎         | 3/100 [00:00<00:04, 20.45it/s]

| Iteration                 |                         8 |
| Gradient Steps            |                       800 |
| Average Train Loss        |        0.9406709603534545 |
| Start Train Loss          |         0.032346211373806 |
| Final Train Loss          |        0.4652498662471771 |
| Test Loss                 |        0.2732291138312847 |
| Test Accuracy             |        0.9200000000000003 |
| Train Accuracy            |                      0.79 |
| Time Training             |         4.521879434585571 |
| Time Testing              |       0.41423606872558594 |


100%|██████████| 100/100 [00:04<00:00, 21.38it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.14it/s]

| Iteration                 |                         9 |
| Gradient Steps            |                       900 |
| Average Train Loss        |        0.8123349309853984 |
| Start Train Loss          |     0.0002803017559926957 |
| Final Train Loss          |       0.08029462397098541 |
| Test Loss                 |        0.8659170554187585 |
| Test Accuracy             |        0.8600000000000002 |
| Train Accuracy            |                     0.795 |
| Time Training             |         4.681873559951782 |
| Time Testing              |        0.4236714839935303 |


100%|██████████| 100/100 [00:04<00:00, 21.39it/s]
  3%|▎         | 3/100 [00:00<00:04, 21.93it/s]

| Iteration                 |                        10 |
| Gradient Steps            |                      1000 |
| Average Train Loss        |        0.9067690661291362 |
| Start Train Loss          |       0.24736858904361725 |
| Final Train Loss          |          4.28438138961792 |
| Test Loss                 |        1.2236698045148657 |
| Test Accuracy             |        0.7200000000000002 |
| Train Accuracy            |                     0.785 |
| Time Training             |         4.681503772735596 |
| Time Testing              |        0.4145805835723877 |


100%|██████████| 100/100 [00:04<00:00, 21.62it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.58it/s]

| Iteration                 |                        11 |
| Gradient Steps            |                      1100 |
| Average Train Loss        |           0.7802486041058 |
| Start Train Loss          |       0.06808295100927353 |
| Final Train Loss          |      0.006573087070137262 |
| Test Loss                 |       0.35032464948028513 |
| Test Accuracy             |        0.9200000000000003 |
| Train Accuracy            |                      0.86 |
| Time Training             |        4.6335930824279785 |
| Time Testing              |        0.4041025638580322 |


100%|██████████| 100/100 [00:04<00:00, 21.49it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.68it/s]

| Iteration                 |                        12 |
| Gradient Steps            |                      1200 |
| Average Train Loss        |        0.8047147800933254 |
| Start Train Loss          |        2.8301150798797607 |
| Final Train Loss          |     0.0061555770225822926 |
| Test Loss                 |        0.6326809718105139 |
| Test Accuracy             |        0.9000000000000002 |
| Train Accuracy            |                     0.845 |
| Time Training             |        4.6589131355285645 |
| Time Testing              |        0.4027886390686035 |


100%|██████████| 100/100 [00:04<00:00, 23.11it/s]
  3%|▎         | 3/100 [00:00<00:04, 23.23it/s]

| Iteration                 |                        13 |
| Gradient Steps            |                      1300 |
| Average Train Loss        |         0.590535694574071 |
| Start Train Loss          |       0.00868721678853035 |
| Final Train Loss          |        0.0208662711083889 |
| Test Loss                 |        1.0278758606930334 |
| Test Accuracy             |        0.8400000000000003 |
| Train Accuracy            |                     0.845 |
| Time Training             |         4.335686206817627 |
| Time Testing              |        0.4048893451690674 |


100%|██████████| 100/100 [00:04<00:00, 23.06it/s]
  3%|▎         | 3/100 [00:00<00:04, 21.39it/s]

| Iteration                 |                        14 |
| Gradient Steps            |                      1400 |
| Average Train Loss        |        0.8928900271381472 |
| Start Train Loss          |       0.17791853845119476 |
| Final Train Loss          |     0.0005462787230499089 |
| Test Loss                 |        1.0424740792660852 |
| Test Accuracy             |        0.8200000000000002 |
| Train Accuracy            |                      0.83 |
| Time Training             |         4.346104383468628 |
| Time Testing              |        0.3958628177642822 |


100%|██████████| 100/100 [00:04<00:00, 23.29it/s]
  3%|▎         | 3/100 [00:00<00:04, 24.00it/s]

| Iteration                 |                        15 |
| Gradient Steps            |                      1500 |
| Average Train Loss        |        0.7746805836025931 |
| Start Train Loss          |        0.7240534424781799 |
| Final Train Loss          |      0.000551339762751013 |
| Test Loss                 |        1.5459948122182319 |
| Test Accuracy             |        0.8400000000000002 |
| Train Accuracy            |                     0.845 |
| Time Training             |        4.2977211475372314 |
| Time Testing              |        0.3880150318145752 |


100%|██████████| 100/100 [00:04<00:00, 22.95it/s]
  3%|▎         | 3/100 [00:00<00:04, 23.65it/s]

| Iteration                 |                        16 |
| Gradient Steps            |                      1600 |
| Average Train Loss        |        1.0012748881028664 |
| Start Train Loss          |         9.375897407531738 |
| Final Train Loss          |        1.3595391511917114 |
| Test Loss                 |         0.594485704928611 |
| Test Accuracy             |        0.8600000000000003 |
| Train Accuracy            |                      0.82 |
| Time Training             |         4.363073110580444 |
| Time Testing              |        0.3899993896484375 |


100%|██████████| 100/100 [00:04<00:00, 23.30it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.56it/s]

| Iteration                 |                        17 |
| Gradient Steps            |                      1700 |
| Average Train Loss        |        0.6003553303521673 |
| Start Train Loss          |     0.0010434520663693547 |
| Final Train Loss          |       0.17555555701255798 |
| Test Loss                 |       0.30790144984280343 |
| Test Accuracy             |        0.9200000000000003 |
| Train Accuracy            |                     0.865 |
| Time Training             |         4.298399925231934 |
| Time Testing              |        0.3861212730407715 |


100%|██████████| 100/100 [00:04<00:00, 23.29it/s]
  3%|▎         | 3/100 [00:00<00:04, 23.26it/s]

| Iteration                 |                        18 |
| Gradient Steps            |                      1800 |
| Average Train Loss        |        0.5816986539564248 |
| Start Train Loss          |     0.0013656096998602152 |
| Final Train Loss          |     6.675681106571574e-06 |
| Test Loss                 |        0.6783920769474936 |
| Test Accuracy             |        0.8600000000000002 |
| Train Accuracy            |                      0.84 |
| Time Training             |         4.302639007568359 |
| Time Testing              |       0.38394927978515625 |


100%|██████████| 100/100 [00:04<00:00, 23.07it/s]
  3%|▎         | 3/100 [00:00<00:04, 23.71it/s]

| Iteration                 |                        19 |
| Gradient Steps            |                      1900 |
| Average Train Loss        |        0.6477606842673816 |
| Start Train Loss          |     0.0011618341086432338 |
| Final Train Loss          |       0.01834496296942234 |
| Test Loss                 |        0.3695951115779997 |
| Test Accuracy             |        0.9200000000000003 |
| Train Accuracy            |                     0.855 |
| Time Training             |        4.3433942794799805 |
| Time Testing              |       0.39621901512145996 |


100%|██████████| 100/100 [00:04<00:00, 23.67it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.91it/s]

| Iteration                 |                        20 |
| Gradient Steps            |                      2000 |
| Average Train Loss        |        0.6369330317947183 |
| Start Train Loss          |        0.2279832512140274 |
| Final Train Loss          |      8.94061759026954e-06 |
| Test Loss                 |       0.10218304886762752 |
| Test Accuracy             |        0.9600000000000003 |
| Train Accuracy            |                      0.89 |
| Time Training             |         4.230026721954346 |
| Time Testing              |       0.39623093605041504 |


100%|██████████| 100/100 [00:04<00:00, 23.06it/s]
  3%|▎         | 3/100 [00:00<00:04, 21.43it/s]

| Iteration                 |                        21 |
| Gradient Steps            |                      2100 |
| Average Train Loss        |        0.8854583932954262 |
| Start Train Loss          |       0.13057468831539154 |
| Final Train Loss          |     0.0029514131601899862 |
| Test Loss                 |        0.5800037728392909 |
| Test Accuracy             |        0.9000000000000002 |
| Train Accuracy            |                     0.885 |
| Time Training             |         4.339056491851807 |
| Time Testing              |       0.39252305030822754 |


100%|██████████| 100/100 [00:04<00:00, 23.21it/s]
  3%|▎         | 3/100 [00:00<00:04, 20.78it/s]

| Iteration                 |                        22 |
| Gradient Steps            |                      2200 |
| Average Train Loss        |         0.694309530253907 |
| Start Train Loss          |      0.014295464381575584 |
| Final Train Loss          |     0.0015178528847172856 |
| Test Loss                 |        0.4962310853313102 |
| Test Accuracy             |        0.9400000000000004 |
| Train Accuracy            |                      0.84 |
| Time Training             |         4.312457084655762 |
| Time Testing              |        0.3977537155151367 |


100%|██████████| 100/100 [00:04<00:00, 22.85it/s]
  3%|▎         | 3/100 [00:00<00:04, 23.87it/s]

| Iteration                 |                        23 |
| Gradient Steps            |                      2300 |
| Average Train Loss        |        0.5872951851950432 |
| Start Train Loss          |       0.04068422690033913 |
| Final Train Loss          |       0.11969772726297379 |
| Test Loss                 |        0.8889497394663887 |
| Test Accuracy             |        0.8600000000000002 |
| Train Accuracy            |                     0.875 |
| Time Training             |        4.3825578689575195 |
| Time Testing              |       0.40209174156188965 |


100%|██████████| 100/100 [00:04<00:00, 23.10it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.54it/s]

| Iteration                 |                        24 |
| Gradient Steps            |                      2400 |
| Average Train Loss        |        0.5215551094688021 |
| Start Train Loss          |       0.21309322118759155 |
| Final Train Loss          |    0.00012241366493981332 |
| Test Loss                 |       0.30183873949082546 |
| Test Accuracy             |        0.9200000000000003 |
| Train Accuracy            |                     0.875 |
| Time Training             |         4.335228681564331 |
| Time Testing              |        0.3951530456542969 |


100%|██████████| 100/100 [00:04<00:00, 22.96it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.17it/s]

| Iteration                 |                        25 |
| Gradient Steps            |                      2500 |
| Average Train Loss        |        0.5402963693231407 |
| Start Train Loss          |       0.01484229788184166 |
| Final Train Loss          |       0.04048703610897064 |
| Test Loss                 |       0.23210511992116606 |
| Test Accuracy             |        0.9400000000000003 |
| Train Accuracy            |                     0.875 |
| Time Training             |         4.361574411392212 |
| Time Testing              |        0.4030740261077881 |


100%|██████████| 100/100 [00:04<00:00, 22.68it/s]
  3%|▎         | 3/100 [00:00<00:04, 21.20it/s]

| Iteration                 |                        26 |
| Gradient Steps            |                      2600 |
| Average Train Loss        |        0.5298458643674764 |
| Start Train Loss          |     0.0015900086145848036 |
| Final Train Loss          |      0.061447080224752426 |
| Test Loss                 |        0.9812029187343797 |
| Test Accuracy             |        0.8600000000000002 |
| Train Accuracy            |                       0.9 |
| Time Training             |         4.414049386978149 |
| Time Testing              |       0.41872453689575195 |


100%|██████████| 100/100 [00:04<00:00, 21.64it/s]
  3%|▎         | 3/100 [00:00<00:04, 20.94it/s]

| Iteration                 |                        27 |
| Gradient Steps            |                      2700 |
| Average Train Loss        |        0.6914035155616215 |
| Start Train Loss          |         5.290715217590332 |
| Final Train Loss          |       0.03966367244720459 |
| Test Loss                 |        0.6234546757232504 |
| Test Accuracy             |        0.8600000000000002 |
| Train Accuracy            |                      0.85 |
| Time Training             |         4.629016399383545 |
| Time Testing              |        0.4159224033355713 |


100%|██████████| 100/100 [00:04<00:00, 21.51it/s]
  3%|▎         | 3/100 [00:00<00:04, 20.51it/s]

| Iteration                 |                        28 |
| Gradient Steps            |                      2800 |
| Average Train Loss        |        0.5593797428190196 |
| Start Train Loss          |     0.0012758771190419793 |
| Final Train Loss          |      0.006139675620943308 |
| Test Loss                 |        0.7809731260087891 |
| Test Accuracy             |        0.8600000000000002 |
| Train Accuracy            |                     0.865 |
| Time Training             |         4.656286001205444 |
| Time Testing              |        0.4069514274597168 |


100%|██████████| 100/100 [00:04<00:00, 21.24it/s]
  2%|▏         | 2/100 [00:00<00:05, 18.02it/s]

| Iteration                 |                        29 |
| Gradient Steps            |                      2900 |
| Average Train Loss        |        0.7475114318334767 |
| Start Train Loss          |          2.07116961479187 |
| Final Train Loss          |     2.157661583623849e-05 |
| Test Loss                 |        0.7331364634278816 |
| Test Accuracy             |        0.8600000000000002 |
| Train Accuracy            |                      0.86 |
| Time Training             |         4.716244220733643 |
| Time Testing              |        0.4247410297393799 |


100%|██████████| 100/100 [00:04<00:00, 21.26it/s]
  2%|▏         | 2/100 [00:00<00:05, 19.55it/s]

| Iteration                 |                        30 |
| Gradient Steps            |                      3000 |
| Average Train Loss        |        0.7360029203944669 |
| Start Train Loss          |     0.0014771526912227273 |
| Final Train Loss          |     0.0023372448049485683 |
| Test Loss                 |        0.6533265881472833 |
| Test Accuracy             |        0.9000000000000002 |
| Train Accuracy            |                      0.84 |
| Time Training             |         4.713497638702393 |
| Time Testing              |        0.4161076545715332 |


100%|██████████| 100/100 [00:04<00:00, 21.83it/s]
  2%|▏         | 2/100 [00:00<00:04, 19.87it/s]

| Iteration                 |                        31 |
| Gradient Steps            |                      3100 |
| Average Train Loss        |        0.5635420378498156 |
| Start Train Loss          |    1.8059896319755353e-05 |
| Final Train Loss          |    0.00012784187856595963 |
| Test Loss                 |        0.6398134033887913 |
| Test Accuracy             |        0.8600000000000002 |
| Train Accuracy            |                      0.88 |
| Time Training             |         4.586308717727661 |
| Time Testing              |       0.40503692626953125 |


100%|██████████| 100/100 [00:04<00:00, 22.79it/s]
  3%|▎         | 3/100 [00:00<00:04, 23.37it/s]

| Iteration                 |                        32 |
| Gradient Steps            |                      3200 |
| Average Train Loss        |        0.4487583846983405 |
| Start Train Loss          |      0.010103611275553703 |
| Final Train Loss          |        0.1594768464565277 |
| Test Loss                 |          0.72558002258515 |
| Test Accuracy             |        0.8800000000000002 |
| Train Accuracy            |                      0.89 |
| Time Training             |         4.391302824020386 |
| Time Testing              |        0.4030587673187256 |


100%|██████████| 100/100 [00:04<00:00, 22.58it/s]
  3%|▎         | 3/100 [00:00<00:04, 23.53it/s]

| Iteration                 |                        33 |
| Gradient Steps            |                      3300 |
| Average Train Loss        |        0.6246867608606053 |
| Start Train Loss          |    1.2040002729918342e-05 |
| Final Train Loss          |      0.002140655880793929 |
| Test Loss                 |        0.5388165070993637 |
| Test Accuracy             |        0.8800000000000002 |
| Train Accuracy            |                     0.865 |
| Time Training             |         4.440267562866211 |
| Time Testing              |       0.40177273750305176 |


100%|██████████| 100/100 [00:04<00:00, 22.20it/s]
  3%|▎         | 3/100 [00:00<00:03, 24.44it/s]

| Iteration                 |                        34 |
| Gradient Steps            |                      3400 |
| Average Train Loss        |        0.6445240555620795 |
| Start Train Loss          |        3.6368067264556885 |
| Final Train Loss          |       0.26837554574012756 |
| Test Loss                 |        0.1451800411183376 |
| Test Accuracy             |        0.9200000000000003 |
| Train Accuracy            |                      0.87 |
| Time Training             |         4.513893365859985 |
| Time Testing              |       0.39949941635131836 |


100%|██████████| 100/100 [00:04<00:00, 23.04it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.48it/s]

| Iteration                 |                        35 |
| Gradient Steps            |                      3500 |
| Average Train Loss        |        0.3968741650044981 |
| Start Train Loss          |       0.02664702758193016 |
| Final Train Loss          |      0.002504505682736635 |
| Test Loss                 |       0.34965085011808605 |
| Test Accuracy             |        0.9400000000000003 |
| Train Accuracy            |                     0.885 |
| Time Training             |         4.348989725112915 |
| Time Testing              |       0.39789605140686035 |


100%|██████████| 100/100 [00:04<00:00, 22.22it/s]
  3%|▎         | 3/100 [00:00<00:04, 24.00it/s]

| Iteration                 |                        36 |
| Gradient Steps            |                      3600 |
| Average Train Loss        |        0.6500038141227245 |
| Start Train Loss          |       0.07169856876134872 |
| Final Train Loss          |      0.002016649581491947 |
| Test Loss                 |       0.24892487645194478 |
| Test Accuracy             |        0.9400000000000003 |
| Train Accuracy            |                     0.895 |
| Time Training             |         4.506268501281738 |
| Time Testing              |        0.3858375549316406 |


100%|██████████| 100/100 [00:04<00:00, 22.92it/s]
  3%|▎         | 3/100 [00:00<00:04, 23.53it/s]

| Iteration                 |                        37 |
| Gradient Steps            |                      3700 |
| Average Train Loss        |        0.6444311140285515 |
| Start Train Loss          |    0.00034886723733507097 |
| Final Train Loss          |       3.7370715290308e-05 |
| Test Loss                 |       0.29896411788923843 |
| Test Accuracy             |        0.9600000000000003 |
| Train Accuracy            |                      0.88 |
| Time Training             |        4.3735058307647705 |
| Time Testing              |       0.40787839889526367 |


100%|██████████| 100/100 [00:04<00:00, 22.17it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.77it/s]

| Iteration                 |                        38 |
| Gradient Steps            |                      3800 |
| Average Train Loss        |         0.565803437509295 |
| Start Train Loss          |      0.012096486054360867 |
| Final Train Loss          |       0.03938059136271477 |
| Test Loss                 |        0.9632124155105725 |
| Test Accuracy             |        0.8800000000000002 |
| Train Accuracy            |                      0.87 |
| Time Training             |         4.518276214599609 |
| Time Testing              |       0.40561938285827637 |


100%|██████████| 100/100 [00:04<00:00, 22.33it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.26it/s]

| Iteration                 |                        39 |
| Gradient Steps            |                      3900 |
| Average Train Loss        |        0.5053138864174792 |
| Start Train Loss          |      0.015714135020971298 |
| Final Train Loss          |    5.9604641222676946e-08 |
| Test Loss                 |        0.6568064898080394 |
| Test Accuracy             |        0.9000000000000002 |
| Train Accuracy            |                      0.87 |
| Time Training             |        4.4880170822143555 |
| Time Testing              |        0.4002726078033447 |


100%|██████████| 100/100 [00:04<00:00, 22.19it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.35it/s]

| Iteration                 |                        40 |
| Gradient Steps            |                      4000 |
| Average Train Loss        |        0.7074484841790732 |
| Start Train Loss          |     0.0020412160083651543 |
| Final Train Loss          |    0.00011585806350922212 |
| Test Loss                 |       0.21668955871241452 |
| Test Accuracy             |        0.9400000000000003 |
| Train Accuracy            |                     0.885 |
| Time Training             |         4.516316175460815 |
| Time Testing              |        0.4203314781188965 |


100%|██████████| 100/100 [00:04<00:00, 22.45it/s]
  3%|▎         | 3/100 [00:00<00:04, 23.23it/s]

| Iteration                 |                        41 |
| Gradient Steps            |                      4100 |
| Average Train Loss        |        0.5306287801841291 |
| Start Train Loss          |      0.032028138637542725 |
| Final Train Loss          |     0.0034200248774141073 |
| Test Loss                 |       0.07801266197896893 |
| Test Accuracy             |        0.9800000000000003 |
| Train Accuracy            |                     0.875 |
| Time Training             |         4.463887453079224 |
| Time Testing              |       0.40178656578063965 |


100%|██████████| 100/100 [00:04<00:00, 22.18it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.99it/s]

| Iteration                 |                        42 |
| Gradient Steps            |                      4200 |
| Average Train Loss        |       0.48757299520597813 |
| Start Train Loss          |       0.14980708062648773 |
| Final Train Loss          |        0.9638267159461975 |
| Test Loss                 |        0.8856823380552578 |
| Test Accuracy             |        0.9000000000000002 |
| Train Accuracy            |                     0.885 |
| Time Training             |         4.514246702194214 |
| Time Testing              |       0.39505434036254883 |


100%|██████████| 100/100 [00:04<00:00, 22.32it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.65it/s]

| Iteration                 |                        43 |
| Gradient Steps            |                      4300 |
| Average Train Loss        |        0.5302355108358603 |
| Start Train Loss          |      0.007105561904609203 |
| Final Train Loss          |     0.0001079328139894642 |
| Test Loss                 |        0.6894567076274325 |
| Test Accuracy             |        0.8600000000000002 |
| Train Accuracy            |                     0.865 |
| Time Training             |         4.494251489639282 |
| Time Testing              |       0.40173768997192383 |


100%|██████████| 100/100 [00:04<00:00, 22.01it/s]
  3%|▎         | 3/100 [00:00<00:03, 24.66it/s]

| Iteration                 |                        44 |
| Gradient Steps            |                      4400 |
| Average Train Loss        |        0.4793397617770643 |
| Start Train Loss          |       0.01395662222057581 |
| Final Train Loss          |       0.06050682067871094 |
| Test Loss                 |      0.042691412686683625 |
| Test Accuracy             |        0.9600000000000003 |
| Train Accuracy            |                      0.89 |
| Time Training             |         4.546006679534912 |
| Time Testing              |       0.39115023612976074 |


100%|██████████| 100/100 [00:04<00:00, 22.83it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.46it/s]

| Iteration                 |                        45 |
| Gradient Steps            |                      4500 |
| Average Train Loss        |        0.3915506950245953 |
| Start Train Loss          |     2.986144318128936e-05 |
| Final Train Loss          |                       0.0 |
| Test Loss                 |        0.9364225037461983 |
| Test Accuracy             |        0.8200000000000003 |
| Train Accuracy            |                      0.89 |
| Time Training             |         4.386701583862305 |
| Time Testing              |        0.4172983169555664 |


100%|██████████| 100/100 [00:04<00:00, 22.70it/s]
  2%|▏         | 2/100 [00:00<00:05, 19.21it/s]

| Iteration                 |                        46 |
| Gradient Steps            |                      4600 |
| Average Train Loss        |        0.8135000673287358 |
| Start Train Loss          |     0.0049956622533500195 |
| Final Train Loss          |     8.498902025166899e-05 |
| Test Loss                 |         1.003407195596856 |
| Test Accuracy             |        0.8800000000000003 |
| Train Accuracy            |                     0.885 |
| Time Training             |        4.4089391231536865 |
| Time Testing              |       0.39122533798217773 |


100%|██████████| 100/100 [00:04<00:00, 22.74it/s]
  3%|▎         | 3/100 [00:00<00:04, 23.58it/s]

| Iteration                 |                        47 |
| Gradient Steps            |                      4700 |
| Average Train Loss        |       0.43879281943806914 |
| Start Train Loss          |         4.101009845733643 |
| Final Train Loss          |         1.025124430656433 |
| Test Loss                 |       0.39244073210087665 |
| Test Accuracy             |        0.9000000000000002 |
| Train Accuracy            |                     0.895 |
| Time Training             |         4.407239198684692 |
| Time Testing              |        0.3876361846923828 |


100%|██████████| 100/100 [00:04<00:00, 22.95it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.82it/s]

| Iteration                 |                        48 |
| Gradient Steps            |                      4800 |
| Average Train Loss        |        0.3418182104066622 |
| Start Train Loss          |     0.0015568058006465435 |
| Final Train Loss          |    5.3047861001687124e-06 |
| Test Loss                 |        0.5141978782933478 |
| Test Accuracy             |        0.9000000000000002 |
| Train Accuracy            |                     0.925 |
| Time Training             |         4.360589027404785 |
| Time Testing              |       0.40575528144836426 |


100%|██████████| 100/100 [00:04<00:00, 23.24it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.11it/s]

| Iteration                 |                        49 |
| Gradient Steps            |                      4900 |
| Average Train Loss        |        0.6191453185776403 |
| Start Train Loss          |      0.004611906595528126 |
| Final Train Loss          |    2.1874446247238666e-05 |
| Test Loss                 |       0.34147255379514824 |
| Test Accuracy             |        0.9600000000000003 |
| Train Accuracy            |                      0.92 |
| Time Training             |         4.308555841445923 |
| Time Testing              |         0.410064697265625 |


100%|██████████| 100/100 [00:04<00:00, 23.06it/s]
  3%|▎         | 3/100 [00:00<00:04, 23.34it/s]

| Iteration                 |                        50 |
| Gradient Steps            |                      5000 |
| Average Train Loss        |       0.43843489165451416 |
| Start Train Loss          |      0.035950638353824615 |
| Final Train Loss          |     0.0016539732459932566 |
| Test Loss                 |        0.2823339991249908 |
| Test Accuracy             |        0.9400000000000003 |
| Train Accuracy            |                      0.92 |
| Time Training             |          4.34035849571228 |
| Time Testing              |        0.4030005931854248 |


100%|██████████| 100/100 [00:04<00:00, 23.10it/s]
  2%|▏         | 2/100 [00:00<00:05, 17.84it/s]

| Iteration                 |                        51 |
| Gradient Steps            |                      5100 |
| Average Train Loss        |        0.8425110265189738 |
| Start Train Loss          |         5.086935043334961 |
| Final Train Loss          |       0.06202917546033859 |
| Test Loss                 |       0.24363232967457862 |
| Test Accuracy             |        0.9400000000000003 |
| Train Accuracy            |                     0.855 |
| Time Training             |        4.3369481563568115 |
| Time Testing              |       0.39696550369262695 |


100%|██████████| 100/100 [00:04<00:00, 22.81it/s]
  2%|▏         | 2/100 [00:00<00:04, 20.00it/s]

| Iteration                 |                        52 |
| Gradient Steps            |                      5200 |
| Average Train Loss        |        0.3375444241505525 |
| Start Train Loss          |      0.045209966599941254 |
| Final Train Loss          |    2.0265554212528514e-06 |
| Test Loss                 |        0.4196948258279552 |
| Test Accuracy             |        0.8800000000000002 |
| Train Accuracy            |                      0.92 |
| Time Training             |          4.38726544380188 |
| Time Testing              |        0.4153170585632324 |


100%|██████████| 100/100 [00:04<00:00, 22.81it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.92it/s]

| Iteration                 |                        53 |
| Gradient Steps            |                      5300 |
| Average Train Loss        |       0.37039824020970014 |
| Start Train Loss          |    0.00030369695741683245 |
| Final Train Loss          |     0.0005780845531262457 |
| Test Loss                 |       0.37288733421031994 |
| Test Accuracy             |        0.9400000000000003 |
| Train Accuracy            |                     0.905 |
| Time Training             |         4.391937732696533 |
| Time Testing              |       0.41140127182006836 |


100%|██████████| 100/100 [00:04<00:00, 22.82it/s]
  3%|▎         | 3/100 [00:00<00:04, 23.19it/s]

| Iteration                 |                        54 |
| Gradient Steps            |                      5400 |
| Average Train Loss        |         0.553364323164468 |
| Start Train Loss          |     7.670529157621786e-05 |
| Final Train Loss          |                       0.0 |
| Test Loss                 |        0.8989488970303227 |
| Test Accuracy             |        0.9400000000000003 |
| Train Accuracy            |                      0.91 |
| Time Training             |          4.38909125328064 |
| Time Testing              |         0.405179500579834 |


100%|██████████| 100/100 [00:04<00:00, 23.29it/s]
  3%|▎         | 3/100 [00:00<00:04, 21.99it/s]

| Iteration                 |                        55 |
| Gradient Steps            |                      5500 |
| Average Train Loss        |        0.4860078537051495 |
| Start Train Loss          |     0.0023754939902573824 |
| Final Train Loss          |     0.0027519301511347294 |
| Test Loss                 |        0.7650641832494216 |
| Test Accuracy             |        0.9000000000000002 |
| Train Accuracy            |                      0.91 |
| Time Training             |         4.301656246185303 |
| Time Testing              |        0.3880314826965332 |


100%|██████████| 100/100 [00:04<00:00, 22.66it/s]
  3%|▎         | 3/100 [00:00<00:03, 24.93it/s]

| Iteration                 |                        56 |
| Gradient Steps            |                      5600 |
| Average Train Loss        |        0.3829953883571832 |
| Start Train Loss          |       0.02342701330780983 |
| Final Train Loss          |      0.007150121033191681 |
| Test Loss                 |       0.26840003216345976 |
| Test Accuracy             |        0.9600000000000003 |
| Train Accuracy            |                     0.915 |
| Time Training             |          4.41904878616333 |
| Time Testing              |        0.3781003952026367 |


100%|██████████| 100/100 [00:04<00:00, 22.85it/s]
  3%|▎         | 3/100 [00:00<00:04, 23.40it/s]

| Iteration                 |                        57 |
| Gradient Steps            |                      5700 |
| Average Train Loss        |        0.5760946079098821 |
| Start Train Loss          |     0.0035620261915028095 |
| Final Train Loss          |     0.0018424324225634336 |
| Test Loss                 |        0.8037579606646907 |
| Test Accuracy             |        0.7800000000000002 |
| Train Accuracy            |                     0.865 |
| Time Training             |         4.385148525238037 |
| Time Testing              |        0.3924214839935303 |


100%|██████████| 100/100 [00:04<00:00, 22.95it/s]
  3%|▎         | 3/100 [00:00<00:04, 23.54it/s]

| Iteration                 |                        58 |
| Gradient Steps            |                      5800 |
| Average Train Loss        |        0.4935535665538754 |
| Start Train Loss          |       0.07733658701181412 |
| Final Train Loss          |    1.1980460840277374e-05 |
| Test Loss                 |        0.4282396155836842 |
| Test Accuracy             |        0.8600000000000002 |
| Train Accuracy            |                     0.895 |
| Time Training             |         4.365512371063232 |
| Time Testing              |       0.41247105598449707 |


100%|██████████| 100/100 [00:04<00:00, 22.83it/s]
  3%|▎         | 3/100 [00:00<00:04, 23.16it/s]

| Iteration                 |                        59 |
| Gradient Steps            |                      5900 |
| Average Train Loss        |        0.3694654723865611 |
| Start Train Loss          |        0.3270336985588074 |
| Final Train Loss          |    2.3841852225814364e-07 |
| Test Loss                 |        0.5449904493115537 |
| Test Accuracy             |        0.8800000000000002 |
| Train Accuracy            |                     0.895 |
| Time Training             |          4.38671088218689 |
| Time Testing              |        0.4061267375946045 |


100%|██████████| 100/100 [00:04<00:00, 23.16it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.58it/s]

| Iteration                 |                        60 |
| Gradient Steps            |                      6000 |
| Average Train Loss        |        0.6891662817315094 |
| Start Train Loss          |        3.5163912773132324 |
| Final Train Loss          |       0.09322059899568558 |
| Test Loss                 |      0.015628261296806157 |
| Test Accuracy             |        1.0000000000000002 |
| Train Accuracy            |                     0.855 |
| Time Training             |         4.322873830795288 |
| Time Testing              |        0.3882451057434082 |


100%|██████████| 100/100 [00:04<00:00, 22.80it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.42it/s]

| Iteration                 |                        61 |
| Gradient Steps            |                      6100 |
| Average Train Loss        |       0.48003467976328124 |
| Start Train Loss          |      0.000799216446466744 |
| Final Train Loss          |     0.0014625689946115017 |
| Test Loss                 |       0.22888360097121443 |
| Test Accuracy             |        0.9600000000000004 |
| Train Accuracy            |                      0.89 |
| Time Training             |         4.393187999725342 |
| Time Testing              |        0.4069538116455078 |


100%|██████████| 100/100 [00:04<00:00, 22.91it/s]
  3%|▎         | 3/100 [00:00<00:04, 21.82it/s]

| Iteration                 |                        62 |
| Gradient Steps            |                      6200 |
| Average Train Loss        |        0.3542146665838322 |
| Start Train Loss          |     0.0002141279837815091 |
| Final Train Loss          |       0.00571134639903903 |
| Test Loss                 |          0.78913198917396 |
| Test Accuracy             |        0.9000000000000002 |
| Train Accuracy            |                       0.9 |
| Time Training             |         4.371711492538452 |
| Time Testing              |        0.4034879207611084 |


100%|██████████| 100/100 [00:04<00:00, 23.10it/s]
  3%|▎         | 3/100 [00:00<00:03, 24.62it/s]

| Iteration                 |                        63 |
| Gradient Steps            |                      6300 |
| Average Train Loss        |       0.45610217124364794 |
| Start Train Loss          |    0.00026689801597967744 |
| Final Train Loss          |      0.005515723023563623 |
| Test Loss                 |       0.21730294188819346 |
| Test Accuracy             |        0.9600000000000003 |
| Train Accuracy            |                     0.875 |
| Time Training             |         4.333476781845093 |
| Time Testing              |       0.40737271308898926 |


100%|██████████| 100/100 [00:04<00:00, 23.19it/s]
  3%|▎         | 3/100 [00:00<00:04, 23.19it/s]

| Iteration                 |                        64 |
| Gradient Steps            |                      6400 |
| Average Train Loss        |        0.3523053414318564 |
| Start Train Loss          |     0.0005393779138103127 |
| Final Train Loss          |      0.007664094679057598 |
| Test Loss                 |        0.5065559812169613 |
| Test Accuracy             |        0.9200000000000003 |
| Train Accuracy            |                     0.915 |
| Time Training             |         4.328704118728638 |
| Time Testing              |       0.40328121185302734 |


100%|██████████| 100/100 [00:04<00:00, 22.64it/s]
  3%|▎         | 3/100 [00:00<00:04, 23.70it/s]

| Iteration                 |                        65 |
| Gradient Steps            |                      6500 |
| Average Train Loss        |        0.5503534089274573 |
| Start Train Loss          |     0.0013330084038898349 |
| Final Train Loss          |      9.77508898358792e-06 |
| Test Loss                 |        0.8158261311332053 |
| Test Accuracy             |        0.8600000000000003 |
| Train Accuracy            |                      0.89 |
| Time Training             |           4.4225013256073 |
| Time Testing              |        0.4083364009857178 |


100%|██████████| 100/100 [00:04<00:00, 22.54it/s]
  3%|▎         | 3/100 [00:00<00:04, 23.84it/s]

| Iteration                 |                        66 |
| Gradient Steps            |                      6600 |
| Average Train Loss        |         0.506959873239698 |
| Start Train Loss          |        0.2736201286315918 |
| Final Train Loss          |     0.0009271714370697737 |
| Test Loss                 |       0.35338137515368767 |
| Test Accuracy             |        0.9600000000000003 |
| Train Accuracy            |                     0.885 |
| Time Training             |        4.4477691650390625 |
| Time Testing              |       0.41168880462646484 |


100%|██████████| 100/100 [00:04<00:00, 22.93it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.16it/s]

| Iteration                 |                        67 |
| Gradient Steps            |                      6700 |
| Average Train Loss        |        0.7344502974924089 |
| Start Train Loss          |      0.001231701928190887 |
| Final Train Loss          |        2.2157227993011475 |
| Test Loss                 |        0.2193293920851647 |
| Test Accuracy             |        0.9000000000000002 |
| Train Accuracy            |                      0.86 |
| Time Training             |         4.371782302856445 |
| Time Testing              |        0.4165024757385254 |


100%|██████████| 100/100 [00:04<00:00, 22.74it/s]
  3%|▎         | 3/100 [00:00<00:03, 24.66it/s]

| Iteration                 |                        68 |
| Gradient Steps            |                      6800 |
| Average Train Loss        |       0.33090128387758466 |
| Start Train Loss          |           2.6775062084198 |
| Final Train Loss          |        2.5865471363067627 |
| Test Loss                 |       0.47329045697734845 |
| Test Accuracy             |        0.9200000000000003 |
| Train Accuracy            |                     0.925 |
| Time Training             |        4.4050304889678955 |
| Time Testing              |        0.3986179828643799 |


100%|██████████| 100/100 [00:04<00:00, 22.96it/s]
  3%|▎         | 3/100 [00:00<00:04, 21.80it/s]

| Iteration                 |                        69 |
| Gradient Steps            |                      6900 |
| Average Train Loss        |        0.5527820953315262 |
| Start Train Loss          |                       0.0 |
| Final Train Loss          |     0.0002664221974555403 |
| Test Loss                 |        0.2595990614710445 |
| Test Accuracy             |        0.9400000000000003 |
| Train Accuracy            |                       0.9 |
| Time Training             |         4.364869594573975 |
| Time Testing              |        0.4028279781341553 |


100%|██████████| 100/100 [00:04<00:00, 22.41it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.65it/s]

| Iteration                 |                        70 |
| Gradient Steps            |                      7000 |
| Average Train Loss        |        0.2477045177040786 |
| Start Train Loss          |     7.009128603385761e-05 |
| Final Train Loss          |        0.2768923044204712 |
| Test Loss                 |        0.2762449070431649 |
| Test Accuracy             |        0.9200000000000004 |
| Train Accuracy            |                      0.94 |
| Time Training             |         4.474292755126953 |
| Time Testing              |       0.38553857803344727 |


100%|██████████| 100/100 [00:04<00:00, 22.84it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.27it/s]

| Iteration                 |                        71 |
| Gradient Steps            |                      7100 |
| Average Train Loss        |        0.4750996533837912 |
| Start Train Loss          |    0.00014786727842874825 |
| Final Train Loss          |     6.931778625585139e-05 |
| Test Loss                 |        0.2427500133731633 |
| Test Accuracy             |        0.9200000000000003 |
| Train Accuracy            |                     0.905 |
| Time Training             |         4.384613752365112 |
| Time Testing              |        0.3839905261993408 |


100%|██████████| 100/100 [00:04<00:00, 22.61it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.77it/s]

| Iteration                 |                        72 |
| Gradient Steps            |                      7200 |
| Average Train Loss        |       0.44091340124791656 |
| Start Train Loss          |     1.072882469088654e-06 |
| Final Train Loss          |     6.020032742526382e-06 |
| Test Loss                 |       0.47067046948623664 |
| Test Accuracy             |        0.9400000000000003 |
| Train Accuracy            |                       0.9 |
| Time Training             |         4.428351640701294 |
| Time Testing              |        0.4029664993286133 |


100%|██████████| 100/100 [00:04<00:00, 22.78it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.03it/s]

| Iteration                 |                        73 |
| Gradient Steps            |                      7300 |
| Average Train Loss        |        0.2776377372079243 |
| Start Train Loss          |     0.0010022157803177834 |
| Final Train Loss          |        0.5912461280822754 |
| Test Loss                 |       0.29527339497086535 |
| Test Accuracy             |        0.9400000000000003 |
| Train Accuracy            |                      0.94 |
| Time Training             |         4.399155855178833 |
| Time Testing              |        0.3928253650665283 |


100%|██████████| 100/100 [00:04<00:00, 22.58it/s]
  3%|▎         | 3/100 [00:00<00:04, 20.43it/s]

| Iteration                 |                        74 |
| Gradient Steps            |                      7400 |
| Average Train Loss        |        0.5331980737348637 |
| Start Train Loss          |    3.5762775496550603e-07 |
| Final Train Loss          |        2.2374444007873535 |
| Test Loss                 |        0.2973762414649568 |
| Test Accuracy             |        0.9200000000000003 |
| Train Accuracy            |                      0.92 |
| Time Training             |         4.432385206222534 |
| Time Testing              |        0.4111332893371582 |


100%|██████████| 100/100 [00:04<00:00, 22.79it/s]
  3%|▎         | 3/100 [00:00<00:04, 23.23it/s]

| Iteration                 |                        75 |
| Gradient Steps            |                      7500 |
| Average Train Loss        |       0.38694059956664545 |
| Start Train Loss          |        0.6959194540977478 |
| Final Train Loss          |       0.00509636290371418 |
| Test Loss                 |        0.1560109129034913 |
| Test Accuracy             |        0.9400000000000003 |
| Train Accuracy            |                      0.93 |
| Time Training             |         4.392337322235107 |
| Time Testing              |        0.3925631046295166 |


100%|██████████| 100/100 [00:04<00:00, 22.79it/s]
  3%|▎         | 3/100 [00:00<00:03, 24.49it/s]

| Iteration                 |                        76 |
| Gradient Steps            |                      7600 |
| Average Train Loss        |        0.4513537246619154 |
| Start Train Loss          |         3.841489791870117 |
| Final Train Loss          |      0.000990860047750175 |
| Test Loss                 |       0.39415915426855735 |
| Test Accuracy             |        0.9400000000000003 |
| Train Accuracy            |                      0.92 |
| Time Training             |          4.39365029335022 |
| Time Testing              |       0.39597654342651367 |


100%|██████████| 100/100 [00:04<00:00, 22.73it/s]
  3%|▎         | 3/100 [00:00<00:04, 21.19it/s]

| Iteration                 |                        77 |
| Gradient Steps            |                      7700 |
| Average Train Loss        |        0.5330442568616861 |
| Start Train Loss          |     4.827961220144061e-06 |
| Final Train Loss          |         8.658333778381348 |
| Test Loss                 |        0.4475764014540111 |
| Test Accuracy             |        0.9600000000000003 |
| Train Accuracy            |                      0.91 |
| Time Training             |         4.404958248138428 |
| Time Testing              |        0.4006986618041992 |


100%|██████████| 100/100 [00:04<00:00, 22.70it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.81it/s]

| Iteration                 |                        78 |
| Gradient Steps            |                      7800 |
| Average Train Loss        |       0.47888066803651874 |
| Start Train Loss          |      0.007170533761382103 |
| Final Train Loss          |    2.1993710106471553e-05 |
| Test Loss                 |       0.35611584531419793 |
| Test Accuracy             |        0.9600000000000003 |
| Train Accuracy            |                      0.92 |
| Time Training             |         4.410208702087402 |
| Time Testing              |       0.40056514739990234 |


100%|██████████| 100/100 [00:04<00:00, 22.04it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.67it/s]

| Iteration                 |                        79 |
| Gradient Steps            |                      7900 |
| Average Train Loss        |        0.4554459898315446 |
| Start Train Loss          |        0.6518527269363403 |
| Final Train Loss          |      0.045784663408994675 |
| Test Loss                 |        0.4917967845109343 |
| Test Accuracy             |        0.9400000000000003 |
| Train Accuracy            |                       0.9 |
| Time Training             |         4.543384075164795 |
| Time Testing              |        0.3898181915283203 |


100%|██████████| 100/100 [00:04<00:00, 22.88it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.29it/s]

| Iteration                 |                        80 |
| Gradient Steps            |                      8000 |
| Average Train Loss        |        0.3282969589508854 |
| Start Train Loss          |    0.00012491591041907668 |
| Final Train Loss          |        0.6665182709693909 |
| Test Loss                 |       0.49232279434222626 |
| Test Accuracy             |        0.9600000000000003 |
| Train Accuracy            |                      0.92 |
| Time Training             |         4.374648332595825 |
| Time Testing              |       0.41504788398742676 |


100%|██████████| 100/100 [00:04<00:00, 22.53it/s]
  2%|▏         | 2/100 [00:00<00:05, 19.43it/s]

| Iteration                 |                        81 |
| Gradient Steps            |                      8100 |
| Average Train Loss        |        0.5930776957066689 |
| Start Train Loss          |     0.0018928137142211199 |
| Final Train Loss          |    0.00028048103558830917 |
| Test Loss                 |        0.3254679364027098 |
| Test Accuracy             |        0.9400000000000003 |
| Train Accuracy            |                     0.865 |
| Time Training             |         4.443802118301392 |
| Time Testing              |        0.4043295383453369 |


100%|██████████| 100/100 [00:04<00:00, 21.85it/s]
  3%|▎         | 3/100 [00:00<00:04, 20.33it/s]

| Iteration                 |                        82 |
| Gradient Steps            |                      8200 |
| Average Train Loss        |        0.5040183188708466 |
| Start Train Loss          |        2.5478692054748535 |
| Final Train Loss          |      0.007490405347198248 |
| Test Loss                 |       0.31153298143838587 |
| Test Accuracy             |        0.9400000000000003 |
| Train Accuracy            |                      0.89 |
| Time Training             |         4.583423137664795 |
| Time Testing              |       0.41318178176879883 |


100%|██████████| 100/100 [00:04<00:00, 21.12it/s]
  3%|▎         | 3/100 [00:00<00:04, 20.74it/s]

| Iteration                 |                        83 |
| Gradient Steps            |                      8300 |
| Average Train Loss        |        0.2827810322279953 |
| Start Train Loss          |      0.017879242077469826 |
| Final Train Loss          |     0.0005503250285983086 |
| Test Loss                 |       0.35095222445888324 |
| Test Accuracy             |        0.9400000000000003 |
| Train Accuracy            |                      0.93 |
| Time Training             |         4.738239765167236 |
| Time Testing              |         0.422161340713501 |


100%|██████████| 100/100 [00:04<00:00, 20.98it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.02it/s]

| Iteration                 |                        84 |
| Gradient Steps            |                      8400 |
| Average Train Loss        |        0.4515295326593139 |
| Start Train Loss          |    0.00021286337869241834 |
| Final Train Loss          |    0.00014666607603430748 |
| Test Loss                 |       0.45416683573103434 |
| Test Accuracy             |        0.9400000000000003 |
| Train Accuracy            |                     0.895 |
| Time Training             |         4.771329641342163 |
| Time Testing              |       0.44011521339416504 |


100%|██████████| 100/100 [00:04<00:00, 20.90it/s]
  3%|▎         | 3/100 [00:00<00:04, 21.32it/s]

| Iteration                 |                        85 |
| Gradient Steps            |                      8500 |
| Average Train Loss        |       0.41789454370781526 |
| Start Train Loss          |    1.8238850316265598e-05 |
| Final Train Loss          |    0.00021149350504856557 |
| Test Loss                 |        0.5025650472593963 |
| Test Accuracy             |        0.9200000000000003 |
| Train Accuracy            |                     0.885 |
| Time Training             |         4.794429063796997 |
| Time Testing              |        0.4302642345428467 |


100%|██████████| 100/100 [00:04<00:00, 21.41it/s]
  3%|▎         | 3/100 [00:00<00:04, 23.27it/s]

| Iteration                 |                        86 |
| Gradient Steps            |                      8600 |
| Average Train Loss        |        0.4105059768863643 |
| Start Train Loss          |     0.0006557940505445004 |
| Final Train Loss          |      0.009749697521328926 |
| Test Loss                 |      0.045214360281870536 |
| Test Accuracy             |        0.9800000000000003 |
| Train Accuracy            |                       0.9 |
| Time Training             |        4.6845011711120605 |
| Time Testing              |         0.418393611907959 |


100%|██████████| 100/100 [00:04<00:00, 21.59it/s]
  3%|▎         | 3/100 [00:00<00:04, 23.40it/s]

| Iteration                 |                        87 |
| Gradient Steps            |                      8700 |
| Average Train Loss        |        0.4416117372851659 |
| Start Train Loss          |       0.17169296741485596 |
| Final Train Loss          |        0.4346620440483093 |
| Test Loss                 |       0.09352505148507048 |
| Test Accuracy             |        0.9600000000000003 |
| Train Accuracy            |                     0.905 |
| Time Training             |         4.638896465301514 |
| Time Testing              |       0.40120840072631836 |


100%|██████████| 100/100 [00:04<00:00, 22.27it/s]
  3%|▎         | 3/100 [00:00<00:04, 21.96it/s]

| Iteration                 |                        88 |
| Gradient Steps            |                      8800 |
| Average Train Loss        |        0.3333664693425459 |
| Start Train Loss          |     4.768351573147811e-06 |
| Final Train Loss          |    0.00023609788331668824 |
| Test Loss                 |        0.4928966528079273 |
| Test Accuracy             |        0.8800000000000002 |
| Train Accuracy            |                     0.905 |
| Time Training             |        4.4957005977630615 |
| Time Testing              |       0.42374420166015625 |


100%|██████████| 100/100 [00:04<00:00, 22.26it/s]
  2%|▏         | 2/100 [00:00<00:04, 20.00it/s]

| Iteration                 |                        89 |
| Gradient Steps            |                      8900 |
| Average Train Loss        |        0.3878489861064911 |
| Start Train Loss          |     0.0039978111162781715 |
| Final Train Loss          |    0.00031407742062583566 |
| Test Loss                 |        0.3005538268366468 |
| Test Accuracy             |        0.9200000000000003 |
| Train Accuracy            |                     0.935 |
| Time Training             |         4.496414422988892 |
| Time Testing              |       0.41376447677612305 |


100%|██████████| 100/100 [00:04<00:00, 22.20it/s]
  3%|▎         | 3/100 [00:00<00:04, 23.00it/s]

| Iteration                 |                        90 |
| Gradient Steps            |                      9000 |
| Average Train Loss        |        0.4210010628106464 |
| Start Train Loss          |        3.9891092777252197 |
| Final Train Loss          |    2.3543288989458233e-05 |
| Test Loss                 |       0.25151390312281735 |
| Test Accuracy             |        0.9600000000000003 |
| Train Accuracy            |                     0.905 |
| Time Training             |         4.512192487716675 |
| Time Testing              |         0.403836727142334 |


100%|██████████| 100/100 [00:04<00:00, 22.20it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.37it/s]

| Iteration                 |                        91 |
| Gradient Steps            |                      9100 |
| Average Train Loss        |       0.24671843937136065 |
| Start Train Loss          |       0.02776186354458332 |
| Final Train Loss          |    1.9848088413709775e-05 |
| Test Loss                 |        0.5302401665570848 |
| Test Accuracy             |        0.9000000000000002 |
| Train Accuracy            |                      0.95 |
| Time Training             |         4.509596586227417 |
| Time Testing              |       0.40410947799682617 |


100%|██████████| 100/100 [00:04<00:00, 22.52it/s]
  3%|▎         | 3/100 [00:00<00:04, 21.63it/s]

| Iteration                 |                        92 |
| Gradient Steps            |                      9200 |
| Average Train Loss        |        0.3664632635198089 |
| Start Train Loss          |        1.3391997814178467 |
| Final Train Loss          |        0.0578511506319046 |
| Test Loss                 |        0.1956334133714416 |
| Test Accuracy             |        0.9600000000000003 |
| Train Accuracy            |                      0.93 |
| Time Training             |         4.456659555435181 |
| Time Testing              |       0.40009307861328125 |


100%|██████████| 100/100 [00:04<00:00, 22.71it/s]
  3%|▎         | 3/100 [00:00<00:04, 23.28it/s]

| Iteration                 |                        93 |
| Gradient Steps            |                      9300 |
| Average Train Loss        |       0.34740425701281424 |
| Start Train Loss          |      0.012205812148749828 |
| Final Train Loss          |                       0.0 |
| Test Loss                 |      0.046652624820383154 |
| Test Accuracy             |        0.9600000000000003 |
| Train Accuracy            |                     0.945 |
| Time Training             |         4.410045862197876 |
| Time Testing              |        0.4232938289642334 |


100%|██████████| 100/100 [00:04<00:00, 22.66it/s]
  3%|▎         | 3/100 [00:00<00:04, 21.45it/s]

| Iteration                 |                        94 |
| Gradient Steps            |                      9400 |
| Average Train Loss        |        0.7438768124060985 |
| Start Train Loss          |     7.271726644830778e-06 |
| Final Train Loss          |     2.104020313709043e-05 |
| Test Loss                 |         0.075654439924773 |
| Test Accuracy             |        0.9800000000000003 |
| Train Accuracy            |                     0.895 |
| Time Training             |        4.4184489250183105 |
| Time Testing              |       0.40245914459228516 |


100%|██████████| 100/100 [00:04<00:00, 22.58it/s]
  3%|▎         | 3/100 [00:00<00:04, 23.91it/s]

| Iteration                 |                        95 |
| Gradient Steps            |                      9500 |
| Average Train Loss        |       0.40897107751048906 |
| Start Train Loss          |        1.1471229791641235 |
| Final Train Loss          |     7.652932981727645e-05 |
| Test Loss                 |        0.2943024285418318 |
| Test Accuracy             |        0.9400000000000003 |
| Train Accuracy            |                     0.885 |
| Time Training             |         4.433733701705933 |
| Time Testing              |         0.407214879989624 |


100%|██████████| 100/100 [00:04<00:00, 22.42it/s]
  3%|▎         | 3/100 [00:00<00:04, 21.60it/s]

| Iteration                 |                        96 |
| Gradient Steps            |                      9600 |
| Average Train Loss        |        0.3325029508730688 |
| Start Train Loss          |      0.010529769584536552 |
| Final Train Loss          |        0.2081926465034485 |
| Test Loss                 |       0.19720100465563267 |
| Test Accuracy             |        0.9400000000000003 |
| Train Accuracy            |                      0.92 |
| Time Training             |         4.469084024429321 |
| Time Testing              |       0.38383984565734863 |


100%|██████████| 100/100 [00:04<00:00, 22.83it/s]
  3%|▎         | 3/100 [00:00<00:04, 21.58it/s]

| Iteration                 |                        97 |
| Gradient Steps            |                      9700 |
| Average Train Loss        |        0.3342731465484697 |
| Start Train Loss          |    1.3291659342939965e-05 |
| Final Train Loss          |       0.09864699840545654 |
| Test Loss                 |       0.15532133649472984 |
| Test Accuracy             |        0.9400000000000003 |
| Train Accuracy            |                     0.925 |
| Time Training             |         4.392407178878784 |
| Time Testing              |        0.4091033935546875 |


100%|██████████| 100/100 [00:04<00:00, 21.90it/s]
  3%|▎         | 3/100 [00:00<00:04, 23.83it/s]

| Iteration                 |                        98 |
| Gradient Steps            |                      9800 |
| Average Train Loss        |       0.46903386298609057 |
| Start Train Loss          |        1.3900902271270752 |
| Final Train Loss          |         2.858020782470703 |
| Test Loss                 |       0.29026878085738805 |
| Test Accuracy             |        0.9200000000000003 |
| Train Accuracy            |                     0.915 |
| Time Training             |         4.581238746643066 |
| Time Testing              |        0.4042823314666748 |


100%|██████████| 100/100 [00:04<00:00, 22.71it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.57it/s]

| Iteration                 |                        99 |
| Gradient Steps            |                      9900 |
| Average Train Loss        |       0.22964208085558607 |
| Start Train Loss          |     3.182820728397928e-05 |
| Final Train Loss          |     1.925192918861285e-05 |
| Test Loss                 |        0.5659510092088569 |
| Test Accuracy             |        0.8800000000000002 |
| Train Accuracy            |                      0.94 |
| Time Training             |          4.41659140586853 |
| Time Testing              |       0.39516186714172363 |


100%|██████████| 100/100 [00:04<00:00, 22.66it/s]
  3%|▎         | 3/100 [00:00<00:04, 21.51it/s]

| Iteration                 |                       100 |
| Gradient Steps            |                     10000 |
| Average Train Loss        |        0.5814004851099025 |
| Start Train Loss          |         6.403341293334961 |
| Final Train Loss          |    0.00026570665067993104 |
| Test Loss                 |       0.29057893592536177 |
| Test Accuracy             |        0.9200000000000003 |
| Train Accuracy            |                      0.91 |
| Time Training             |         4.425164461135864 |
| Time Testing              |       0.40458083152770996 |


100%|██████████| 100/100 [00:04<00:00, 22.89it/s]
  2%|▏         | 2/100 [00:00<00:05, 19.38it/s]

| Iteration                 |                       101 |
| Gradient Steps            |                     10100 |
| Average Train Loss        |        0.2900316813797785 |
| Start Train Loss          |       0.19170323014259338 |
| Final Train Loss          |     0.0029106701258569956 |
| Test Loss                 |       0.17946719744298134 |
| Test Accuracy             |        0.9400000000000003 |
| Train Accuracy            |                      0.94 |
| Time Training             |         4.378841161727905 |
| Time Testing              |        0.4018735885620117 |


100%|██████████| 100/100 [00:04<00:00, 22.62it/s]
  3%|▎         | 3/100 [00:00<00:04, 24.10it/s]

| Iteration                 |                       102 |
| Gradient Steps            |                     10200 |
| Average Train Loss        |       0.23089245793127133 |
| Start Train Loss          |        2.3353772163391113 |
| Final Train Loss          |        1.7784005403518677 |
| Test Loss                 |      0.009148068680796512 |
| Test Accuracy             |        1.0000000000000002 |
| Train Accuracy            |                     0.935 |
| Time Training             |         4.424649000167847 |
| Time Testing              |        0.4058396816253662 |


100%|██████████| 100/100 [00:04<00:00, 22.77it/s]
  3%|▎         | 3/100 [00:00<00:04, 24.02it/s]

| Iteration                 |                       103 |
| Gradient Steps            |                     10300 |
| Average Train Loss        |         0.547860745126045 |
| Start Train Loss          |    4.4701999286189675e-05 |
| Final Train Loss          |     1.615261317056138e-05 |
| Test Loss                 |       0.13030551195368023 |
| Test Accuracy             |        0.9600000000000003 |
| Train Accuracy            |                      0.89 |
| Time Training             |         4.400290012359619 |
| Time Testing              |        0.3976626396179199 |


100%|██████████| 100/100 [00:04<00:00, 22.87it/s]
  3%|▎         | 3/100 [00:00<00:04, 20.55it/s]

| Iteration                 |                       104 |
| Gradient Steps            |                     10400 |
| Average Train Loss        |        0.3113180801564055 |
| Start Train Loss          |       0.09683377295732498 |
| Final Train Loss          |     0.0014135520905256271 |
| Test Loss                 |         1.148135194273232 |
| Test Accuracy             |        0.8000000000000003 |
| Train Accuracy            |                     0.915 |
| Time Training             |         4.380050182342529 |
| Time Testing              |        0.4159407615661621 |


100%|██████████| 100/100 [00:04<00:00, 22.28it/s]
  3%|▎         | 3/100 [00:00<00:04, 23.05it/s]

| Iteration                 |                       105 |
| Gradient Steps            |                     10500 |
| Average Train Loss        |        0.4480824181537269 |
| Start Train Loss          |         2.731928825378418 |
| Final Train Loss          |        1.7955360412597656 |
| Test Loss                 |       0.09193623524780266 |
| Test Accuracy             |        0.9600000000000003 |
| Train Accuracy            |                      0.93 |
| Time Training             |         4.495972156524658 |
| Time Testing              |        0.3997054100036621 |


100%|██████████| 100/100 [00:04<00:00, 22.39it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.44it/s]

| Iteration                 |                       106 |
| Gradient Steps            |                     10600 |
| Average Train Loss        |        0.4351338304575556 |
| Start Train Loss          |     9.119432434090413e-06 |
| Final Train Loss          |    1.5914189134491608e-05 |
| Test Loss                 |       0.25957292567387297 |
| Test Accuracy             |        0.9600000000000003 |
| Train Accuracy            |                     0.935 |
| Time Training             |         4.477396726608276 |
| Time Testing              |       0.39334726333618164 |


100%|██████████| 100/100 [00:04<00:00, 22.55it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.55it/s]

| Iteration                 |                       107 |
| Gradient Steps            |                     10700 |
| Average Train Loss        |       0.20251869225037847 |
| Start Train Loss          |     0.0002608810900710523 |
| Final Train Loss          |        0.6643362045288086 |
| Test Loss                 |        0.6608300408751746 |
| Test Accuracy             |        0.8800000000000002 |
| Train Accuracy            |                     0.945 |
| Time Training             |         4.443982124328613 |
| Time Testing              |       0.42517781257629395 |


100%|██████████| 100/100 [00:04<00:00, 22.53it/s]
  3%|▎         | 3/100 [00:00<00:04, 22.80it/s]

| Iteration                 |                       108 |
| Gradient Steps            |                     10800 |
| Average Train Loss        |        0.5009234432289557 |
| Start Train Loss          |      0.001320524257607758 |
| Final Train Loss          |       0.03442767262458801 |
| Test Loss                 |       0.26012359770513926 |
| Test Accuracy             |        0.9000000000000002 |
| Train Accuracy            |                     0.905 |
| Time Training             |         4.446704387664795 |
| Time Testing              |        0.4001138210296631 |


100%|██████████| 100/100 [00:04<00:00, 22.32it/s]
  3%|▎         | 3/100 [00:00<00:04, 23.11it/s]

| Iteration                 |                       109 |
| Gradient Steps            |                     10900 |
| Average Train Loss        |       0.25513667440622334 |
| Start Train Loss          |     9.547753870720044e-05 |
| Final Train Loss          |     0.0007010552217252553 |
| Test Loss                 |       0.25713524132383353 |
| Test Accuracy             |        0.9400000000000003 |
| Train Accuracy            |                      0.94 |
| Time Training             |         4.485480070114136 |
| Time Testing              |        0.4033207893371582 |


100%|██████████| 100/100 [00:04<00:00, 22.56it/s]


| Iteration                 |                       110 |
| Gradient Steps            |                     11000 |
| Average Train Loss        |       0.41023965111255534 |
| Start Train Loss          |      0.008073053322732449 |
| Final Train Loss          |     0.0015285469125956297 |
| Test Loss                 |        0.7815875313483022 |
| Test Accuracy             |        0.9000000000000002 |
| Train Accuracy            |                      0.92 |
| Time Training             |         4.437881946563721 |
| Time Testing              |       0.42551517486572266 |
