# Dataloaders

In [1]:
# Importing libraries
import torchio as tio
import glob
import numpy as np
import random
import os

from collections import OrderedDict
from pathlib import Path

from tqdm import tqdm
import time

import torchio as tio
from torchio.transforms import (RescaleIntensity,RandomFlip,Compose, HistogramStandardization, RandomAffine, RandomNoise, ToCanonical)

from torch.utils.data import DataLoader
import torch
import torch.nn as nn

import matplotlib.pyplot as plt

import pickle

In [2]:
# Assignment of datasets (participant groups) to the individual subsets.
# The lists are consumed by the design-building code that follows, which is
# currently disabled (it is wrapped in a string literal).

# Datasets whose participants are divided by the split ratios
train_groups = ["ABIDE", "Athletes", "HCP", "COBRE", "Leipzig"]
# Dataset assigned as a whole to the 'dev_test' subset
test_dev_groups = ["UoN"]
# Dataset assigned as a whole to the 'test' subset
test_groups = ["CHIASM"]
'''
# CHIASM dobry
# ABIDE dobry
# Athletes dobry
# COBRE dobry
# HCP dobry
# Leipzig good
# MCIC złe
# UoN dobry

# Splits
train_split = 0.85
dev_split = 0.15
test_split = 0.0

# Dictionary with study design
design = {}

design['train']={}
design['dev_train']={}
design['dev_test']={}
design['test']={}

design['train']['all']={}
design['dev_train']['all']={}
design['dev_test']['all']={}
design['test']['all']={}

# Training data
for group in train_groups:
    
    #design['dev_train'][group]={}
    #design['test'][group]={}

    # Idices of all subjects
    ids=[path.split('/')[-2] for path in glob.glob('../../1_Data/1_Input/'+group+'/*/tmp_brain_mask.nii.gz')] 
    # Randomize order
    random.shuffle(ids) 
    # Find split ratios
    train_idx = np.int(np.floor(len(ids)*train_split))
    dev_idx = np.int(np.floor(len(ids)*(train_split+dev_split)))
    
    for i in range(len(ids)):
        
        path_to_folder='../../1_Data/1_Input/'+group+'/'+ids[i]+'/'
        
        files={}
        files['brain']=path_to_folder+'t1w_1mm_iso.nii.gz'
        files['probs']=path_to_folder+'tmp_brain_mask.nii.gz'
        files['chiasm']=path_to_folder+'chiasm.nii.gz'
    
        if i+1<=train_idx:
            design['train']['all'][ids[i]]=files
        elif i+1 > dev_idx:
            design['test']['all'][ids[i]]=files
            #design['test'][group][ids[i]]=files
        else:
            design['dev_train']['all'][ids[i]]=files
            #design['dev_train'][group][ids[i]]=files
        
# Dev data
for group in test_dev_groups:

    # Idices of all subjects
    ids=[path.split('/')[-2] for path in glob.glob('../../1_Data/1_Input/'+group+'/*/tmp_brain_mask.nii.gz')] 
    
    for sub_id in ids:
        
        path_to_folder='../../1_Data/1_Input/'+group+'/'+sub_id+'/'
        
        files={}
        files['brain']=path_to_folder+'t1w_1mm_iso.nii.gz'
        files['probs']=path_to_folder+'tmp_brain_mask.nii.gz'
        files['chiasm']=path_to_folder+'chiasm.nii.gz'
        
        design['dev_test']['all'][sub_id]=files
    
# Test data
for group in test_groups:
    
    #design['test'][group]={}

    # Idices of all subjects
    ids=[path.split('/')[-2] for path in glob.glob('../../1_Data/1_Input/'+group+'/*/tmp_brain_mask.nii.gz')] 
    
    for sub_id in ids:
        
        path_to_folder='../../1_Data/1_Input/'+group+'/'+sub_id+'/'
        
        files={}
        files['brain']=path_to_folder+'t1w_1mm_iso.nii.gz'
        files['probs']=path_to_folder+'tmp_brain_mask.nii.gz'
        files['chiasm']=path_to_folder+'chiasm.nii.gz'
        
        design['test']['all'][sub_id]=files 
        #design['test'][group][sub_id]=files  
        '''

"\n# CHIASM dobry\n# ABIDE dobry\n# Athletes dobry\n# COBRE dobry\n# HCP dobry\n# Leipzig good\n# MCIC złe\n# UoN dobry\n\n# Splits\ntrain_split = 0.85\ndev_split = 0.15\ntest_split = 0.0\n\n# Dictionary with study design\ndesign = {}\n\ndesign['train']={}\ndesign['dev_train']={}\ndesign['dev_test']={}\ndesign['test']={}\n\ndesign['train']['all']={}\ndesign['dev_train']['all']={}\ndesign['dev_test']['all']={}\ndesign['test']['all']={}\n\n# Training data\nfor group in train_groups:\n    \n    #design['dev_train'][group]={}\n    #design['test'][group]={}\n\n    # Idices of all subjects\n    ids=[path.split('/')[-2] for path in glob.glob('../../1_Data/1_Input/'+group+'/*/tmp_brain_mask.nii.gz')] \n    # Randomize order\n    random.shuffle(ids) \n    # Find split ratios\n    train_idx = np.int(np.floor(len(ids)*train_split))\n    dev_idx = np.int(np.floor(len(ids)*(train_split+dev_split)))\n    \n    for i in range(len(ids)):\n        \n        path_to_folder='../../1_Data/1_Input/'+group+

In [3]:
# Writing the dictionary to disk (disabled - the stored design is reused instead)
#with open('study_design.pkl', 'wb') as f:
#    pickle.dump(design, f)

# Restore the stored study design.
# NOTE: unpickling can execute arbitrary code, so only load files created by this project.
with Path('study_design.pkl').open('rb') as design_file:
    design = pickle.load(design_file)

In [4]:
# Nested dictionary (subset -> dataset -> list of subjects) holding all images.
# Every subject carries its T1w volume ('t1') and the brain mask ('probs'),
# which is later used as the probability map for patch sampling.
subjects_list = {
    group: {
        dataset: [
            tio.Subject(
                t1=tio.Image(files['brain'], type=tio.INTENSITY),
                probs=tio.Image(files['probs'], type=tio.INTENSITY),
            )
            for files in participants.values()
        ]
        for dataset, participants in datasets.items()
    }
    for group, datasets in design.items()
}

In [5]:
# Intensity rescaling to the [0, 1] range
rescale = RescaleIntensity(out_min_max=(0, 1))
# Random flipping along any of the three spatial axes
flip = RandomFlip(axes=(0, 1, 2), flip_probability=0.5, p=0.25)
# Affine transformations (disabled)
#affine = RandomAffine(degrees=30)

# Rescaling is applied to every subset; only the training data is additionally augmented.
# Histogram standardization is left out for now.
transform_dev = Compose([rescale])
transform_train = Compose([rescale, flip])

In [6]:
# Torchio (PyTorch-compatible) datasets, arranged like subjects_list:
# the 'train' subset gets the augmenting transform, every other subset only the rescaling.
data = {
    group: {
        dataset: tio.SubjectsDataset(
            subjects,
            transform=transform_train if group == 'train' else transform_dev,
        )
        for dataset, subjects in datasets.items()
    }
    for group, datasets in subjects_list.items()
}

In [7]:
# Patch sampling configuration
samples_per_volume = 5      # patches drawn from every loaded volume
queue_length = 500          # maximum number of patches kept in the queue
patch_size = (24, 24, 8)    # size of a sampled patch, in voxels

# Patch locations are drawn using each subject's 'probs' image as the probability map
sampler = tio.data.WeightedSampler(patch_size, probability_map='probs')

In [8]:
# Dataloaders: every dataset gets its own patch queue wrapped in a batching DataLoader.
# Volume loading is parallelised inside the queue (num_workers=6), while the
# DataLoader on top of it runs in the main process (num_workers=0).

dataloader = {}

for group, datasets in data.items():
    dataloader[group] = {}

    for dataset, subjects_dataset in datasets.items():
        patches_queue = tio.Queue(
            subjects_dataset,
            queue_length,
            samples_per_volume,
            sampler,
            num_workers=6,
            shuffle_subjects=True,
            shuffle_patches=True,
        )
        dataloader[group][dataset] = DataLoader(patches_queue, batch_size=25, num_workers=0)


In [9]:
# Testing: disabled smoke test of the dataloaders.
# The code below is wrapped in a bare string literal, so none of it is executed;
# running the cell only echoes the text.
'''
num_epochs = 1

model = torch.nn.Identity()

for epoch_index in range(num_epochs):
    for patches_batch in dataloader['dev_train']['all']:
        #print(patches_batch)
        inputs = patches_batch['t1'][tio.DATA]  # key 't1' is in subject
        targets = patches_batch['t1'][tio.DATA]  # key 'brain' is in subject
        logits = model(inputs)  # model being an instance of torch.nn.Module
'''

"\nnum_epochs = 1\n\nmodel = torch.nn.Identity()\n\nfor epoch_index in range(num_epochs):\n    for patches_batch in dataloader['dev_train']['all']:\n        #print(patches_batch)\n        inputs = patches_batch['t1'][tio.DATA]  # key 't1' is in subject\n        targets = patches_batch['t1'][tio.DATA]  # key 'brain' is in subject\n        logits = model(inputs)  # model being an instance of torch.nn.Module\n"

In [10]:
#inputs.shape
# Disabled preview of one slice per sampled patch. The code is wrapped in a bare
# string literal, so it is not executed; it relies on `inputs`, which is only
# assigned inside the (also disabled) smoke test of the dataloaders.
'''
fig = plt.figure(figsize=(20, 10))

for i in range(inputs.shape[0]):
    plt.subplot(5,8,i+1)
    plt.imshow(inputs[i,0,:,:,5],cmap='gray');
    
plt.show()
'''

"\nfig = plt.figure(figsize=(20, 10))\n\nfor i in range(inputs.shape[0]):\n    plt.subplot(5,8,i+1)\n    plt.imshow(inputs[i,0,:,:,5],cmap='gray');\n    \nplt.show()\n"

In [11]:
# Disabled scratch example (CropOrPad applied to two images and plotted).
# It is wrapped in a bare string literal, so it is not executed.
'''import torchio as tio
t1 = tio.ScalarImage('T1w')
t2 = tio.ScalarImage('T2w')
subject = tio.Subject(T1w=t1, T2w=t2)
cp = tio.CropOrPad((512, 512, 408))
subject = tio.Subject(T1w=cp(t1), T2w=cp(t2))
subject.plot(reorient=False)
'''

"import torchio as tio\nt1 = tio.ScalarImage('T1w')\nt2 = tio.ScalarImage('T2w')\nsubject = tio.Subject(T1w=t1, T2w=t2)\ncp = tio.CropOrPad((512, 512, 408))\nsubject = tio.Subject(T1w=cp(t1), T2w=cp(t2))\nsubject.plot(reorient=False)\n"

# Network and parameters

In [12]:
# Use the GPU whenever CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(device)

cuda


In [13]:
# Cropped U-Net copied from Overfitting Model.
# NOTE: the decoder has no skip connections, so the network acts as a plain
# convolutional encoder-decoder whose only information path is the bottleneck.

class UNet(nn.Module):
    """3D convolutional encoder-decoder with two down- and two up-sampling stages.

    Channel widths are `init_features`, `init_features*scaling` and
    `init_features*scaling**2` (bottleneck). Each pooling stage halves and each
    transposed convolution doubles the spatial size, so the output only matches
    the input shape when every spatial dimension is divisible by 4.

    Args:
        in_channels: number of channels of the input volume.
        out_channels: number of channels of the reconstructed volume.
        init_features: number of feature maps produced by the first encoder block.
        scaling: factor by which the number of feature maps grows per encoder stage.
    """

    def __init__(self, in_channels=1, out_channels=1, init_features=10, scaling=2):
        super().__init__()

        # Encoding layers
        self.encoder1 = self.unet_block(in_channels, init_features, name='enc1')
        self.pool1 = nn.AvgPool3d(kernel_size=2, stride=2, padding=0)
        self.encoder2 = self.unet_block(init_features, init_features*scaling, name='enc2')
        self.pool2 = nn.AvgPool3d(kernel_size=2, stride=2, padding=0)

        # Bottleneck layer
        self.bottleneck = self.unet_block(init_features*scaling, init_features*scaling**2, name='bottleneck')

        # Decoding layers (plain up-convolutions; encoder outputs are NOT merged in)
        self.upconv2 = nn.ConvTranspose3d(init_features*scaling**2, init_features*scaling, kernel_size=2, stride=2)
        self.decoder2 = self.unet_block(init_features*scaling, init_features*scaling, name='dec2')

        self.upconv1 = nn.ConvTranspose3d(init_features*scaling, init_features, kernel_size=2, stride=2)
        self.decoder1 = self.unet_block(init_features, init_features, name='dec1')

        # Final 1x1x1 convolution mapping the features to the requested number of output channels
        self.conv = nn.Conv3d(init_features, out_channels, kernel_size=1)

    def _encode(self, x):
        """Run the encoder and the bottleneck block; return the bottleneck activations."""
        enc1 = self.encoder1(x)
        enc2 = self.encoder2(self.pool1(enc1))
        return self.bottleneck(self.pool2(enc2))

    def forward(self, x):
        """Encode `x`, decode it back and squash the result to (0, 1) with a sigmoid."""
        bottleneck = self._encode(x)

        # Up-convolution followed by a convolutional block, twice
        dec2 = self.decoder2(self.upconv2(bottleneck))
        dec1 = self.decoder1(self.upconv1(dec2))

        return torch.sigmoid(self.conv(dec1))

    def unet_block(self, in_channels, features, name):
        """Build a (conv 3x3x3 -> batch norm -> ReLU) x 2 block.

        `name` prefixes the sub-module names and therefore the state_dict keys,
        so it must stay unchanged for previously saved weights to load.
        """
        return nn.Sequential(OrderedDict([
            (name+'conv1', nn.Conv3d(in_channels=in_channels, out_channels=features, kernel_size=3, padding=1, bias=False)),
            (name+'bnorm1', nn.BatchNorm3d(num_features=features)),
            (name+'relu1', nn.ReLU(inplace=True)),
            (name+'conv2', nn.Conv3d(in_channels=features, out_channels=features, kernel_size=3, padding=1, bias=False)),
            (name+'bnorm2', nn.BatchNorm3d(num_features=features)),
            (name+'relu2', nn.ReLU(inplace=True)),
        ]))

    def output_latent_representations(self, x, verbose=True):
        """Return the bottleneck (latent) activations computed for `x`.

        Args:
            x: input batch, the same kind of tensor that `forward` accepts.
            verbose: when True (the default, matching the previous behaviour)
                the shapes of the input and of the latent tensor are printed.
        """
        if verbose:
            print(x.shape)

        bottleneck = self._encode(x)

        if verbose:
            print(bottleneck.shape)

        return bottleneck

In [14]:
#unet = UNet(1,1,2,2) # Size of latent representation = (1/64) * init_features * scaling**2
#unet.to(device)

In [15]:
#print(sum(p.numel() for p in unet.parameters() if p.requires_grad))

In [16]:
# try processing
'''
outputs = unet(inputs.to(device))
outputs = outputs.cpu().detach().numpy()

fig = plt.figure(figsize=(20, 10))

for i in range(outputs.shape[0]):
    plt.subplot(10,10,i+1)
    plt.imshow(outputs[i,0,:,:,5],cmap='gray');
    
plt.show()
'''

"\noutputs = unet(inputs.to(device))\noutputs = outputs.cpu().detach().numpy()\n\nfig = plt.figure(figsize=(20, 10))\n\nfor i in range(outputs.shape[0]):\n    plt.subplot(10,10,i+1)\n    plt.imshow(outputs[i,0,:,:,5],cmap='gray');\n    \nplt.show()\n"

In [17]:
# Number of training epochs
n_epochs = 25

# Loss: mean squared error between the output and the target
# (a Dice loss was the alternative that is currently switched off)
#criterion = DiceLoss()
criterion = nn.MSELoss()

In [18]:
#outcome = criterion(inputs, torch.Tensor(outputs))
#print(outcome)

# Training

In [19]:
# Training utilities. train_network returns the per-epoch loss histories;
# the best weights are written to disk rather than returned.
def _train_epoch(model, loader, optimizer, criterion, device):
    """Run one optimisation pass over `loader`.

    Each batch must expose its tensor under batch['t1']['data']; the model is
    trained to reproduce that same tensor (the input doubles as the target).

    Returns:
        (sum of the batch losses, number of batches processed)
    """
    loss_sum = 0.0
    n_batches = 0

    for batch in loader:
        inputs = batch['t1']['data'].to(device)

        optimizer.zero_grad()
        loss = criterion(model(inputs), inputs)
        loss.backward()
        optimizer.step()

        loss_sum += loss.item()
        n_batches += 1

    return loss_sum, n_batches


def _evaluate(model, loader, criterion, device):
    """Compute the reconstruction loss over `loader` with gradients disabled.

    Returns:
        (sum of the batch losses, number of batches processed)
    """
    loss_sum = 0.0
    n_batches = 0

    with torch.no_grad():
        for batch in loader:
            inputs = batch['t1']['data'].to(device)
            loss_sum += criterion(model(inputs), inputs).item()
            n_batches += 1

    return loss_sum, n_batches


def _mean_loss(loss_sum, n_batches):
    """Average loss per batch; NaN when the loader produced no batches."""
    return loss_sum / n_batches if n_batches else float('nan')


def train_network(n_epochs, dataloaders, model, optimizer, criterion, device, save_path):
    """Train `model` to reconstruct its input and track the losses per epoch.

    Args:
        n_epochs: number of passes over the training loader.
        dataloaders: nested mapping providing dataloaders['train']['all'],
            dataloaders['dev_train']['all'] and dataloaders['dev_test']['all'].
        model: network to train; it is moved to `device`.
        optimizer: optimizer bound to the parameters of `model`.
        criterion: loss called as criterion(output, input).
        device: device on which the model and the batches are placed.
        save_path: prefix to which 'optimal_weights' is appended when saving.

    Returns:
        Three lists with one entry per epoch: the mean loss per batch on the
        training, dev_train and dev_test loaders. An entry is NaN when the
        corresponding loader yielded no batches.

    Side effects:
        Prints a one-line summary per epoch and saves model.state_dict() to
        save_path + 'optimal_weights' whenever the summed dev_train + dev_test
        loss reaches a new minimum.
    """
    track_train_loss = []
    track_dev_train_loss = []
    track_dev_test_loss = []

    # float('inf') instead of np.Inf: the latter alias no longer exists in NumPy 2.x
    valid_loss_min = float('inf')

    model.to(device)

    for epoch in tqdm(range(1, n_epochs+1)):

        # Training
        model.train()
        train_loss, n_train = _train_epoch(model, dataloaders['train']['all'], optimizer, criterion, device)

        # Validation on two datasets
        model.eval()
        dev_train_loss, n_dev_train = _evaluate(model, dataloaders['dev_train']['all'], criterion, device)
        dev_test_loss, n_dev_test = _evaluate(model, dataloaders['dev_test']['all'], criterion, device)

        train_mean = _mean_loss(train_loss, n_train)
        dev_train_mean = _mean_loss(dev_train_loss, n_dev_train)
        dev_test_mean = _mean_loss(dev_test_loss, n_dev_test)

        track_train_loss.append(train_mean)
        track_dev_train_loss.append(dev_train_mean)
        track_dev_test_loss.append(dev_test_mean)

        # Print summary of epoch
        print('END OF EPOCH: {} \tTraining loss per batch: {:.6f}\tTraining_dev loss per batch: {:.6f}\tTest_dev loss per batch: {:.6f}'.format(epoch, train_mean, dev_train_mean, dev_test_mean))

        # Save the model if the summed (not averaged) validation loss reached a new minimum
        if dev_train_loss + dev_test_loss < valid_loss_min:
            valid_loss_min = dev_train_loss + dev_test_loss
            torch.save(model.state_dict(), save_path+'optimal_weights')

    # Loss histories; the best weights are on disk
    return track_train_loss, track_dev_train_loss, track_dev_test_loss

In [20]:
# Architectures to train, each given as [init_features, scaling]
# (the commented list is the full grid, the active one a subset of it)
#model_parameters=[[64,1],[32,1],[16,2],[16,1],[8,2],[8,1],[4,4],[4,2],[4,1],[2,4],[2,2],[2,1],[1,8],[1,4],[1,2],[1,1]]
model_parameters=[[2,4],[16,1],[4,2],[2,2],[8,2],[4,4],[64,1],[1,8]]

folder='../../1_Data/2_Trained_AE/'

for parameters in model_parameters:

    print(parameters)
    init_features, scaling = parameters

    # Fresh network and optimizer for every configuration
    unet = UNet(1, 1, init_features, scaling)
    optimizer = torch.optim.Adam(params=unet.parameters(), lr=0.005)

    # Output folder named '<init_features>_<scaling>'
    data_folder = folder+'/'+'_'.join((str(init_features), str(scaling)))+'/'
    os.makedirs(data_folder, exist_ok=True)

    # Train; the best weights are saved into data_folder by train_network
    train_loss, dev_train_loss, dev_test_loss = train_network(n_epochs, dataloader, unet, optimizer, criterion, device, data_folder)

    # Store the three loss histories next to the weights
    loss_histories = (
        ('train_loss.pkl', train_loss),
        ('dev_train_loss.pkl', dev_train_loss),
        ('dev_test_loss.pkl', dev_test_loss),
    )
    for file_name, history in loss_histories:
        with open(data_folder+file_name, 'wb') as f:
            pickle.dump(history, f)



[2, 4]


  4%|▍         | 1/25 [30:41<12:16:35, 1841.48s/it]

END OF EPOCH: 1 	Training loss per batch: 0.019672	Training_dev loss per batch: 0.006682	Test_dev loss per batch: 0.005112


  8%|▊         | 2/25 [58:35<11:26:37, 1791.19s/it]

END OF EPOCH: 2 	Training loss per batch: 0.006365	Training_dev loss per batch: 0.004665	Test_dev loss per batch: 0.004465


 12%|█▏        | 3/25 [1:26:43<10:45:26, 1760.29s/it]

END OF EPOCH: 3 	Training loss per batch: 0.005326	Training_dev loss per batch: 0.004054	Test_dev loss per batch: 0.003704


 16%|█▌        | 4/25 [1:53:50<10:02:05, 1720.28s/it]

END OF EPOCH: 4 	Training loss per batch: 0.004953	Training_dev loss per batch: 0.004594	Test_dev loss per batch: 0.004208
END OF EPOCH: 5 	Training loss per batch: 0.004605	Training_dev loss per batch: 0.003638	Test_dev loss per batch: 0.003112


 24%|██▍       | 6/25 [2:49:32<8:57:49, 1698.38s/it] 

END OF EPOCH: 6 	Training loss per batch: 0.004396	Training_dev loss per batch: 0.003294	Test_dev loss per batch: 0.002827


 28%|██▊       | 7/25 [3:18:34<8:33:24, 1711.39s/it]

END OF EPOCH: 7 	Training loss per batch: 0.003750	Training_dev loss per batch: 0.002728	Test_dev loss per batch: 0.001148


 32%|███▏      | 8/25 [3:49:22<8:16:30, 1752.41s/it]

END OF EPOCH: 8 	Training loss per batch: 0.003704	Training_dev loss per batch: 0.002681	Test_dev loss per batch: 0.000914


 36%|███▌      | 9/25 [4:19:34<7:52:04, 1770.30s/it]

END OF EPOCH: 9 	Training loss per batch: 0.003493	Training_dev loss per batch: 0.002556	Test_dev loss per batch: 0.000686


 40%|████      | 10/25 [4:48:19<7:19:08, 1756.58s/it]

END OF EPOCH: 10 	Training loss per batch: 0.003411	Training_dev loss per batch: 0.002545	Test_dev loss per batch: 0.000801
END OF EPOCH: 11 	Training loss per batch: 0.003107	Training_dev loss per batch: 0.002235	Test_dev loss per batch: 0.000648


 48%|████▊     | 12/25 [5:47:32<6:22:14, 1764.20s/it]

END OF EPOCH: 12 	Training loss per batch: 0.003327	Training_dev loss per batch: 0.004836	Test_dev loss per batch: 0.000831


 52%|█████▏    | 13/25 [6:16:11<5:50:06, 1750.53s/it]

END OF EPOCH: 13 	Training loss per batch: 0.003021	Training_dev loss per batch: 0.002318	Test_dev loss per batch: 0.000655


 56%|█████▌    | 14/25 [6:43:33<5:14:58, 1718.02s/it]

END OF EPOCH: 14 	Training loss per batch: 0.003089	Training_dev loss per batch: 0.002257	Test_dev loss per batch: 0.000541
END OF EPOCH: 15 	Training loss per batch: 0.003017	Training_dev loss per batch: 0.002173	Test_dev loss per batch: 0.000700


 64%|██████▍   | 16/25 [7:40:16<4:16:52, 1712.54s/it]

END OF EPOCH: 16 	Training loss per batch: 0.003068	Training_dev loss per batch: 0.002973	Test_dev loss per batch: 0.000573


 68%|██████▊   | 17/25 [8:09:08<3:49:06, 1718.35s/it]

END OF EPOCH: 17 	Training loss per batch: 0.003133	Training_dev loss per batch: 0.002522	Test_dev loss per batch: 0.000635


 72%|███████▏  | 18/25 [8:37:28<3:19:48, 1712.67s/it]

END OF EPOCH: 18 	Training loss per batch: 0.003053	Training_dev loss per batch: 0.002678	Test_dev loss per batch: 0.000736
END OF EPOCH: 19 	Training loss per batch: 0.002974	Training_dev loss per batch: 0.001954	Test_dev loss per batch: 0.000563


 80%|████████  | 20/25 [9:31:36<2:18:54, 1666.84s/it]

END OF EPOCH: 20 	Training loss per batch: 0.002795	Training_dev loss per batch: 0.002270	Test_dev loss per batch: 0.000595


 84%|████████▍ | 21/25 [9:58:50<1:50:27, 1656.87s/it]

END OF EPOCH: 21 	Training loss per batch: 0.002908	Training_dev loss per batch: 0.001997	Test_dev loss per batch: 0.000514


 88%|████████▊ | 22/25 [10:26:50<1:23:11, 1663.82s/it]

END OF EPOCH: 22 	Training loss per batch: 0.003055	Training_dev loss per batch: 0.002640	Test_dev loss per batch: 0.001298


 92%|█████████▏| 23/25 [10:54:46<55:34, 1667.46s/it]  

END OF EPOCH: 23 	Training loss per batch: 0.002984	Training_dev loss per batch: 0.002192	Test_dev loss per batch: 0.000521


 96%|█████████▌| 24/25 [11:22:06<27:39, 1659.35s/it]

END OF EPOCH: 24 	Training loss per batch: 0.002873	Training_dev loss per batch: 0.002336	Test_dev loss per batch: 0.000742


100%|██████████| 25/25 [11:48:13<00:00, 1699.73s/it]

END OF EPOCH: 25 	Training loss per batch: 0.002953	Training_dev loss per batch: 0.002306	Test_dev loss per batch: 0.000649
[16, 1]



  0%|          | 0/25 [00:00<?, ?it/s]

END OF EPOCH: 1 	Training loss per batch: 0.008059	Training_dev loss per batch: 0.003544	Test_dev loss per batch: 0.000874


  8%|▊         | 2/25 [55:58<10:43:38, 1679.06s/it]

END OF EPOCH: 2 	Training loss per batch: 0.004726	Training_dev loss per batch: 0.005412	Test_dev loss per batch: 0.001281


 12%|█▏        | 3/25 [1:22:46<10:07:50, 1657.74s/it]

END OF EPOCH: 3 	Training loss per batch: 0.004387	Training_dev loss per batch: 0.004115	Test_dev loss per batch: 0.001328


 16%|█▌        | 4/25 [1:50:16<9:39:24, 1655.44s/it] 

END OF EPOCH: 4 	Training loss per batch: 0.004119	Training_dev loss per batch: 0.004143	Test_dev loss per batch: 0.001449


 20%|██        | 5/25 [2:17:19<9:08:37, 1645.90s/it]

END OF EPOCH: 5 	Training loss per batch: 0.004088	Training_dev loss per batch: 0.003597	Test_dev loss per batch: 0.002288
END OF EPOCH: 6 	Training loss per batch: 0.003572	Training_dev loss per batch: 0.003157	Test_dev loss per batch: 0.001082


 28%|██▊       | 7/25 [3:12:38<8:14:56, 1649.80s/it]

END OF EPOCH: 7 	Training loss per batch: 0.003370	Training_dev loss per batch: 0.003049	Test_dev loss per batch: 0.001025


 32%|███▏      | 8/25 [3:40:21<7:48:34, 1653.77s/it]

END OF EPOCH: 8 	Training loss per batch: 0.003283	Training_dev loss per batch: 0.002556	Test_dev loss per batch: 0.000751


 36%|███▌      | 9/25 [4:07:43<7:19:59, 1650.00s/it]

END OF EPOCH: 9 	Training loss per batch: 0.003326	Training_dev loss per batch: 0.002645	Test_dev loss per batch: 0.000621
END OF EPOCH: 10 	Training loss per batch: 0.003077	Training_dev loss per batch: 0.002499	Test_dev loss per batch: 0.000990


 44%|████▍     | 11/25 [5:01:10<6:20:43, 1631.68s/it]

END OF EPOCH: 11 	Training loss per batch: 0.003124	Training_dev loss per batch: 0.002471	Test_dev loss per batch: 0.001148


 48%|████▊     | 12/25 [5:29:29<5:57:54, 1651.85s/it]

END OF EPOCH: 12 	Training loss per batch: 0.003105	Training_dev loss per batch: 0.002147	Test_dev loss per batch: 0.000568


 52%|█████▏    | 13/25 [5:56:34<5:28:44, 1643.68s/it]

END OF EPOCH: 13 	Training loss per batch: 0.003213	Training_dev loss per batch: 0.003187	Test_dev loss per batch: 0.000709


 56%|█████▌    | 14/25 [6:24:03<5:01:39, 1645.45s/it]

END OF EPOCH: 14 	Training loss per batch: 0.003090	Training_dev loss per batch: 0.002252	Test_dev loss per batch: 0.000753


 60%|██████    | 15/25 [6:51:01<4:32:52, 1637.25s/it]

END OF EPOCH: 15 	Training loss per batch: 0.003067	Training_dev loss per batch: 0.002564	Test_dev loss per batch: 0.000496
END OF EPOCH: 16 	Training loss per batch: 0.002766	Training_dev loss per batch: 0.002052	Test_dev loss per batch: 0.001059


 68%|██████▊   | 17/25 [7:45:24<3:37:43, 1632.90s/it]

END OF EPOCH: 17 	Training loss per batch: 0.002832	Training_dev loss per batch: 0.002021	Test_dev loss per batch: 0.000597


 72%|███████▏  | 18/25 [8:13:55<3:13:13, 1656.28s/it]

END OF EPOCH: 18 	Training loss per batch: 0.002798	Training_dev loss per batch: 0.002130	Test_dev loss per batch: 0.001192


 76%|███████▌  | 19/25 [8:41:31<2:45:38, 1656.36s/it]

END OF EPOCH: 19 	Training loss per batch: 0.002809	Training_dev loss per batch: 0.009669	Test_dev loss per batch: 0.001676
END OF EPOCH: 20 	Training loss per batch: 0.003002	Training_dev loss per batch: 0.001899	Test_dev loss per batch: 0.000571


 84%|████████▍ | 21/25 [9:38:17<1:52:49, 1692.32s/it]

END OF EPOCH: 21 	Training loss per batch: 0.002752	Training_dev loss per batch: 0.002042	Test_dev loss per batch: 0.000541


 88%|████████▊ | 22/25 [10:06:33<1:24:39, 1693.26s/it]

END OF EPOCH: 22 	Training loss per batch: 0.002775	Training_dev loss per batch: 0.002935	Test_dev loss per batch: 0.000452


 92%|█████████▏| 23/25 [10:33:53<55:54, 1677.36s/it]  

END OF EPOCH: 23 	Training loss per batch: 0.002648	Training_dev loss per batch: 0.002128	Test_dev loss per batch: 0.000869


 96%|█████████▌| 24/25 [11:00:21<27:30, 1650.57s/it]

END OF EPOCH: 24 	Training loss per batch: 0.002607	Training_dev loss per batch: 0.002079	Test_dev loss per batch: 0.000474


100%|██████████| 25/25 [11:27:00<00:00, 1648.84s/it]

END OF EPOCH: 25 	Training loss per batch: 0.002660	Training_dev loss per batch: 0.002553	Test_dev loss per batch: 0.000984
[4, 2]



  0%|          | 0/25 [00:00<?, ?it/s]

END OF EPOCH: 1 	Training loss per batch: 0.008943	Training_dev loss per batch: 0.003179	Test_dev loss per batch: 0.001378


  8%|▊         | 2/25 [55:29<10:39:10, 1667.42s/it]

END OF EPOCH: 2 	Training loss per batch: 0.004782	Training_dev loss per batch: 0.003319	Test_dev loss per batch: 0.001100


 12%|█▏        | 3/25 [1:23:16<10:11:18, 1667.19s/it]

END OF EPOCH: 3 	Training loss per batch: 0.004393	Training_dev loss per batch: 0.003549	Test_dev loss per batch: 0.001809


 16%|█▌        | 4/25 [1:50:39<9:41:01, 1660.09s/it] 

END OF EPOCH: 4 	Training loss per batch: 0.004059	Training_dev loss per batch: 0.003489	Test_dev loss per batch: 0.000802
END OF EPOCH: 5 	Training loss per batch: 0.004009	Training_dev loss per batch: 0.002459	Test_dev loss per batch: 0.000799


 24%|██▍       | 6/25 [2:44:39<8:39:05, 1639.22s/it]

END OF EPOCH: 6 	Training loss per batch: 0.003883	Training_dev loss per batch: 0.003195	Test_dev loss per batch: 0.000906


 28%|██▊       | 7/25 [3:12:59<8:17:13, 1657.42s/it]

END OF EPOCH: 7 	Training loss per batch: 0.003829	Training_dev loss per batch: 0.002811	Test_dev loss per batch: 0.000686


 32%|███▏      | 8/25 [3:40:31<7:49:05, 1655.64s/it]

END OF EPOCH: 8 	Training loss per batch: 0.003755	Training_dev loss per batch: 0.002483	Test_dev loss per batch: 0.001113


 36%|███▌      | 9/25 [4:07:03<7:16:27, 1636.73s/it]

END OF EPOCH: 9 	Training loss per batch: 0.003737	Training_dev loss per batch: 0.002864	Test_dev loss per batch: 0.000581
END OF EPOCH: 10 	Training loss per batch: 0.003613	Training_dev loss per batch: 0.002441	Test_dev loss per batch: 0.000591


 44%|████▍     | 11/25 [5:01:15<6:21:00, 1632.88s/it]

END OF EPOCH: 11 	Training loss per batch: 0.003365	Training_dev loss per batch: 0.002269	Test_dev loss per batch: 0.000748


 48%|████▊     | 12/25 [5:29:13<5:56:42, 1646.35s/it]

END OF EPOCH: 12 	Training loss per batch: 0.003233	Training_dev loss per batch: 0.002689	Test_dev loss per batch: 0.000679
END OF EPOCH: 13 	Training loss per batch: 0.003431	Training_dev loss per batch: 0.002261	Test_dev loss per batch: 0.000737


 56%|█████▌    | 14/25 [6:22:51<4:58:43, 1629.40s/it]

END OF EPOCH: 14 	Training loss per batch: 0.003340	Training_dev loss per batch: 0.002701	Test_dev loss per batch: 0.000744


 60%|██████    | 15/25 [6:49:47<4:30:53, 1625.32s/it]

END OF EPOCH: 15 	Training loss per batch: 0.003233	Training_dev loss per batch: 0.002441	Test_dev loss per batch: 0.001151


 64%|██████▍   | 16/25 [7:16:19<4:02:16, 1615.17s/it]

END OF EPOCH: 16 	Training loss per batch: 0.003082	Training_dev loss per batch: 0.037388	Test_dev loss per batch: 0.003384


 68%|██████▊   | 17/25 [7:44:03<3:37:17, 1629.75s/it]

END OF EPOCH: 17 	Training loss per batch: 0.003202	Training_dev loss per batch: 0.006544	Test_dev loss per batch: 0.000989


 72%|███████▏  | 18/25 [8:11:21<3:10:25, 1632.28s/it]

END OF EPOCH: 18 	Training loss per batch: 0.003239	Training_dev loss per batch: 0.003244	Test_dev loss per batch: 0.001095
END OF EPOCH: 19 	Training loss per batch: 0.002919	Training_dev loss per batch: 0.002177	Test_dev loss per batch: 0.000662


 80%|████████  | 20/25 [9:05:10<2:15:21, 1624.30s/it]

END OF EPOCH: 20 	Training loss per batch: 0.002870	Training_dev loss per batch: 0.002231	Test_dev loss per batch: 0.000948


 84%|████████▍ | 21/25 [9:32:58<1:49:09, 1637.39s/it]

END OF EPOCH: 21 	Training loss per batch: 0.002935	Training_dev loss per batch: 0.003432	Test_dev loss per batch: 0.001134


 88%|████████▊ | 22/25 [10:00:47<1:22:20, 1646.72s/it]

END OF EPOCH: 22 	Training loss per batch: 0.002881	Training_dev loss per batch: 0.002488	Test_dev loss per batch: 0.000674
END OF EPOCH: 23 	Training loss per batch: 0.002669	Training_dev loss per batch: 0.002189	Test_dev loss per batch: 0.000528


 96%|█████████▌| 24/25 [10:54:56<27:17, 1637.96s/it]  

END OF EPOCH: 24 	Training loss per batch: 0.002710	Training_dev loss per batch: 0.002447	Test_dev loss per batch: 0.000753


100%|██████████| 25/25 [11:20:57<00:00, 1634.28s/it]

END OF EPOCH: 25 	Training loss per batch: 0.003057	Training_dev loss per batch: 0.002239	Test_dev loss per batch: 0.001987
[2, 2]



  0%|          | 0/25 [00:00<?, ?it/s]

END OF EPOCH: 1 	Training loss per batch: 0.012598	Training_dev loss per batch: 0.005848	Test_dev loss per batch: 0.001679


  8%|▊         | 2/25 [54:23<10:19:38, 1616.45s/it]

END OF EPOCH: 2 	Training loss per batch: 0.005998	Training_dev loss per batch: 0.005477	Test_dev loss per batch: 0.001433


 12%|█▏        | 3/25 [1:21:37<9:54:39, 1621.78s/it]

END OF EPOCH: 3 	Training loss per batch: 0.005635	Training_dev loss per batch: 0.004690	Test_dev loss per batch: 0.001214


 16%|█▌        | 4/25 [1:48:49<9:28:42, 1624.89s/it]

END OF EPOCH: 4 	Training loss per batch: 0.005004	Training_dev loss per batch: 0.005135	Test_dev loss per batch: 0.001375
END OF EPOCH: 5 	Training loss per batch: 0.005056	Training_dev loss per batch: 0.004290	Test_dev loss per batch: 0.001144


 24%|██▍       | 6/25 [2:42:00<8:31:35, 1615.55s/it]

END OF EPOCH: 6 	Training loss per batch: 0.004594	Training_dev loss per batch: 0.004446	Test_dev loss per batch: 0.002561


 28%|██▊       | 7/25 [3:09:54<8:09:55, 1633.10s/it]

END OF EPOCH: 7 	Training loss per batch: 0.004581	Training_dev loss per batch: 0.004129	Test_dev loss per batch: 0.002216
END OF EPOCH: 8 	Training loss per batch: 0.004821	Training_dev loss per batch: 0.003943	Test_dev loss per batch: 0.001361


 36%|███▌      | 9/25 [4:03:38<7:12:36, 1622.25s/it]

END OF EPOCH: 9 	Training loss per batch: 0.004278	Training_dev loss per batch: 0.004418	Test_dev loss per batch: 0.000932
END OF EPOCH: 10 	Training loss per batch: 0.004287	Training_dev loss per batch: 0.003446	Test_dev loss per batch: 0.000894


 44%|████▍     | 11/25 [4:57:14<6:17:13, 1616.66s/it]

END OF EPOCH: 11 	Training loss per batch: 0.004241	Training_dev loss per batch: 0.003293	Test_dev loss per batch: 0.000891


 48%|████▊     | 12/25 [5:24:13<5:50:24, 1617.29s/it]

END OF EPOCH: 12 	Training loss per batch: 0.004199	Training_dev loss per batch: 0.003628	Test_dev loss per batch: 0.000893


 52%|█████▏    | 13/25 [5:51:54<5:26:02, 1630.23s/it]

END OF EPOCH: 13 	Training loss per batch: 0.004262	Training_dev loss per batch: 0.003341	Test_dev loss per batch: 0.001389


 56%|█████▌    | 14/25 [6:19:14<4:59:27, 1633.37s/it]

END OF EPOCH: 14 	Training loss per batch: 0.004267	Training_dev loss per batch: 0.003842	Test_dev loss per batch: 0.002724


 60%|██████    | 15/25 [6:45:18<4:28:44, 1612.46s/it]

END OF EPOCH: 15 	Training loss per batch: 0.004260	Training_dev loss per batch: 0.003371	Test_dev loss per batch: 0.000798


 64%|██████▍   | 16/25 [7:12:44<4:03:24, 1622.68s/it]

END OF EPOCH: 16 	Training loss per batch: 0.004189	Training_dev loss per batch: 0.003554	Test_dev loss per batch: 0.001142


 68%|██████▊   | 17/25 [7:40:40<3:38:29, 1638.63s/it]

END OF EPOCH: 17 	Training loss per batch: 0.003908	Training_dev loss per batch: 0.003600	Test_dev loss per batch: 0.001283


 72%|███████▏  | 18/25 [8:08:26<3:12:07, 1646.85s/it]

END OF EPOCH: 18 	Training loss per batch: 0.004000	Training_dev loss per batch: 0.003366	Test_dev loss per batch: 0.000834


 76%|███████▌  | 19/25 [8:34:34<2:42:18, 1623.04s/it]

END OF EPOCH: 19 	Training loss per batch: 0.004043	Training_dev loss per batch: 0.003302	Test_dev loss per batch: 0.000995
END OF EPOCH: 20 	Training loss per batch: 0.003978	Training_dev loss per batch: 0.003269	Test_dev loss per batch: 0.000946


 84%|████████▍ | 21/25 [9:28:24<1:47:52, 1618.03s/it]

END OF EPOCH: 21 	Training loss per batch: 0.004125	Training_dev loss per batch: 0.002980	Test_dev loss per batch: 0.000833


 88%|████████▊ | 22/25 [9:55:30<1:21:01, 1620.51s/it]

END OF EPOCH: 22 	Training loss per batch: 0.003997	Training_dev loss per batch: 0.003324	Test_dev loss per batch: 0.001058


 92%|█████████▏| 23/25 [10:23:05<54:21, 1630.90s/it] 

END OF EPOCH: 23 	Training loss per batch: 0.003862	Training_dev loss per batch: 0.003488	Test_dev loss per batch: 0.000835


 96%|█████████▌| 24/25 [10:50:26<27:13, 1633.92s/it]

END OF EPOCH: 24 	Training loss per batch: 0.003935	Training_dev loss per batch: 0.003242	Test_dev loss per batch: 0.001176


100%|██████████| 25/25 [11:16:22<00:00, 1623.32s/it]

END OF EPOCH: 25 	Training loss per batch: 0.003803	Training_dev loss per batch: 0.003428	Test_dev loss per batch: 0.001050
[8, 2]



  0%|          | 0/25 [00:00<?, ?it/s]

END OF EPOCH: 1 	Training loss per batch: 0.012381	Training_dev loss per batch: 0.011879	Test_dev loss per batch: 0.004082


  8%|▊         | 2/25 [55:34<10:39:46, 1668.97s/it]

END OF EPOCH: 2 	Training loss per batch: 0.004809	Training_dev loss per batch: 0.003854	Test_dev loss per batch: 0.002754


 12%|█▏        | 3/25 [1:23:31<10:12:51, 1671.43s/it]

END OF EPOCH: 3 	Training loss per batch: 0.004563	Training_dev loss per batch: 0.005315	Test_dev loss per batch: 0.003669
END OF EPOCH: 4 	Training loss per batch: 0.004351	Training_dev loss per batch: 0.002890	Test_dev loss per batch: 0.001024


 20%|██        | 5/25 [2:17:04<9:06:16, 1638.81s/it] 

END OF EPOCH: 5 	Training loss per batch: 0.003692	Training_dev loss per batch: 0.002624	Test_dev loss per batch: 0.000778


 24%|██▍       | 6/25 [2:44:18<8:38:25, 1637.12s/it]

END OF EPOCH: 6 	Training loss per batch: 0.003910	Training_dev loss per batch: 0.002935	Test_dev loss per batch: 0.001780
END OF EPOCH: 7 	Training loss per batch: 0.003483	Training_dev loss per batch: 0.002517	Test_dev loss per batch: 0.000795


 32%|███▏      | 8/25 [3:39:13<7:46:10, 1645.33s/it]

END OF EPOCH: 8 	Training loss per batch: 0.003495	Training_dev loss per batch: 0.007098	Test_dev loss per batch: 0.000888
END OF EPOCH: 9 	Training loss per batch: 0.003222	Training_dev loss per batch: 0.002248	Test_dev loss per batch: 0.001433


 40%|████      | 10/25 [4:33:06<6:46:57, 1627.86s/it]

END OF EPOCH: 10 	Training loss per batch: 0.003204	Training_dev loss per batch: 0.002310	Test_dev loss per batch: 0.000721


 44%|████▍     | 11/25 [4:59:48<6:18:00, 1620.07s/it]

END OF EPOCH: 11 	Training loss per batch: 0.003356	Training_dev loss per batch: 0.002006	Test_dev loss per batch: 0.000612


 48%|████▊     | 12/25 [5:27:49<5:54:57, 1638.29s/it]

END OF EPOCH: 12 	Training loss per batch: 0.003017	Training_dev loss per batch: 0.002029	Test_dev loss per batch: 0.000651


 52%|█████▏    | 13/25 [5:55:15<5:28:10, 1640.84s/it]

END OF EPOCH: 13 	Training loss per batch: 0.002971	Training_dev loss per batch: 0.002004	Test_dev loss per batch: 0.000691


 56%|█████▌    | 14/25 [6:21:44<4:57:56, 1625.12s/it]

END OF EPOCH: 14 	Training loss per batch: 0.002792	Training_dev loss per batch: 0.002032	Test_dev loss per batch: 0.000498


 60%|██████    | 15/25 [6:48:02<4:28:29, 1610.95s/it]

END OF EPOCH: 15 	Training loss per batch: 0.002839	Training_dev loss per batch: 0.002366	Test_dev loss per batch: 0.000791


 64%|██████▍   | 16/25 [7:15:15<4:02:39, 1617.78s/it]

END OF EPOCH: 16 	Training loss per batch: 0.002799	Training_dev loss per batch: 0.002155	Test_dev loss per batch: 0.000705
END OF EPOCH: 17 	Training loss per batch: 0.002970	Training_dev loss per batch: 0.001929	Test_dev loss per batch: 0.000682


 72%|███████▏  | 18/25 [8:08:54<3:07:56, 1610.93s/it]

END OF EPOCH: 18 	Training loss per batch: 0.002782	Training_dev loss per batch: 0.001961	Test_dev loss per batch: 0.000677


 76%|███████▌  | 19/25 [8:35:49<2:41:13, 1612.20s/it]

END OF EPOCH: 19 	Training loss per batch: 0.002693	Training_dev loss per batch: 0.002032	Test_dev loss per batch: 0.000739


 80%|████████  | 20/25 [9:02:26<2:13:57, 1607.59s/it]

END OF EPOCH: 20 	Training loss per batch: 0.002599	Training_dev loss per batch: 0.002282	Test_dev loss per batch: 0.001571


 84%|████████▍ | 21/25 [9:28:40<1:46:30, 1597.54s/it]

END OF EPOCH: 21 	Training loss per batch: 0.002614	Training_dev loss per batch: 0.002616	Test_dev loss per batch: 0.000650


 88%|████████▊ | 22/25 [9:56:17<1:20:45, 1615.33s/it]

END OF EPOCH: 22 	Training loss per batch: 0.002582	Training_dev loss per batch: 0.002055	Test_dev loss per batch: 0.000789
END OF EPOCH: 23 	Training loss per batch: 0.002718	Training_dev loss per batch: 0.001854	Test_dev loss per batch: 0.000517


 96%|█████████▌| 24/25 [10:50:47<27:04, 1624.98s/it] 

END OF EPOCH: 24 	Training loss per batch: 0.002472	Training_dev loss per batch: 0.001937	Test_dev loss per batch: 0.000464


100%|██████████| 25/25 [11:16:21<00:00, 1623.28s/it]

END OF EPOCH: 25 	Training loss per batch: 0.002359	Training_dev loss per batch: 0.001971	Test_dev loss per batch: 0.000517
[4, 4]



  0%|          | 0/25 [00:00<?, ?it/s]

END OF EPOCH: 1 	Training loss per batch: 0.011719	Training_dev loss per batch: 0.007038	Test_dev loss per batch: 0.002511


  8%|▊         | 2/25 [54:28<10:24:02, 1627.92s/it]

END OF EPOCH: 2 	Training loss per batch: 0.004782	Training_dev loss per batch: 0.003373	Test_dev loss per batch: 0.001325


 12%|█▏        | 3/25 [1:20:41<9:50:56, 1611.68s/it]

END OF EPOCH: 3 	Training loss per batch: 0.004172	Training_dev loss per batch: 0.005757	Test_dev loss per batch: 0.005255
END OF EPOCH: 4 	Training loss per batch: 0.004288	Training_dev loss per batch: 0.003055	Test_dev loss per batch: 0.000900


 20%|██        | 5/25 [2:13:03<8:49:34, 1588.74s/it]

END OF EPOCH: 5 	Training loss per batch: 0.003921	Training_dev loss per batch: 0.004553	Test_dev loss per batch: 0.004050


 24%|██▍       | 6/25 [2:40:15<8:27:14, 1601.82s/it]

END OF EPOCH: 6 	Training loss per batch: 0.003961	Training_dev loss per batch: 0.002647	Test_dev loss per batch: 0.000927


 28%|██▊       | 7/25 [3:07:07<8:01:28, 1604.93s/it]

END OF EPOCH: 7 	Training loss per batch: 0.003607	Training_dev loss per batch: 0.005697	Test_dev loss per batch: 0.007376


 32%|███▏      | 8/25 [3:34:13<7:36:30, 1611.23s/it]

END OF EPOCH: 8 	Training loss per batch: 0.003631	Training_dev loss per batch: 0.003170	Test_dev loss per batch: 0.000635
END OF EPOCH: 9 	Training loss per batch: 0.003530	Training_dev loss per batch: 0.002463	Test_dev loss per batch: 0.001132


 40%|████      | 10/25 [4:26:48<6:37:15, 1589.00s/it]

END OF EPOCH: 10 	Training loss per batch: 0.003382	Training_dev loss per batch: 0.003580	Test_dev loss per batch: 0.001500


 44%|████▍     | 11/25 [4:54:06<6:14:14, 1603.91s/it]

END OF EPOCH: 11 	Training loss per batch: 0.003377	Training_dev loss per batch: 0.002550	Test_dev loss per batch: 0.001122
END OF EPOCH: 12 	Training loss per batch: 0.003311	Training_dev loss per batch: 0.002294	Test_dev loss per batch: 0.000650


 52%|█████▏    | 13/25 [5:47:18<5:19:10, 1595.91s/it]

END OF EPOCH: 13 	Training loss per batch: 0.003248	Training_dev loss per batch: 0.003234	Test_dev loss per batch: 0.001556


 56%|█████▌    | 14/25 [6:13:39<4:51:47, 1591.56s/it]

END OF EPOCH: 14 	Training loss per batch: 0.003118	Training_dev loss per batch: 0.002519	Test_dev loss per batch: 0.001069


 60%|██████    | 15/25 [6:39:30<4:23:14, 1579.43s/it]

END OF EPOCH: 15 	Training loss per batch: 0.002906	Training_dev loss per batch: 0.002995	Test_dev loss per batch: 0.001288


 64%|██████▍   | 16/25 [7:06:17<3:58:08, 1587.60s/it]

END OF EPOCH: 16 	Training loss per batch: 0.003074	Training_dev loss per batch: 0.002591	Test_dev loss per batch: 0.000726


 68%|██████▊   | 17/25 [7:33:02<3:32:22, 1592.78s/it]

END OF EPOCH: 17 	Training loss per batch: 0.003047	Training_dev loss per batch: 0.002765	Test_dev loss per batch: 0.000653


 72%|███████▏  | 18/25 [7:59:31<3:05:42, 1591.75s/it]

END OF EPOCH: 18 	Training loss per batch: 0.003189	Training_dev loss per batch: 0.002331	Test_dev loss per batch: 0.001037


 76%|███████▌  | 19/25 [8:26:19<2:39:39, 1596.62s/it]

END OF EPOCH: 19 	Training loss per batch: 0.002998	Training_dev loss per batch: 0.003330	Test_dev loss per batch: 0.000830
END OF EPOCH: 20 	Training loss per batch: 0.003001	Training_dev loss per batch: 0.001999	Test_dev loss per batch: 0.000593


 84%|████████▍ | 21/25 [9:18:41<1:45:53, 1588.30s/it]

END OF EPOCH: 21 	Training loss per batch: 0.002710	Training_dev loss per batch: 0.002138	Test_dev loss per batch: 0.000620


 88%|████████▊ | 22/25 [9:45:51<1:20:01, 1600.56s/it]

END OF EPOCH: 22 	Training loss per batch: 0.002696	Training_dev loss per batch: 0.002184	Test_dev loss per batch: 0.000706


 92%|█████████▏| 23/25 [10:13:05<53:41, 1610.79s/it] 

END OF EPOCH: 23 	Training loss per batch: 0.003048	Training_dev loss per batch: 0.002058	Test_dev loss per batch: 0.001006


 96%|█████████▌| 24/25 [10:39:20<26:39, 1599.93s/it]

END OF EPOCH: 24 	Training loss per batch: 0.002899	Training_dev loss per batch: 0.002053	Test_dev loss per batch: 0.001172


100%|██████████| 25/25 [11:05:51<00:00, 1598.05s/it]

END OF EPOCH: 25 	Training loss per batch: 0.002780	Training_dev loss per batch: 0.003271	Test_dev loss per batch: 0.002742
[64, 1]



  0%|          | 0/25 [00:00<?, ?it/s]

END OF EPOCH: 1 	Training loss per batch: 0.010058	Training_dev loss per batch: 0.004914	Test_dev loss per batch: 0.002205


  8%|▊         | 2/25 [53:11<10:13:37, 1600.75s/it]

END OF EPOCH: 2 	Training loss per batch: 0.005674	Training_dev loss per batch: 0.004042	Test_dev loss per batch: 0.001229


 12%|█▏        | 3/25 [1:21:00<9:54:31, 1621.43s/it]

END OF EPOCH: 3 	Training loss per batch: 0.005421	Training_dev loss per batch: 0.003608	Test_dev loss per batch: 0.001113


 16%|█▌        | 4/25 [1:47:39<9:25:08, 1614.69s/it]

END OF EPOCH: 4 	Training loss per batch: 0.004797	Training_dev loss per batch: 0.019695	Test_dev loss per batch: 0.001060


 20%|██        | 5/25 [2:13:56<8:54:24, 1603.24s/it]

END OF EPOCH: 5 	Training loss per batch: 0.004972	Training_dev loss per batch: 0.004396	Test_dev loss per batch: 0.001639


 24%|██▍       | 6/25 [2:39:54<8:23:26, 1589.81s/it]

END OF EPOCH: 6 	Training loss per batch: 0.004843	Training_dev loss per batch: 0.003705	Test_dev loss per batch: 0.001391
END OF EPOCH: 7 	Training loss per batch: 0.004171	Training_dev loss per batch: 0.003220	Test_dev loss per batch: 0.000885


 32%|███▏      | 8/25 [3:33:28<7:32:33, 1597.25s/it]

END OF EPOCH: 8 	Training loss per batch: 0.004025	Training_dev loss per batch: 0.002610	Test_dev loss per batch: 0.000622


 36%|███▌      | 9/25 [3:59:19<7:02:11, 1583.23s/it]

END OF EPOCH: 9 	Training loss per batch: 0.003488	Training_dev loss per batch: 0.003050	Test_dev loss per batch: 0.002329


 40%|████      | 10/25 [4:25:23<6:34:24, 1577.62s/it]

END OF EPOCH: 10 	Training loss per batch: 0.003508	Training_dev loss per batch: 0.003160	Test_dev loss per batch: 0.000893


 44%|████▍     | 11/25 [4:51:56<6:09:11, 1582.27s/it]

END OF EPOCH: 11 	Training loss per batch: 0.003504	Training_dev loss per batch: 0.002757	Test_dev loss per batch: 0.001310


 48%|████▊     | 12/25 [5:19:25<5:47:08, 1602.21s/it]

END OF EPOCH: 12 	Training loss per batch: 0.003304	Training_dev loss per batch: 0.003862	Test_dev loss per batch: 0.000786


 52%|█████▏    | 13/25 [5:45:49<5:19:21, 1596.82s/it]

END OF EPOCH: 13 	Training loss per batch: 0.003194	Training_dev loss per batch: 0.004782	Test_dev loss per batch: 0.001083


 56%|█████▌    | 14/25 [6:12:30<4:52:58, 1598.08s/it]

END OF EPOCH: 14 	Training loss per batch: 0.003293	Training_dev loss per batch: 0.002768	Test_dev loss per batch: 0.000764
END OF EPOCH: 15 	Training loss per batch: 0.003050	Training_dev loss per batch: 0.002514	Test_dev loss per batch: 0.000720


 64%|██████▍   | 16/25 [7:05:29<3:58:55, 1592.85s/it]

END OF EPOCH: 16 	Training loss per batch: 0.003154	Training_dev loss per batch: 0.003046	Test_dev loss per batch: 0.001884


 68%|██████▊   | 17/25 [7:32:08<3:32:38, 1594.79s/it]

END OF EPOCH: 17 	Training loss per batch: 0.003066	Training_dev loss per batch: 0.002078	Test_dev loss per batch: 0.000943


 72%|███████▏  | 18/25 [7:58:55<3:06:27, 1598.23s/it]

END OF EPOCH: 18 	Training loss per batch: 0.002768	Training_dev loss per batch: 0.002578	Test_dev loss per batch: 0.000519


 76%|███████▌  | 19/25 [8:24:49<2:38:29, 1584.96s/it]

END OF EPOCH: 19 	Training loss per batch: 0.003017	Training_dev loss per batch: 0.002273	Test_dev loss per batch: 0.000982
END OF EPOCH: 20 	Training loss per batch: 0.002828	Training_dev loss per batch: 0.002065	Test_dev loss per batch: 0.000613


 84%|████████▍ | 21/25 [9:19:27<1:47:53, 1618.36s/it]

END OF EPOCH: 21 	Training loss per batch: 0.002755	Training_dev loss per batch: 0.002133	Test_dev loss per batch: 0.000505
END OF EPOCH: 22 	Training loss per batch: 0.002505	Training_dev loss per batch: 0.001926	Test_dev loss per batch: 0.000433


 92%|█████████▏| 23/25 [10:13:20<53:43, 1611.64s/it] 

END OF EPOCH: 23 	Training loss per batch: 0.002418	Training_dev loss per batch: 0.001931	Test_dev loss per batch: 0.000591


 96%|█████████▌| 24/25 [10:39:47<26:44, 1604.28s/it]

END OF EPOCH: 24 	Training loss per batch: 0.002410	Training_dev loss per batch: 0.002199	Test_dev loss per batch: 0.000457


100%|██████████| 25/25 [11:05:15<00:00, 1596.63s/it]

END OF EPOCH: 25 	Training loss per batch: 0.002075	Training_dev loss per batch: 0.001933	Test_dev loss per batch: 0.000414
[1, 8]



  0%|          | 0/25 [00:00<?, ?it/s]

END OF EPOCH: 1 	Training loss per batch: 0.041220	Training_dev loss per batch: 0.025990	Test_dev loss per batch: 0.045805


  8%|▊         | 2/25 [53:53<10:14:10, 1602.19s/it]

END OF EPOCH: 2 	Training loss per batch: 0.024356	Training_dev loss per batch: 0.024706	Test_dev loss per batch: 0.036040


 12%|█▏        | 3/25 [1:20:00<9:43:36, 1591.68s/it]

END OF EPOCH: 3 	Training loss per batch: 0.024396	Training_dev loss per batch: 0.024767	Test_dev loss per batch: 0.033499


 16%|█▌        | 4/25 [1:46:16<9:15:28, 1587.07s/it]

END OF EPOCH: 4 	Training loss per batch: 0.014964	Training_dev loss per batch: 0.008471	Test_dev loss per batch: 0.013197


 20%|██        | 5/25 [2:11:33<8:41:59, 1565.95s/it]

END OF EPOCH: 5 	Training loss per batch: 0.007091	Training_dev loss per batch: 0.005160	Test_dev loss per batch: 0.008455


 24%|██▍       | 6/25 [2:37:56<8:17:31, 1571.16s/it]

END OF EPOCH: 6 	Training loss per batch: 0.005839	Training_dev loss per batch: 0.004640	Test_dev loss per batch: 0.005738


 28%|██▊       | 7/25 [3:04:43<7:54:32, 1581.82s/it]

END OF EPOCH: 7 	Training loss per batch: 0.005206	Training_dev loss per batch: 0.003921	Test_dev loss per batch: 0.004438


 32%|███▏      | 8/25 [3:30:24<7:24:40, 1569.44s/it]

END OF EPOCH: 8 	Training loss per batch: 0.004928	Training_dev loss per batch: 0.005084	Test_dev loss per batch: 0.004136


 36%|███▌      | 9/25 [3:56:51<6:59:58, 1574.93s/it]

END OF EPOCH: 9 	Training loss per batch: 0.004749	Training_dev loss per batch: 0.004939	Test_dev loss per batch: 0.005642
END OF EPOCH: 10 	Training loss per batch: 0.004553	Training_dev loss per batch: 0.003626	Test_dev loss per batch: 0.003295


 44%|████▍     | 11/25 [4:48:54<6:06:38, 1571.29s/it]

END OF EPOCH: 11 	Training loss per batch: 0.004575	Training_dev loss per batch: 0.007114	Test_dev loss per batch: 0.003570
END OF EPOCH: 12 	Training loss per batch: 0.004256	Training_dev loss per batch: 0.003336	Test_dev loss per batch: 0.002595


 52%|█████▏    | 13/25 [5:42:03<5:16:49, 1584.09s/it]

END OF EPOCH: 13 	Training loss per batch: 0.004179	Training_dev loss per batch: 0.002986	Test_dev loss per batch: 0.002380


 56%|█████▌    | 14/25 [6:08:23<4:50:11, 1582.83s/it]

END OF EPOCH: 14 	Training loss per batch: 0.004205	Training_dev loss per batch: 0.003646	Test_dev loss per batch: 0.002374
END OF EPOCH: 15 	Training loss per batch: 0.004180	Training_dev loss per batch: 0.002888	Test_dev loss per batch: 0.001980


 64%|██████▍   | 16/25 [7:00:14<3:56:00, 1573.34s/it]

END OF EPOCH: 16 	Training loss per batch: 0.004053	Training_dev loss per batch: 0.003918	Test_dev loss per batch: 0.002251


 68%|██████▊   | 17/25 [7:26:38<3:30:13, 1576.73s/it]

END OF EPOCH: 17 	Training loss per batch: 0.004188	Training_dev loss per batch: 0.003125	Test_dev loss per batch: 0.001632


 72%|███████▏  | 18/25 [7:52:53<3:03:51, 1575.94s/it]

END OF EPOCH: 18 	Training loss per batch: 0.004056	Training_dev loss per batch: 0.004213	Test_dev loss per batch: 0.002383


 76%|███████▌  | 19/25 [8:17:59<2:35:29, 1554.99s/it]

END OF EPOCH: 19 	Training loss per batch: 0.003993	Training_dev loss per batch: 0.003169	Test_dev loss per batch: 0.001846


 80%|████████  | 20/25 [8:43:11<2:08:30, 1542.10s/it]

END OF EPOCH: 20 	Training loss per batch: 0.003947	Training_dev loss per batch: 0.004722	Test_dev loss per batch: 0.002591
END OF EPOCH: 21 	Training loss per batch: 0.003892	Training_dev loss per batch: 0.002847	Test_dev loss per batch: 0.001692


 88%|████████▊ | 22/25 [9:35:36<1:17:50, 1556.73s/it]

END OF EPOCH: 22 	Training loss per batch: 0.003588	Training_dev loss per batch: 0.002910	Test_dev loss per batch: 0.001458


 92%|█████████▏| 23/25 [10:02:18<52:20, 1570.30s/it] 

END OF EPOCH: 23 	Training loss per batch: 0.003874	Training_dev loss per batch: 0.003343	Test_dev loss per batch: 0.002313


 96%|█████████▌| 24/25 [10:29:07<26:21, 1581.86s/it]

END OF EPOCH: 24 	Training loss per batch: 0.003660	Training_dev loss per batch: 0.004716	Test_dev loss per batch: 0.002492


100%|██████████| 25/25 [10:54:13<00:00, 1570.14s/it]

END OF EPOCH: 25 	Training loss per batch: 0.003753	Training_dev loss per batch: 0.003596	Test_dev loss per batch: 0.002003



