# Dataset

In [1]:
import torch
import torch.nn as nn

In [2]:
# 1. GPU configuration
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [9]:
import pickle5 as pickle
from tqdm.auto import tqdm
from torch.utils.data.dataset import Dataset
class MelodyDataset(Dataset):
    def __init__(self, input_data_path, event_dict_path, chord_dict_path, data_type):
        ## -- event dict
        self.event_dict_path = event_dict_path
        self.event2word, self.word2event = pickle.load(open(self.event_dict_path, 'rb'))
        self.event2word['End'] = 308 # cf. 'bar_None' which is the start point of melody => 0 
        self.word2event[308] = 'End'
        self.data_type = data_type
        
        ## -- chord dict
        self.chord_dict_path = chord_dict_path
        with open(self.chord_dict_path, 'rb') as handle:
            self.chord2idx = pickle.load(handle)
        #self.chord2idx = pickle.load(open(self.chord_dict_path, 'rb'))
        
        ## -- input data pair
        self.input_data_path = input_data_path
        with open(self.input_data_path, 'rb') as handle:
            self.all_chord_event_list = pickle.load(handle)
        self.data_num = len(self.all_chord_event_list)
        
        ## -- chord, event -> idx
        for chord_event_dict in tqdm(self.all_chord_event_list):
            words = []

            ## -- event to idx
            for event in chord_event_dict['Events']:
                e = '{}_{}'.format(event.name, event.value)
                if e in self.event2word:
                    words.append(self.event2word[e])
                else:
                    # OOV
                    if event.name == 'Note Velocity':
                        # replace with max velocity based on our training data
                        words.append(self.event2word['Note Velocity_21'])
                    else:
                        # something is wrong
                        # you should handle it for your own purpose
                        print('something is wrong! {}'.format(e))
            words.append(self.event2word['End'])
            chord_event_dict['Events'] = words

            ## -- chord to idx
            chord_words = []
            for chord in chord_event_dict['Chord']:
                chord_words.append(self.chord2idx[chord])
            chord_event_dict['Chord'] = chord_words
        
        ## -- split train, valid, test
        if data_type == "train":
            self.data_list = self.all_chord_event_list[:int(self.data_num*0.9)]
        elif data_type == "valid":
            self.data_list = self.all_chord_event_list[int(self.data_num*0.9):]
    
    def __len__(self):
        return len(self.data_list)
        
    def __getitem__(self,idx):
        input_chords = self.data_list[idx]['Chord']
        output_events = self.data_list[idx]['Events']
        return torch.tensor(input_chords), torch.tensor(output_events)
        

In [10]:
## check event => idx 
event_dict_path =  "./REMI-tempo-chord-checkpoint/dictionary.pkl"
test = pickle.load(open(event_dict_path, 'rb'))
test

({'Bar_None': 0,
  'Position_1/16': 1,
  'Chord_N:N': 2,
  'Tempo Class_mid': 3,
  'Tempo Value_30': 4,
  'Position_5/16': 5,
  'Position_9/16': 6,
  'Chord_C:maj': 7,
  'Position_13/16': 8,
  'Tempo Class_slow': 9,
  'Tempo Value_33': 10,
  'Position_16/16': 11,
  'Note Velocity_12': 12,
  'Note On_72': 13,
  'Note Duration_1': 14,
  'Tempo Value_25': 15,
  'Note Velocity_14': 16,
  'Note On_76': 17,
  'Note Duration_16': 18,
  'Note Velocity_16': 19,
  'Note On_79': 20,
  'Note On_60': 21,
  'Note On_64': 22,
  'Note Velocity_13': 23,
  'Note On_67': 24,
  'Note On_71': 25,
  'Note On_83': 26,
  'Tempo Value_17': 27,
  'Note Velocity_18': 28,
  'Note On_86': 29,
  'Note Duration_8': 30,
  'Position_8/16': 31,
  'Note On_74': 32,
  'Chord_E:min': 33,
  'Note On_78': 34,
  'Note Velocity_11': 35,
  'Note On_59': 36,
  'Note Velocity_9': 37,
  'Note On_62': 38,
  'Note On_66': 39,
  'Note On_69': 40,
  'Note Duration_13': 41,
  'Note Velocity_17': 42,
  'Note On_81': 43,
  'Note Duratio

In [11]:

## generate dataset
input_data_path = "./data/all_chord_4_bars.pkl"
event_dict_path =  "./data/event_dictionary.pkl"
chord_dict_path = "./data/chord2idx_dict.pkl"
train_dataset = MelodyDataset(input_data_path, event_dict_path, chord_dict_path,"train")
valid_dataset = MelodyDataset(input_data_path, event_dict_path, chord_dict_path,"valid")

  0%|          | 0/17095 [00:00<?, ?it/s]

  0%|          | 0/17095 [00:00<?, ?it/s]

In [12]:
from torch.utils.data import DataLoader
train_loader = DataLoader(train_dataset, batch_size=1)
valid_loader = DataLoader(valid_dataset, batch_size=1)

input_chords, output_events = next(iter(train_dataset))
len(input_chords), len(output_events)
len(train_loader), len(valid_loader)

(15385, 1710)

# Model

In [13]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        
        #src = [src len, batch size]
        
        embedded = self.dropout(self.embedding(src))
        
        #embedded = [src len, batch size, emb dim]
        
        outputs, (hidden, cell) = self.rnn(embedded)
        
        #outputs = [src len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #outputs are always from the top hidden layer
        
        return hidden, cell
    

In [14]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
        
        self.fc_out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, cell):
        
        #input = [batch size]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #n directions in the decoder will both always be 1, therefore:
        #hidden = [n layers, batch size, hid dim]
        #context = [n layers, batch size, hid dim]
        
        input = input.unsqueeze(0)
        
        #input = [1, batch size]
        
        embedded = self.dropout(self.embedding(input))
        
        #embedded = [1, batch size, emb dim]
                
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        
        #output = [seq len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #seq len and n directions will always be 1 in the decoder, therefore:
        #output = [1, batch size, hid dim]
        #hidden = [n layers, batch size, hid dim]
        #cell = [n layers, batch size, hid dim]
        
        prediction = self.fc_out(output.squeeze(0))
        
        #prediction = [batch size, output dim]
        
        return prediction, hidden, cell

In [15]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        
        #src = [src len, batch size]
        #trg = [trg len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        #last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden, cell = self.encoder(src)
        
        #first input to the decoder is the <sos> tokens
        input = trg[0,:]
        
        for t in range(1, trg_len):
            
            #insert input token embedding, previous hidden and previous cell states
            #receive output tensor (predictions) and new hidden and cell states
            output, hidden, cell = self.decoder(input, hidden, cell)
            
            #place predictions in a tensor holding predictions for each token
            outputs[t] = output
            
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            
            #get the highest predicted token from our predictions
            top1 = output.argmax(1) 
            
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = trg[t] if teacher_force else top1
        
        return outputs

In [16]:
'''
INPUT_DIM = 60
ENC_EMB_DIM = 16

OUTPUT_DIM = 309
DEC_EMB_DIM = 32

HID_DIM = 512
N_LAYERS = 2
'''
INPUT_DIM = 60
ENC_EMB_DIM = 32

OUTPUT_DIM = 309
DEC_EMB_DIM = 64

HID_DIM = 512
N_LAYERS = 3

ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device).to(device)

In [17]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(60, 32)
    (rnn): LSTM(32, 512, num_layers=3, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(309, 64)
    (rnn): LSTM(64, 512, num_layers=3, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=309, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [18]:
input_chords, output_events = next(iter(train_dataset))
output_events

tensor([  0,   1, 116,   1,   9, 256, 109,  23,  98,  69, 109,  37,  91, 114,
        109,  23, 100,  69, 109,  12,  40,  75,   5,   9, 172,   5,  16,  25,
        114,   5,  16,  32,  71,   5,  62,  34, 104,   6,   9, 221,   8,   9,
        181,   0,   1, 116,   1,   9, 106,   1,  52,  17,  47,   1,  42,  43,
        127,   1,  12,  89,  47,   1,  23,  22, 112,   1,  16,  40,  47,   1,
         23,  32,  54,   5,   9, 255,   6,   9, 181,   8,   9, 173,   0,   1,
        111,   1,   9, 194,   1,  52,  88,  64,   1,  16,  89,  74,   1,  19,
         40,  74,   1,  28,  17,  47,   1,  19,  25,  47,   1,  52,  32, 250,
          5,   9, 136,   6,   9, 217,   8,   9, 191,   0,   1, 283,   1,   9,
        194,   1,  12, 184, 201,   1,  35,  91, 201,   1,  23,  38, 126,   1,
         23, 140,  14,   1,  52,  25, 201,   1,  52,  17, 279,   5,   9, 175,
          6, 283,   6,   9, 194, 120,  52,  34, 127,   8,   9, 172,   8,  12,
        161,  53,  80,  16,  29,  44,  60,  35, 161, 130, 308])

# Train

In [19]:
import torch.optim as optim
LEARNING_RATE = 0.001 # torch default 1e-3
optimizer = optim.Adam(model.parameters(),lr =LEARNING_RATE)
#optimizer = optim.Adam(model.parameters())

In [20]:
import random
import math
import time
import numpy as np
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [21]:
from tqdm.auto import tqdm
def train(epoch,model, train_loader, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    
    for i,(input_chords, output_events) in tqdm(enumerate(train_loader)):
        
        src = input_chords.transpose(0,1).to(device) # batch_first = False , [srce len, batch_size]
        trg = output_events.transpose(0,1).to(device)

        optimizer.zero_grad()

        output = model(src, trg)

        output_dim = output.shape[-1]

        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)

        #trg = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]

        loss = criterion(output, trg)
        #print(loss)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()
        
        if i%1000 == 0:    
            torch.save(model.state_dict(), './model_ckpt/0628_{}_epoch_{}_step.pt'.format(epoch,i))
            print(loss)
    return epoch_loss / len(train_loader)

In [22]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [23]:
criterion = nn.CrossEntropyLoss()

In [None]:
N_EPOCHS = 20
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(epoch,model, train_loader, optimizer, criterion, CLIP)
    #valid_loss = evaluate(model, valid_loader, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    '''
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    '''
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')

    #print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

0it [00:00, ?it/s]

tensor(5.1870, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(4.5828, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.5352, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.7954, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.0353, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.7155, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.4202, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.7704, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.4102, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.6113, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.5454, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.4917, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3454, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.6325, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.5225, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.5176, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: 01 | Time: 83m 30s
	Train Loss: 2.861 | Train PPL

0it [00:00, ?it/s]

tensor(3.3051, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.8298, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.5190, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2565, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.4716, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3856, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2965, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.5878, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1991, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3473, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.4324, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.4451, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2251, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3080, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2057, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.4019, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: 02 | Time: 84m 10s
	Train Loss: 2.444 | Train PPL

0it [00:00, ?it/s]

tensor(2.8262, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.6764, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.5563, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1365, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3637, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.4227, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0526, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.5114, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2119, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3100, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2379, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2599, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1210, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2229, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9787, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3733, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: 03 | Time: 83m 44s
	Train Loss: 2.375 | Train PPL

0it [00:00, ?it/s]

tensor(2.7733, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.5046, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.5125, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2857, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.6717, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3924, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1560, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3724, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1289, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3003, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1964, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1597, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0673, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1778, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0774, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3289, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: 04 | Time: 83m 57s
	Train Loss: 2.290 | Train PPL

0it [00:00, ?it/s]

tensor(2.8940, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3823, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3377, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2784, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.6788, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1208, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2920, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.4097, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9777, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3159, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2159, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2983, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9770, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2283, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9804, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2831, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: 05 | Time: 83m 38s
	Train Loss: 2.224 | Train PPL

0it [00:00, ?it/s]

tensor(2.9798, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3084, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2517, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1298, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2352, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0711, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2388, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3165, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9628, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3845, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1599, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0511, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9316, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1246, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9090, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1868, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: 06 | Time: 83m 44s
	Train Loss: 2.180 | Train PPL

0it [00:00, ?it/s]

tensor(2.7193, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2668, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2240, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2879, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1342, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0549, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0231, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1436, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9838, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.4044, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1094, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0961, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.8911, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0954, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1439, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1806, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: 07 | Time: 83m 38s
	Train Loss: 2.145 | Train PPL

0it [00:00, ?it/s]

tensor(2.6714, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2095, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2151, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0391, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1620, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9879, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9418, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2725, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9286, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2483, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0663, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0237, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9225, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9350, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0376, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1654, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: 08 | Time: 83m 45s
	Train Loss: 2.122 | Train PPL

0it [00:00, ?it/s]

tensor(3.0830, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3366, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3133, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1350, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1122, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9682, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.8670, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2347, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9322, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2813, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0561, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9588, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9432, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9938, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0735, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2549, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: 09 | Time: 83m 46s
	Train Loss: 2.104 | Train PPL

0it [00:00, ?it/s]

tensor(2.5501, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2604, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1598, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2615, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1248, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9875, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9368, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2199, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.7704, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0960, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0088, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9992, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.8998, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.8802, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0930, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1329, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: 10 | Time: 83m 37s
	Train Loss: 2.089 | Train PPL

0it [00:00, ?it/s]

tensor(2.5601, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2190, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3043, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0089, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1563, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9392, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.8377, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2744, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9564, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1922, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9405, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0125, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.8157, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9998, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3794, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1615, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: 11 | Time: 83m 53s
	Train Loss: 2.076 | Train PPL

0it [00:00, ?it/s]

tensor(2.5548, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2916, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.7393, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1704, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0557, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9827, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.8393, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2249, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.8431, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0504, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0444, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9140, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.8508, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9839, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0242, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2023, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: 12 | Time: 83m 47s
	Train Loss: 2.064 | Train PPL

0it [00:00, ?it/s]

tensor(2.4307, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1516, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2634, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9926, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9853, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.8936, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9599, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0746, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.8111, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0164, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0252, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0246, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.8051, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9890, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9885, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1242, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: 13 | Time: 84m 26s
	Train Loss: 2.054 | Train PPL

0it [00:00, ?it/s]

tensor(2.3122, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2463, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2279, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1001, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2043, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.8269, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9629, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0673, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.8284, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0394, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0568, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9488, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9513, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9696, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.8758, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1800, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: 14 | Time: 84m 15s
	Train Loss: 2.047 | Train PPL

0it [00:00, ?it/s]

tensor(3.2831, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2583, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1597, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9909, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0309, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0185, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.8548, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1620, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.7639, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2623, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0372, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0261, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.8539, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3314, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9987, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1661, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: 15 | Time: 84m 47s
	Train Loss: 2.039 | Train PPL

0it [00:00, ?it/s]

tensor(2.5910, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1391, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2577, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0617, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2630, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.8279, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.8528, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1881, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.8050, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0669, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0788, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9982, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.8174, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0064, device='cuda:0', grad_fn=<NllLossBackward>)
