# Dataset

In [1]:
import torch
import torch.nn as nn

In [2]:
# 1. GPU configuration
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [5]:
import pickle5 as pickle
from tqdm.auto import tqdm
from torch.utils.data.dataset import Dataset
class MelodyDataset(Dataset):
    def __init__(self, input_data_path, event_dict_path, chord_dict_path, data_type):
        ## -- event dict
        self.event_dict_path = event_dict_path
        self.event2word, self.word2event = pickle.load(open(self.event_dict_path, 'rb'))
        self.event2word['End'] = 308 # cf. 'bar_None' which is the start point of melody => 0 
        self.word2event[308] = 'End'
        self.data_type = data_type
        
        ## -- chord dict
        self.chord_dict_path = chord_dict_path
        with open(self.chord_dict_path, 'rb') as handle:
            self.chord2idx = pickle.load(handle)
        #self.chord2idx = pickle.load(open(self.chord_dict_path, 'rb'))
        
        ## -- input data pair
        self.input_data_path = input_data_path
        with open(self.input_data_path, 'rb') as handle:
            self.all_chord_event_list = pickle.load(handle)
        self.data_num = len(self.all_chord_event_list)
        ## -- chord, event -> idx
        for chord_event_dict in tqdm(self.all_chord_event_list):
            words = []

            ## -- event to idx
            for event in chord_event_dict['Events']:
                e = '{}_{}'.format(event.name, event.value)
                if e in self.event2word:
                    words.append(self.event2word[e])
                else:
                    # OOV
                    if event.name == 'Note Velocity':
                        # replace with max velocity based on our training data
                        words.append(self.event2word['Note Velocity_21'])
                    else:
                        # something is wrong
                        # you should handle it for your own purpose
                        print('something is wrong! {}'.format(e))
            words.append(self.event2word['End'])
            chord_event_dict['Events'] = words

            ## -- chord to idx
            chord_words = []
            for chord in chord_event_dict['Chord']:
                chord_words.append(self.chord2idx[chord])
            chord_event_dict['Chord'] = chord_words
        
        ## -- split train, valid, test
        if data_type == "train":
            self.data_list = self.all_chord_event_list[:int(self.data_num*0.9)]
        elif data_type == "valid":
            self.data_list = self.all_chord_event_list[int(self.data_num*0.9):]

    
    def __len__(self):
        return len(self.data_list)
        
    def __getitem__(self,idx):
        input_chords = self.data_list[idx]['Chord']
        output_events = self.data_list[idx]['Events']
        return torch.tensor(input_chords), torch.tensor(output_events)
        

In [6]:
## generate dataset
input_data_path = "./data/all_chord_1_bars.pkl"
event_dict_path =  "./data/event_dictionary.pkl"
chord_dict_path = "./data/chord2idx_dict.pkl"
train_dataset = MelodyDataset(input_data_path, event_dict_path, chord_dict_path,"train")
valid_dataset = MelodyDataset(input_data_path, event_dict_path, chord_dict_path,"valid")

  0%|          | 0/69707 [00:00<?, ?it/s]

  0%|          | 0/69707 [00:00<?, ?it/s]

In [39]:
input_chords, output_events = next(iter(train_dataset))
output_events

tensor([  0,   1,   3, 191,   1,  42,  38,  74,   1,  16,  98,  75,   5,   3,
        136,   5,  52,  91,  69,   6, 116,   6,   3, 191,   6,  42,  36, 117,
          8,   3, 136,   8,  19,  24,  94, 308])

In [38]:
## pack_collate()
from torch.nn.utils.rnn import pack_sequence, PackedSequence
from torch.nn.utils.rnn import pad_packed_sequence
def pack_collate(raw_batch:list):
    input_chords = [x[0] for x in raw_batch]
    output_events = [x[1] for x in raw_batch]
    packed_input_chords = pack_sequence(input_chords, enforce_sorted=False)
    packed_output_events = pack_sequence(output_events, enforce_sorted=False)
    
    return packed_input_chords, packed_output_events

In [37]:
## generate dataloader
from torch.utils.data import DataLoader
BATCH_SIZE = 64
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,collate_fn=pack_collate, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE,collate_fn=pack_collate, shuffle=True)

input_chords, output_events = next(iter(train_loader))
input_chords


PackedSequence(data=tensor([11,  7,  0, 16, 12, 28,  1,  4, 17, 12, 24, 21,  0,  7,  4, 55, 16, 11,
        17, 19,  3, 41,  7, 28,  7, 20,  6, 32, 20, 23, 17, 19, 17, 18, 13,  5,
        13,  9,  7, 18,  7,  9, 27,  8, 24, 27, 24, 17,  3, 19,  8,  8, 11, 17,
         4, 24, 20, 18, 16,  1, 21, 27, 20, 23, 10, 11, 23, 17, 29, 21,  1,  4,
        17, 12,  4, 29]), batch_sizes=tensor([64, 12]), sorted_indices=tensor([27, 20, 43, 15, 26, 53, 54, 28,  7, 40, 38, 33, 41, 48, 39, 42, 37, 44,
        45, 46, 36, 47, 32, 49, 50, 51, 52, 55, 56, 57, 58, 59, 60, 61, 62, 63,
        16,  1,  2,  3,  4,  5,  6,  8,  9, 10, 11, 12, 13, 14, 35, 17, 18, 19,
        21, 22, 23, 24, 25, 29, 30, 31,  0, 34]), unsorted_indices=tensor([62, 37, 38, 39, 40, 41, 42,  8, 43, 44, 45, 46, 47, 48, 49,  3, 36, 51,
        52, 53,  1, 54, 55, 56, 57, 58,  4,  0,  7, 59, 60, 61, 22, 11, 63, 50,
        20, 16, 10, 14,  9, 12, 15,  2, 17, 18, 19, 21, 13, 23, 24, 25, 26,  5,
         6, 27, 28, 29, 30, 31, 32, 33, 34

# Model

In [61]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        
        #src = [src len, batch size]
        
        embedded = self.dropout(self.embedding(src))
        
        #embedded = [src len, batch size, emb dim]
        
        outputs, (hidden, cell) = self.rnn(embedded)
        
        #outputs = [src len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #outputs are always from the top hidden layer
        
        return hidden, cell
    

In [62]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
        
        self.fc_out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, cell):
        
        #input = [batch size]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #n directions in the decoder will both always be 1, therefore:
        #hidden = [n layers, batch size, hid dim]
        #context = [n layers, batch size, hid dim]
        
        input = input.unsqueeze(0)
        
        #input = [1, batch size]
        
        embedded = self.dropout(self.embedding(input))
        
        #embedded = [1, batch size, emb dim]
                
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        
        #output = [seq len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #seq len and n directions will always be 1 in the decoder, therefore:
        #output = [1, batch size, hid dim]
        #hidden = [n layers, batch size, hid dim]
        #cell = [n layers, batch size, hid dim]
        
        prediction = self.fc_out(output.squeeze(0))
        
        #prediction = [batch size, output dim]
        
        return prediction, hidden, cell

In [63]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        
        #src = [src len, batch size]
        #trg = [trg len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        #last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden, cell = self.encoder(src)
        
        #first input to the decoder is the <sos> tokens
        input = trg[0,:]
        
        for t in range(1, trg_len):
            
            #insert input token embedding, previous hidden and previous cell states
            #receive output tensor (predictions) and new hidden and cell states
            output, hidden, cell = self.decoder(input, hidden, cell)
            
            #place predictions in a tensor holding predictions for each token
            outputs[t] = output
            
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            
            #get the highest predicted token from our predictions
            top1 = output.argmax(1) 
            
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = trg[t] if teacher_force else top1
        
        return outputs

In [64]:
'''
INPUT_DIM = 60
ENC_EMB_DIM = 16

OUTPUT_DIM = 309
DEC_EMB_DIM = 32

HID_DIM = 512
N_LAYERS = 2
'''
INPUT_DIM = 60
ENC_EMB_DIM = 32

OUTPUT_DIM = 309
DEC_EMB_DIM = 64

HID_DIM = 512
N_LAYERS = 3

ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device).to(device)

In [65]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(60, 32)
    (rnn): LSTM(32, 512, num_layers=3, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(309, 64)
    (rnn): LSTM(64, 512, num_layers=3, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=309, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [66]:
input_chords, output_events = next(iter(train_dataset))
output_events

tensor([  0,   1, 116,   1,   9, 256, 109,  23,  98,  69, 109,  37,  91, 114,
        109,  23, 100,  69, 109,  12,  40,  75,   5,   9, 172,   5,  16,  25,
        114,   5,  16,  32,  71,   5,  62,  34, 104,   6,   9, 221,   8,   9,
        181,   0,   1, 116,   1,   9, 106,   1,  52,  17,  47,   1,  42,  43,
        127,   1,  12,  89,  47,   1,  23,  22, 112,   1,  16,  40,  47,   1,
         23,  32,  54,   5,   9, 255,   6,   9, 181,   8,   9, 173,   0,   1,
        111,   1,   9, 194,   1,  52,  88,  64,   1,  16,  89,  74,   1,  19,
         40,  74,   1,  28,  17,  47,   1,  19,  25,  47,   1,  52,  32, 250,
          5,   9, 136,   6,   9, 217,   8,   9, 191,   0,   1, 283,   1,   9,
        194,   1,  12, 184, 201,   1,  35,  91, 201,   1,  23,  38, 126,   1,
         23, 140,  14,   1,  52,  25, 201,   1,  52,  17, 279,   5,   9, 175,
          6, 283,   6,   9, 194, 120,  52,  34, 127,   8,   9, 172,   8,  12,
        161,  53,  80,  16,  29,  44,  60,  35, 161, 130, 308])

# Train

In [67]:
import torch.optim as optim
LEARNING_RATE = 0.001 # torch default 1e-3
optimizer = optim.Adam(model.parameters(),lr =LEARNING_RATE)
#optimizer = optim.Adam(model.parameters())

In [68]:
import random
import math
import time
import numpy as np
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [69]:
from tqdm.auto import tqdm
def train(epoch,model, train_loader, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    
    for i,(input_chords, output_events) in tqdm(enumerate(train_loader)):
        
        src = input_chords.transpose(0,1).to(device) # batch_first = False , [srce len, batch_size]
        trg = output_events.transpose(0,1).to(device)

        optimizer.zero_grad()

        output = model(src, trg)

        output_dim = output.shape[-1]

        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)

        #trg = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]

        loss = criterion(output, trg)
        #print(loss)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()
        
        if i%1000 == 0:    
            torch.save(model.state_dict(), './model_curri/1bar_{}_epoch_{}_step.pt'.format(epoch,i))
            print(loss)
    return epoch_loss / len(train_loader)

In [70]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [71]:
criterion = nn.CrossEntropyLoss()

In [72]:
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(epoch,model, train_loader, optimizer, criterion, CLIP)
    #valid_loss = evaluate(model, valid_loader, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    '''
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    '''
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')

    #print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

0it [00:00, ?it/s]

tensor(5.7374, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(4.2679, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.9909, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.5159, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.1460, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.8760, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.9587, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.6128, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.0954, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.4863, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.4002, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.4078, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: 01 | Time: 81m 40s
	Train Loss: 2.880 | Train PPL:  17.817


0it [00:00, ?it/s]

tensor(3.5459, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3458, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.5219, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3251, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.6994, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.6167, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.5664, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.6276, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.9235, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.5642, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3943, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3304, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: 02 | Time: 81m 36s
	Train Loss: 2.499 | Train PPL:  12.170


0it [00:00, ?it/s]

tensor(2.9262, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3037, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3598, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2940, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.6051, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.5471, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3792, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.5662, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.9733, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2038, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2722, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.4140, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: 03 | Time: 81m 24s
	Train Loss: 2.417 | Train PPL:  11.210


0it [00:00, ?it/s]

tensor(3.3071, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0964, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3695, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1489, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.4682, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2683, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3145, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.6616, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.9161, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0778, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2733, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2330, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: 04 | Time: 80m 19s
	Train Loss: 2.340 | Train PPL:  10.380


0it [00:00, ?it/s]

tensor(3.0669, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9977, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3886, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0771, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.7902, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0482, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.4439, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2891, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.7669, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0998, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1102, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2514, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: 05 | Time: 75m 31s
	Train Loss: 2.280 | Train PPL:   9.772


0it [00:00, ?it/s]

tensor(2.9205, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2294, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2312, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9601, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2677, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0613, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1552, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2982, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.6396, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9642, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9248, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1529, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: 06 | Time: 75m 33s
	Train Loss: 2.237 | Train PPL:   9.364


0it [00:00, ?it/s]

tensor(2.6693, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0362, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1762, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9418, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.4187, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1459, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3737, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2445, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.5857, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9025, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.8681, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1297, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: 07 | Time: 75m 52s
	Train Loss: 2.203 | Train PPL:   9.048


0it [00:00, ?it/s]

tensor(2.5425, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9877, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0987, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0130, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2420, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0719, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2549, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1939, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.4704, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.8927, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.8956, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1586, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: 08 | Time: 78m 44s
	Train Loss: 2.174 | Train PPL:   8.794


0it [00:00, ?it/s]

tensor(2.7664, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9524, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.4504, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9183, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.6078, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0670, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2809, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1650, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.5250, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9312, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.8694, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0319, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: 09 | Time: 78m 36s
	Train Loss: 2.152 | Train PPL:   8.602


0it [00:00, ?it/s]

tensor(2.9944, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0627, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0515, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.8772, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3947, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0513, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0738, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2447, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.5042, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0271, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.8924, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0879, device='cuda:0', grad_fn=<NllLossBackward>)
Epoch: 10 | Time: 78m 4s
	Train Loss: 2.135 | Train PPL:   8.459
