In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator, TabularDataset
from torchtext.vocab import Vectors

from seqeval.metrics import f1_score
from seqeval.metrics import classification_report,accuracy_score,f1_score

import spacy
import numpy as np

import random
import math
import time
import os

In [2]:
spacy_en = spacy.load('en')

In [3]:
def tokenize_rev(text):
    return [tok.text for tok in spacy_en.tokenizer(text)][::-1]

def tokenize_forw(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [4]:
SRC = Field(tokenize = tokenize_forw, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

TRG = Field(tokenize = tokenize_forw, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

In [5]:
data_fields = [('ID', None), ('query', SRC), ('target', TRG)]
train_data = TabularDataset(path=os.path.join('s2s_train.csv'), format='csv', skip_header=True, fields=data_fields)
test_data = TabularDataset(path=os.path.join('s2s_test.csv'), format='csv', skip_header=True, fields=data_fields)

In [6]:
print(vars(train_data.examples[0]))
print(vars(train_data.examples[1]))

{'query': ['i', 'tend', 'to', 'agree', 'with', 'nyt', "'s", 'review', 'which', 'says', ':'], 'target': ['this', 'movie', 'is', 'brings', 'out', 'the', 'wholesome', ',', 'affirmative', 'side', 'of', 'the', 'hip', '-', 'hop', 'aesthetic', 'without', 'being', 'overly', 'preachy']}
{'query': ['i', 'agree', ',', 'it', "'s", 'one', 'of', 'the', 'gems', 'that', 'we', 'get', 'each', 'year', '.'], 'target': ['i', 'love', 'that', 'part', 'when', 'honey', 'gets', 'a', 'bank', 'loan', ',', 'and', 'puts', 'down', 'a', 'deposit', '.', 'what', "'s", 'your', 'favorite', '?']}


In [7]:
%ls

SRC.build_vocab(train_data, vectors=Vectors('Glove/glove.6B.100d.txt'))
TRG.build_vocab(train_data, vectors=Vectors('Glove/glove.6B.100d.txt'))

'Data _ Towards Exploiting Background Knowledge for Building Conversation Systems (EMNLP 2018).zip'
 [0m[01;34mGlove[0m/
 [01;34mholle[0m/
'Memory Network with Attention.ipynb'
 [01;34m__pycache__[0m/
 s2s_test.csv
 s2s_train.csv
 s2s_train_small.csv
 Seq2Seq-2.ipynb
 Seq2Seq.ipynb
 seq2seq.py
'Transform data from HollE to HuggingFace.ipynb'
 tut1-model.pt
 Untitled.ipynb


In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

device

device(type='cuda')

In [9]:
BATCH_SIZE = 16

train_data, valid_data = train_data.split(split_ratio=0.7)
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, sort=False, sort_within_batch=False,
    device=device)

In [10]:
from seq2seq-2 import Encoder, Decoder, Seq2Seq

INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 100
DEC_EMB_DIM = 100
HID_DIM = 100
N_LAYERS = 4
ENC_DROPOUT = 0.2
DEC_DROPOUT = 0.2

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT, SRC.vocab.vectors)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT, TRG.vocab.vectors)

model = Seq2Seq(enc, dec, device).to(device)

In [11]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(16304, 100)
    (rnn): LSTM(100, 100, num_layers=2, dropout=0.2, bidirectional=True)
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(23437, 100)
    (rnn): LSTM(100, 100, num_layers=4, dropout=0.2)
    (fc_out): Linear(in_features=100, out_features=23437, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
  )
)

In [12]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 7,067,637 trainable parameters


In [13]:
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                    mode='min', factor=0.5,
                                                    patience=1)

In [14]:
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [15]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    record_size = 0
    out_preds = []
    ground_preds = []
    flatten = lambda l: [item for sublist in l for item in sublist]
    
    for i, batch in enumerate(iterator):
        
        
        src = batch.query
        trg = batch.target

        optimizer.zero_grad()
        
        output = model(src, trg)
        
        record_size += trg.shape[1]
        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]
        out_preds.append(torch.transpose(torch.argmax(torch.softmax(output, dim=2), dim=2), 0,1).tolist())
        ground_preds.append(torch.transpose(trg, 0,1).tolist())
        
        output_dim = output.shape[-1]
        
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        
        #trg = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
        if i % 100 == 0:
            print('train_loss :', epoch_loss/record_size, 'records :', record_size)
            
    ground_preds = flatten(ground_preds)
    out_preds = flatten(out_preds)
    
    ground_tokens = []
    for ex in ground_preds:
        ground_tokens.append([TRG.vocab.itos[token] for token in ex])
        
    pred_tokens = []
    for ex in out_preds:
        pred_tokens.append([TRG.vocab.itos[token] for token in ex])
        
    return epoch_loss / record_size, f1_score(ground_tokens, pred_tokens)

In [16]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    record_size = 0
    out_preds = []
    ground_preds = []
    flatten = lambda l: [item for sublist in l for item in sublist]
    
    def devectorize(vec):
      idxs = torch.nonzero(vec)
      if len(idxs) == 0:
        return 0
      else:
        sm = torch.softmax(vec, dim=0)
        idx = sm.argmax(0)
    #     print(idx)
        return idx

    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.query
            trg = batch.target

            record_size += trg.shape[1]
            output = model(src, trg, 0) #turn off teacher forcing

            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]
            out_preds.append(torch.transpose(torch.argmax(torch.softmax(output, dim=2), dim=2), 0,1).tolist())
            ground_preds.append(torch.transpose(trg, 0,1).tolist())

            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            #trg = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]

            loss = criterion(output, trg)
            
            epoch_loss += loss.item()
            
            if i % 100 == 0:
                print('val_loss : ', epoch_loss/record_size, 'records :', record_size)

    ground_preds = flatten(ground_preds)
    out_preds = flatten(out_preds)
    
    ground_tokens = []
    for ex in ground_preds:
        ground_tokens.append([TRG.vocab.itos[token] for token in ex])
        
    pred_tokens = []
    for ex in out_preds:
        pred_tokens.append([TRG.vocab.itos[token] for token in ex])

    return epoch_loss / record_size, f1_score(ground_tokens, pred_tokens)

In [17]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [18]:
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss, train_f1 = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss, valid_f1 = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
        
    scheduler.step(valid_loss)
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f} | Train F1: {train_f1*100:.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f} | Val. F1: {valid_f1*100:.3f}')

train_loss : 0.6288149356842041 records : 16
train_loss : 0.45481122838388577 records : 1616
train_loss : 0.4321626645712117 records : 3216
train_loss : 0.42190233348215933 records : 4816
train_loss : 0.4159653073088487 records : 6416
train_loss : 0.41250308854375295 records : 8016
train_loss : 0.41002835375497027 records : 9616
train_loss : 0.40800518515956896 records : 11216
train_loss : 0.4063236153825243 records : 12816
train_loss : 0.4046390675810413 records : 14416
train_loss : 0.40322299261431355 records : 16016
train_loss : 0.4020212767992531 records : 17616
train_loss : 0.40071226335087984 records : 19216
train_loss : 0.3997367474319933 records : 20816
train_loss : 0.398749068836754 records : 22416
train_loss : 0.39795935775422003 records : 24016
val_loss :  0.37450674176216125 records : 16
val_loss :  0.3870857461254195 records : 1616
val_loss :  0.38812942973416836 records : 3216
val_loss :  0.3881105190495716 records : 4816
val_loss :  0.3883895394659399 records : 6416
val_

val_loss :  0.38342178653721787 records : 3216
val_loss :  0.3834229557221118 records : 4816
val_loss :  0.3838606479756553 records : 6416
val_loss :  0.38436240725174636 records : 8016
val_loss :  0.38456738084405906 records : 9616
Epoch: 07 | Time: 5m 40s
	Train Loss: 0.339 | Train PPL:   1.404 | Train F1: 7.590
	 Val. Loss: 0.385 |  Val. PPL:   1.469 | Train F1: 6.573
train_loss : 0.3199482858181 records : 16
train_loss : 0.33482715162900417 records : 1616
train_loss : 0.3372305149759226 records : 3216
train_loss : 0.3369194121653851 records : 4816
train_loss : 0.33666210437653366 records : 6416
train_loss : 0.3368671853504257 records : 8016
train_loss : 0.3366155875304376 records : 9616
train_loss : 0.33630806437232524 records : 11216
train_loss : 0.33646155400817906 records : 12816
train_loss : 0.33662181057358426 records : 14416
train_loss : 0.33635893291407654 records : 16016
train_loss : 0.33644561839038734 records : 17616
train_loss : 0.33669838942060065 records : 19216
train_

In [19]:
model.load_state_dict(torch.load('tut1-model.pt'))
model.to(device)
test_loss, test_f1 = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} | Test F1: {test_f1*100:.3f}')

val_loss :  0.40284645557403564 records : 16
val_loss :  0.3827825684358578 records : 1616
val_loss :  0.3851188661447212 records : 3216
| Test Loss: 0.388 | Test PPL:   1.474 | Test F1: 6.872


In [20]:
example_sentence = "i agree ,it 's one of the gems that we get each year ."
doc = spacy_en(example_sentence)

tokens = [token.text for token in doc]
input_example = SRC.process([tokens])
# print(input_example)
# print(SRC.vocab.itos[input_example[0][0]])
tgt = torch.zeros(50,dtype=torch.long)

tgt[0] = TRG.vocab.stoi[TRG.init_token]
tgt = tgt.unsqueeze(1)
cpu_mod = model.cpu()
output = cpu_mod(input_example, tgt, 0.).squeeze()
# print(output.shape)

def devectorize(vec):
  idxs = torch.nonzero(vec)
  if len(idxs) == 0:
    return 0
  else:
    sm = torch.softmax(vec, dim=0)
    idx = sm.argmax(0)
#     print(idx)
    return idx

outvec = [TRG.vocab.itos[devectorize(token)] for token in output]

sentence = []
for token in outvec:
  if token == TRG.eos_token:
    break
  sentence.append(token)

print(" ".join(sentence))

<unk> i , , the the the the ,
