In [1]:
import os
import sys
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import Dataset,DataLoader
from torchtext.legacy.datasets import Multi30k
from sklearn.model_selection import train_test_split
import math
import spacy
import pickle
import time
import copy
print(torch.__version__)
from torchtext.legacy.data import Field, BucketIterator, TabularDataset
from nltk.tokenize import word_tokenize


1.8.1


In [2]:
D_MODEL = 256
MAX_SEQ_LENGTH = 100
BATCH_SIZE = 32
Nx=3
N_HEADS = 4
EPOCH = 100

In [3]:
!python3 -m spacy download en
!python3 -m spacy download it

In [4]:

t0 = time.time()
spacy_italian = spacy.load("it_core_news_sm")
spacy_english = spacy.load("en_core_web_sm")

def tokenize_italian(text):
    return [token.text for token in spacy_italian.tokenizer(text)]

def tokenize_english(text):
    return [token.text for token in spacy_english.tokenizer(text)]

italian = Field(tokenize=tokenize_italian, lower=True,
               init_token="<sos>", eos_token="<eos>")

english = Field(tokenize=tokenize_english, lower=True,
               init_token="<sos>", eos_token="<eos>")




eng_file_path = "/Users/stephone_christian/Downloads/it-en/europarl-v7.it-en.en"
it_file_path = "/Users/stephone_christian/Downloads/it-en/europarl-v7.it-en.it"

europarl_en = open(eng_file_path, encoding='utf-8').read().split('\n')
europarl_it = open(it_file_path, encoding='utf-8').read().split('\n')


raw_data = {'English' : [line for line in europarl_en], 'Italian': [line for line in europarl_it]}

df = pd.DataFrame(raw_data, columns=["English", "Italian"])
train, test = train_test_split(df, test_size=0.3, random_state=42)


fields = [('English', english), ('Italian', italian)]



data_path = "/Users/stephone_christian/Downloads/it-en/data"
train.to_csv(data_path + "/train.csv", index=False)
test.to_csv(data_path + "/test.csv", index=False)

t1 = time.time()
print(f'time elapsed in minutes, {(t1 - t0) / 60}')






train_data, test_data = TabularDataset.splits( path=data_path,
                                 train="train.csv",
                                 test="test.csv",
                                 format="csv",
                                 fields=fields,
                                 skip_header=True)


t2 = time.time()
print(f'time elapsed in seconds after Tab dataset {(t2 - t1) / 60}')

italian.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)

#bucketIterator
train_iterator, test_iterator = BucketIterator.splits((train_data, test_data), 
                                                                      batch_size = BATCH_SIZE, 
                                                                      sort_within_batch=True,
                                                                      sort_key=lambda x: len(x.Italian))


print(f"Unique tokens in source (it) vocabulary: {len(italian.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(english.vocab)}")


t3 = time.time()
print(f'time elapsed in seconds after build_vocab + BucketIterator {(t3 - t2) / 60}')





time elapsed in minutes, 0.4317539652188619
time elapsed in seconds after Tab dataset 4.983298150698344
Unique tokens in source (it) vocabulary: 10004
Unique tokens in target (en) vocabulary: 10004
time elapsed in seconds after build_vocab + BucketIterator 0.23890841404596966


In [5]:
#let's just use spacy and use a preprocessed dataset. In future I can come back
#train_iterator.create_batches()
for x in train_iterator:
    
    print(x.English.shape)
    break


torch.Size([31, 32])


In [6]:
#PositionalEncoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_length=5000):
        super(PositionalEncoding, self).__init__()
        
        self.d_model = d_model
        pe = torch.zeros(max_length,d_model, dtype=torch.float32)
        for pos in range(max_length):
            for i in range(0, d_model-1, 2):
                pe[pos, i] = math.sin(pos / (float(10000)**(2*i / d_model)))
                pe[pos, i+1] = math.cos(pos / (float(10000)**(2*i / d_model)))
                
        pe = pe.unsqueeze(0)
                
        self.register_buffer("pe", pe)
                
    def forward(self, x):
        x *= math.sqrt(self.d_model)
        x += self.pe[:,:x.shape[1]]

        return x
    
    
#embedding layer
class EmbeddingLayer(nn.Module):
    def __init__(self, d_model, vocab_size):
        super(EmbeddingLayer, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, d_model)
        
    def forward(self, x):
        return self.embedding(x)
    
        
#layerNormalization add and norm ( x = x + sublayer)
class LayerNormalization(nn.Module):
    def __init__(self, d_model=D_MODEL):
        super(LayerNormalization, self).__init__()
        
        self.eps = 1e-6
        self.size = d_model
        
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))
        
    def forward(self, x):
        x_mean = x.mean(dim=-1, keepdim =True)
        x_std = x.std(dim=-1, keepdim=True)
        
        out = self.alpha* ((x - x_mean) / (x_std + self.eps)) + self.bias
        return out
        
#blue subbox in the paper #check the sizing
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=2048, dropout = 0.1):
        super().__init__() 
        # We set d_ff as a default to 2048
        self.fc_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.fc_2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        out = self.fc_1(x)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc_2(out)
        return out
    
    
def attention(q, k, v, d_k,  mask=None, dropout=None):
    
    
    #print("attention:", q.shape)
    #print("attention:", k.transpose(-2, -1).shape)
    
    scores = torch.matmul(q, k.transpose(-2, -1)) /  math.sqrt(d_k)
    
    #print("scores shape:", scores.shape)
    if mask != None:
        #print("mask shape before unsqueeze:",mask.shape)
        mask = mask.unsqueeze(1)
        #print("mask shape after unsqueeze:", mask.shape)
        #print(score.shape)
        scores = scores.masked_fill(mask == 0, -1e9)
    scores = torch.nn.functional.softmax(scores, dim=-1)

    if dropout is not None:
        scores = dropout(scores)

    output = torch.matmul(scores, v)
    return output  
    

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, drop_out=0.1):
        super().__init__()
        
        self.Q = nn.Linear(d_model, d_model)
        self.K = nn.Linear(d_model, d_model)
        self.V = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(drop_out) 
        self.out = nn.Linear(d_model, d_model)
        
        self.h = N_HEADS
        self.d_k = d_model // N_HEADS
        self.d_model = d_model
        
        
    def forward(self, q, k, v, mask=None):
        
        bs = q.size(0) #batch size
        
        # perform linear operation and split into h heads
        k = self.K(k).view(bs, -1, self.h, self.d_k)
        q = self.Q(q).view(bs, -1, self.h, self.d_k)
        v = self.V(v).view(bs, -1, self.h, self.d_k)
        
        # transpose to get dimensions bs * h * sl * d_model
       
        k = k.transpose(1,2)
        q = q.transpose(1,2)
        v = v.transpose(1,2)
# calculate attention using function we will define next
        scores = attention(q, k, v, self.d_k, mask, self.dropout)
        
        # concatenate heads and put through final linear layer
        concat = scores.transpose(1,2).contiguous()\
        .view(bs, -1, self.d_model)
        
        output = self.out(concat)
    
        return output


        
        
        
def model_blocks(module, N=Nx):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])


class EncoderBlock(nn.Module):
    def __init__(self, d_model, drop_out=0.1):
        super().__init__()
    
        self.norm_1 = LayerNormalization()
        self.ff = FeedForward(d_model)
        #self.pe = PositionalEncoding(d_model, MAX_SEQ_LEN)
        self.attn = MultiHeadAttention(d_model, drop_out)
        self.norm_2 = LayerNormalization()
        self.drop_1 = nn.Dropout(drop_out)
        self.drop_2 = nn.Dropout(drop_out)
            
    def forward(self, x, mask=None):
        #forward(q, k, v, mask=None)
        #print("MADE TO ENCODER!!!!!!!!!")
        x_attn = self.drop_1(self.attn(x, x, x, mask))
        x_out_1 = self.norm_1(x+x_attn)
        out = self.norm_2(self.drop_2(self.ff(x_out_1)) + x_out_1)
        return out
        
        

class DecoderBlock(nn.Module):
    def __init__(self, d_model, drop_out=0.1):
        super().__init__()
        self.norm_1 = LayerNormalization()
        self.norm_2 = LayerNormalization()
        self.norm_3 = LayerNormalization()
        self.ff     = FeedForward(d_model)
        self.drop_1 = nn.Dropout(drop_out)
        self.drop_2 = nn.Dropout(drop_out)
        self.drop_3 = nn.Dropout(drop_out)
        
        self.att_1 = MultiHeadAttention(d_model)
        self.att_2 = MultiHeadAttention(d_model)
        
        
    def forward(self, x, e_outputs,  src_msk, trg_msk):
        out = self.drop_1(self.att_1(x, x, x, trg_msk))
        x2 = self.norm_1(x + out)
        x3 = self.drop_2(self.att_2(x2, e_outputs, e_outputs, src_msk))
        
        out = self.norm_2(x3 + x2)
        
        x4 = self.drop_3(self.ff(out))
        out = self.norm_3(x4 + out)
        
        return out
        
        
class Decoder(nn.Module):
    def __init__(self, d_model, vocab):
        super(Decoder, self).__init__()
        
        self.emb = EmbeddingLayer(d_model, vocab)
        self.pos = PositionalEncoding(d_model)
        self.dec_blocks = model_blocks(DecoderBlock(d_model), Nx)
        
        #self.norm_1 = LayerNormalization()
        #self.softmax = nn.Softmax() #might not need this
        
        
    def forward(self, src, e_outputs, src_msk, trg_msk):
        x = self.emb(src)
        x = self.pos(x)
        
        for layer in self.dec_blocks:
            x = layer(x, e_outputs, src_msk, trg_msk) # add e_output to this
            
        #x = self.norm_1(x)
        return x

class Encoder(nn.Module):
    def __init__(self, d_model, vocab):
        super(Encoder, self).__init__()
        
        self.emb = EmbeddingLayer(d_model, vocab)
        self.pos = PositionalEncoding(d_model)
        self.enc_blocks = model_blocks(EncoderBlock(d_model), Nx)
        
    def forward(self, src, msk):
        out = self.emb(src)
        out = self.pos(out)
        
        for layer in self.enc_blocks:
            out = layer(out, msk)
        return out
    
    
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, trg_vocab_size, d_model):
        super().__init__()
        self.encoder = Encoder(d_model, src_vocab_size)
        self.decoder = Decoder(d_model, trg_vocab_size)
        self.fc = nn.Linear(d_model, trg_vocab_size)
        
    def forward(self, src, trg, src_msk, trg_msk):
        e_outputs = self.encoder(src, src_msk)
        out = self.decoder(trg, e_outputs, src_msk, trg_msk)
        out = self.fc(out)
        return out
    
        
    
    
    
    
    
        
        
        
        
        
        
    

In [8]:
#create optimizer

LEARNING_RATE = 1e-4

src_vocab_size = len(english.vocab)
trg_vocab_size = len(italian.vocab)
model = Transformer(src_vocab_size, trg_vocab_size, D_MODEL)

criterion = torch.nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE, betas=(0.9, 0.98), eps= 1e-9)



#initialize parameters with xavier initialization

for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)




In [9]:
def create_src_mask(src):
    input_pad = english.vocab.stoi['<pad>']
    # creates mask with 0s wherever there is padding in the input
    src_msk = (src != input_pad).unsqueeze(1)
    
    return src_msk


def create_trg_mask(trg):
    input_pad = italian.vocab.stoi['<pad>']
    trg_msk = (trg != input_pad).unsqueeze(1)
    #print("trg_mask shape", trg_msk.shape)
    
    size = trg.size(1) # get seq_len for matrix
    #print("trg size", size)
    nopeak_mask = np.triu(np.ones((1, size, size)),
    k=1)

    nopeak_mask = torch.from_numpy(nopeak_mask) == 0
    
    trg_msk = trg_msk & nopeak_mask
    return trg_msk

In [None]:
#with a batch size of 32, there are ~62,500 updated per epoch :)
model_path_root = "/Users/stephone_christian/Downloads/it-en/data/model_checkpoints/"
loss_write_path = "/Users/stephone_christian/Downloads/it-en/data/losses/loss.txt"

f = open(loss_write_path, "w")
f.close() #clean the file



checkpoint = torch.load(model_path_root + "3_30000")
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']
print(epoch)
print("loss:", loss)
model.train()

t0 = time.time()
for epoch in range(EPOCH):
    i = 0
    losses = []
    time_prev = time.time()
    for data in train_iterator:
        #losses.append(2)
        
        
        
        
        src = data.English
        trg = data.Italian
        
        #create the masks!
        
        
        #reshape dimensions to fit the model
        src = src.transpose(0,1) #transpose columns and rows, leave batch dimension alone
        trg = trg.transpose(0, 1)
        #print("src shape", src.shape)
        #print("trg shape", trg.shape)
        trg_input = trg[:,:-1]
        targets = trg[:, 1:].contiguous().view(-1)
        
        src_mask = create_src_mask(src)
        #print("src mask shape", src_mask.shape)
        trg_mask = create_trg_mask(trg_input)
        
        y_pred = model(src, trg_input, src_mask, trg_mask)
        
        loss = criterion(y_pred.view(-1, y_pred.size(-1)), targets)
        
        loss.backward()
        
        optimizer.step()
        
        optimizer.zero_grad()
        losses.append(loss.item())
        
        
        
        if i % 100 == 0: #every 100 steps
            #pass
            print(f'step {i+1} of epoch {epoch}, loss: {loss.item():.6f}, elapsed time 1k steps (min) {((time.time() - time_prev) / 60):.5f}, total elapsed time (h) {((time.time() - t0) / 3600):.5f}')
            time_prev = time.time()
        
        #if i % 1000 == 0:
            
                    
        if i % 5000 == 0:
            #checkpoint the model
            torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss.item(),
            }, model_path_root + str(epoch+2) + "_" + str(i))
        i +=1
            
        
        
    with open(loss_write_path, 'a') as f:
        for l in losses:
            f.write(str(l) + "\n")

In [11]:
model = Transformer(src_vocab_size, trg_vocab_size, D_MODEL)

checkpoint = torch.load(model_path_root + "2_5000")
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']
print(epoch)
print("loss:", loss)
model.eval()


0
loss: 1.7252687215805054


Transformer(
  (encoder): Encoder(
    (emb): EmbeddingLayer(
      (embedding): Embedding(10004, 256)
    )
    (pos): PositionalEncoding()
    (enc_blocks): ModuleList(
      (0): EncoderBlock(
        (norm_1): LayerNormalization()
        (ff): FeedForward(
          (fc_1): Linear(in_features=256, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (fc_2): Linear(in_features=2048, out_features=256, bias=True)
          (relu): ReLU()
        )
        (attn): MultiHeadAttention(
          (Q): Linear(in_features=256, out_features=256, bias=True)
          (K): Linear(in_features=256, out_features=256, bias=True)
          (V): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (out): Linear(in_features=256, out_features=256, bias=True)
        )
        (norm_2): LayerNormalization()
        (drop_1): Dropout(p=0.1, inplace=False)
        (drop_2): Dropout(p=0.1, inplace=False)
  

In [64]:
def translate(model, src, max_len = 80):
    
    model.eval()
    input_pad = english.vocab.stoi['<pad>']
    
    #print("Here")
    src = tokenize_english(src)
    sentence= torch.LongTensor([[english.vocab.stoi[tok] for tok
    in src]])
    src_mask = (sentence != input_pad).unsqueeze(-2)
    e_outputs = model.encoder(sentence, src_mask)

    outputs = torch.zeros(max_len, dtype=torch.long) #.type_as(src.data)
    #print(type(outputs))
    outputs[0] = torch.LongTensor([italian.vocab.stoi['<sos>']])
        
    for i in range(1, max_len):    

        trg_mask = np.triu(np.ones((1, i, i)),k=1) #.astype('uint8')
        trg_mask = torch.from_numpy(trg_mask) == 0

        out = model.fc(model.decoder(outputs[:i].unsqueeze(0),
        e_outputs, src_mask, trg_mask))
        out = torch.nn.functional.softmax(out, dim=-1)
        val, ix = out[:, -1].data.topk(1)

        outputs[i] = ix[0][0]
        if ix[0][0] == italian.vocab.stoi['<eos>']:
            break
    return ' '.join(
        [italian.vocab.itos[ix] for ix in outputs[:i]]
        )

In [65]:
print(translate(model, "I cannot agree to that, this is wrong <eos>" ))

<sos> non può essere d' accordo , questo è sbagliato <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>
