In [1]:
import numpy as np

In [2]:
import pathlib
path_to_file = pathlib.Path('./dataset/deu-eng/deu.txt')

In [3]:
def load_data(path):
    text = path.read_text(encoding='utf-8')
    lines = text.splitlines()
    pairs = [line.split('\t') for line in lines]

    context = np.array([context for target, context,attr in pairs])
    target = np.array([target for target, context,attr in pairs])

    return target, context

In [4]:
target_raw, context_raw = load_data(path_to_file)
print(context_raw[-1])

Ohne Zweifel findet sich auf dieser Welt zu jedem Mann genau die richtige Ehefrau und umgekehrt; wenn man jedoch in Betracht zieht, dass ein Mensch nur Gelegenheit hat, mit ein paar hundert anderen bekannt zu sein, von denen ihm nur ein Dutzend oder weniger nahesteht, darunter höchstens ein oder zwei Freunde, dann erahnt man eingedenk der Millionen Einwohner dieser Welt leicht, dass seit Erschaffung ebenderselben wohl noch nie der richtige Mann der richtigen Frau begegnet ist.


In [5]:
import torchdata.datapipes as dp
import torchtext.transforms as T
import spacy
from torchtext.vocab import build_vocab_from_iterator



In [6]:
eng = spacy.load('en_core_web_sm')
de = spacy.load('de_core_news_sm')

In [7]:
FILE_PATH = './dataset/deu-eng/deu.txt'

data_pipe = dp.iter.IterableWrapper([FILE_PATH])
data_pipe = dp.iter.FileOpener(data_pipe,mode='rb')
data_pipe = data_pipe.parse_csv(skip_lines=0,delimiter='\t',as_tuple=True)

In [8]:
for sample in data_pipe:
    print(sample)
    break

('Go.', 'Geh.', 'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8597805 (Roujin)')


In [9]:
def removeAttribution(row):
    return row[:2]

data_pipe = data_pipe.map(removeAttribution)

In [10]:
for sample in data_pipe:
    print(sample)
    break

('Go.', 'Geh.')


In [11]:
def engTokenize(text):
    """Tokenize an English text and return a list of tokens"""
    return [token.text for token in eng.tokenizer(text)]

def deTokenize(text):
    """Tokenizer a German text and return a list of tokens"""
    return [token.text for token in de.tokenizer(text)]

In [12]:
print(engTokenize("Have a good day !!!"))

['Have', 'a', 'good', 'day', '!', '!', '!']


In [13]:
print(deTokenize("Haben Sie einen guten Tag!!!"))

['Haben', 'Sie', 'einen', 'guten', 'Tag', '!', '!', '!']


In [14]:
def getTokens(data_iter, place):
    for english, german in data_iter:
        if place == 0:
            yield engTokenize(english)
        else:
            yield deTokenize(german)


In [15]:
source_vocab = build_vocab_from_iterator(
    getTokens(data_pipe,0),
    min_freq=2,
    specials=['<pad>','<sos>','<eos>','<unk>'],
    special_first=True
)
source_vocab.set_default_index(source_vocab['<unk>'])

In [16]:
target_vocab = build_vocab_from_iterator(
    getTokens(data_pipe,1),
    min_freq=2,
    specials= ['<pad>', '<sos>', '<eos>', '<unk>'],
    special_first=True
)
target_vocab.set_default_index(target_vocab['<unk>'])

In [17]:
print(source_vocab.get_itos()[:9])

['<pad>', '<sos>', '<eos>', '<unk>', '.', 'I', 'Tom', 'to', 'you']


In [18]:
len(source_vocab.get_itos()),len(target_vocab.get_itos())

(13610, 24266)

In [19]:
def getTransform(vocab):
    text_transform = T.Sequential(
        T.VocabTransform(vocab=vocab),
        T.AddToken(1, begin=True),
        T.AddToken(2,begin=False)
    )
    return text_transform

In [20]:
temp_list = list(data_pipe)
some_sentence = temp_list[798][0]
print('Some sentence=',end="")
print(some_sentence)
transformed_sentence = getTransform(source_vocab)(engTokenize(some_sentence))
print("Transformed sentence",end="")
print(transformed_sentence)
index_to_string = source_vocab.get_itos()
for index in transformed_sentence:
    print(index_to_string[index],end='')

Some sentence=I changed.
Transformed sentence[1, 5, 510, 4, 2]
<sos>Ichanged.<eos>

In [21]:
def applyTransform(sequence_pair):
    return (
        getTransform(source_vocab)(engTokenize(sequence_pair[0])),
        getTransform(target_vocab)(deTokenize(sequence_pair[1]))
    )
data_pipe = data_pipe.map(applyTransform)
temp_list = list(data_pipe)
print(temp_list[0])

([1, 617, 4, 2], [1, 743, 4, 2])


In [22]:
def sortBucket(bucket):
    return sorted(bucket,key=lambda x:(len(x[0]),len(x[1])))

In [23]:
data_pipe = data_pipe.bucketbatch(
    batch_size = 4, batch_num=5,bucket_num=1,
    use_in_batch_shuffle=False,sort_key=sortBucket
)


In [24]:
print(list(data_pipe)[0])

[([1, 5, 837, 195, 2], [1, 7, 22, 913, 24, 2]), ([1, 5, 837, 4, 2], [1, 7, 22, 913, 4, 2]), ([1, 2119, 107, 195, 2], [1, 4732, 5, 1364, 24, 2]), ([1, 5, 1295, 4, 2], [1, 7, 22, 31, 1473, 4, 2])]


In [25]:
def separateSourceTarget(data):
    sources = []
    target = []
    for src,tgt in data:
        sources.append(src)
        target.append(tgt)
    return sources, target

In [26]:
data_pipe = separateSourceTarget(data_pipe)

ValueError: too many values to unpack (expected 2)

In [None]:
print(data_pipe[0][1])

[1, 3030, 4, 2]


In [None]:
sentence = data_pipe[0][543]
source_index_to_string = source_vocab.get_itos()
for index in sentence:
    print(source_index_to_string[index],end='')

<sos>KissTom.<eos>

In [None]:
sentence = data_pipe[1][543]
target_index_to_string = target_vocab.get_itos()
for index in sentence:
    print(target_index_to_string[index],end='')

<sos>KüsseTom!<eos>

In [None]:
def applyPadding(sentences):
    return T.ToTensor(0)(sentences)

In [None]:
src_pad = applyPadding(data_pipe[0])
tgt_pad = applyPadding(data_pipe[1])

In [45]:
dataset = []
for i in range(len(src_pad)):
    dataset.append((src_pad[i],tgt_pad[i]))


In [46]:
dataset[0]

(tensor([  1, 617,   4,   2,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0]),
 tensor([  1, 743,   4,   2,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   

In [47]:
from torch.utils.data import DataLoader

data_loader = DataLoader(dataset,batch_size=64,shuffle=True)

In [48]:
len(data_loader)

4343

In [52]:
from transformer import Transformer
import torch

src_vocab_size = 13610
tgt_vocab_size = 24266
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = 128
dropout = 0.1

transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)

# Generate random sample data
src_data = torch.randint(1, src_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)
tgt_data = torch.randint(1, tgt_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)

In [73]:
src_data = torch.randint(1, src_vocab_size, (64, 126)) 

In [83]:
print('type',type(src_data))

type <class 'torch.Tensor'>


In [88]:
print(src_data)

tensor([[1762, 1018, 3091,  ..., 1495, 1284, 1716],
        [ 485, 3838, 3967,  ..., 2934, 1690,  555],
        [2070, 3051, 3599,  ..., 4877, 3526, 4000],
        ...,
        [3233,  998, 3065,  ..., 2590,  993, 1436],
        [4559, 3718,  426,  ..., 4844, 4622, 4751],
        [1097, 1925, 4470,  ..., 2806, 1961, 2375]])


In [89]:
for batch,(source,target) in enumerate(data_loader):
    print(source)
    break

tensor([[  1,   6,  13,  ...,   0,   0,   0],
        [  1,   5,  31,  ...,   0,   0,   0],
        [  1, 240,  75,  ...,   0,   0,   0],
        ...,
        [  1,  95,  11,  ...,   0,   0,   0],
        [  1,   6, 698,  ...,   0,   0,   0],
        [  1,   5,  79,  ...,   0,   0,   0]])


In [53]:
epochs = 5

import torch.nn as nn
loss_fn = nn.CrossEntropyLoss(ignore_index=0)

for i in range(epochs):
    for batch,(source,target) in enumerate(data_loader):
        print('source',batch)
        print('source',source.shape)
        print('type',type(source))
        print('target',target.shape)
        output = transformer(source,target[:,:-1])
        # print('output',output.contiguous().view(-1,tgt_vocab_size).shape)
        # print('target',target[:,1:].contiguous().view(-1).shape)
        ##output = transformer(src_data,tgt_data[:,:-1])
        loss = loss_fn(output.contiguous().view(-1,tgt_vocab_size),target[:,1:].contiguous().view(-1))
        ##loss = loss_fn(output.contiguous().view(-1,tgt_vocab_size),tgt_data[:,1:].contiguous().view(-1))
        print('loss',loss)
        break
    break
        

source 0
source torch.Size([64, 113])
type <class 'torch.Tensor'>
target torch.Size([64, 90])
loss tensor(10.2315, grad_fn=<NllLossBackward0>)
