In [22]:
import pandas as pd
import torch

In [21]:
# Batching hyperparameters.
context_length = 32  # every encoded sentence is truncated/padded to this many token ids
batch_size = 16      # number of sentence pairs drawn per batch

# Batches are sampled with torch's global RNG; seed it so a fresh
# Restart & Run All reproduces the same batches.
random_seed = 1337
torch.manual_seed(random_seed);  # trailing ';' keeps the Generator repr out of the cell output

In [50]:
# Load the English/French sentence pairs into a DataFrame.
df = pd.read_csv('eng_to_french.zip')

# Print (rows, columns) as a quick sanity check on the load.
print(df.shape)

# Bare last expression: the notebook renders it as a preview of the first rows.
df.head()

(175621, 2)


Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [51]:
# Pull both sentence columns out of the frame as plain Python lists;
# position i in each list belongs to the same translation pair.
eng_text, french_text = (
    df[column].tolist()
    for column in ('English words/sentences', 'French words/sentences')
)

In [52]:
# Character inventory of each language: every distinct character that occurs
# in any sentence. A set comprehension de-duplicates on its own (no membership
# test needed before adding), and sorting makes the ordering deterministic.
eng_chars = sorted({ch for sent in eng_text for ch in sent})
fr_chars = sorted({ch for sent in french_text for ch in sent})

vocab_size_eng = len(eng_chars)
vocab_size_fr = len(fr_chars)

print(">> English Vocabulary: ")
print(eng_chars)
print(f"Size: {vocab_size_eng}")

print(">> French Vocabulary: ")
print(fr_chars)
print(f"Size: {vocab_size_fr}")

>> English Vocabulary: 
[' ', '!', '"', '$', '%', '&', "'", '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\xa0', '\xad', 'º', 'ç', 'é', 'ö', 'ú', 'а', '–', '—', '‘', '’', '₂', '€']
Size: 91
>> French Vocabulary: 
[' ', '!', '"', '$', '%', '&', "'", '(', ')', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\xa0', '«', '»', 'À', 'Â', 'Ç', 'É', 'Ê', 'Ô', 'à', 'á', 'â', 'ç', 'è', 'é', 'ê', 'ë'

In [53]:
class CharTokenizer:
    """Character-level tokenizer with reserved special tokens.

    The vocabulary is the special tokens (in the order given) followed by the
    sorted, de-duplicated characters, so the specials always occupy the lowest ids.

    Args:
        chars: iterable of characters that make up the vocabulary.
        specials: special-token strings; must contain '<pad>', '<unk>',
            '<bos>' and '<eos>'.

    Raises:
        KeyError: if one of the four required special tokens is missing.
    """

    def __init__(self, chars, specials=('<pad>', '<unk>', '<bos>', '<eos>')):
        self.specials = list(specials)
        # Exclude entries that collide with a special token; otherwise the
        # token would appear twice and the later index would win in `stoi`,
        # moving pad/unk/bos/eos away from their reserved slots.
        self.chars = self.specials + sorted(set(chars) - set(self.specials))
        self.stoi = {ch: i for i, ch in enumerate(self.chars)}
        self.itos = {i: ch for i, ch in enumerate(self.chars)}

        # Cache the ids of the special tokens for quick access.
        self.pad = self.stoi['<pad>']
        self.unk = self.stoi['<unk>']
        self.bos = self.stoi['<bos>']
        self.eos = self.stoi['<eos>']

    def encode(self, s, add_bos=True, add_eos=True):
        """Convert a string to a list of token ids.

        Characters missing from the vocabulary map to the '<unk>' id.

        Args:
            s: text to encode, iterated character by character.
            add_bos: prepend the '<bos>' id.
            add_eos: append the '<eos>' id.

        Returns:
            list of int token ids.
        """
        ids = [self.stoi.get(ch, self.unk) for ch in s]
        if add_bos:
            ids = [self.bos] + ids
        if add_eos:
            ids = ids + [self.eos]
        return ids

    def decode(self, ids, remove_specials=True):
        """Convert token ids back to a string.

        Args:
            ids: iterable of token ids. Anything convertible with ``int()`` is
                accepted (plain ints, numpy integers, 0-d integer tensors).
            remove_specials: drop '<pad>', '<bos>' and '<eos>' before joining;
                '<unk>' is not removed.

        Returns:
            the decoded string.

        Raises:
            KeyError: if an id is not part of the vocabulary.
        """
        # Normalise to built-in ints so the membership test and the dict
        # lookup below behave the same for every integer-like id type.
        ids = [int(i) for i in ids]
        if remove_specials:
            skipped = {self.pad, self.bos, self.eos}
            ids = [i for i in ids if i not in skipped]
        return ''.join(self.itos[i] for i in ids)

In [54]:
# One tokenizer per language, each built from that language's character inventory.
eng_tokenizer, fr_tokenizer = (
    CharTokenizer(vocab) for vocab in (eng_chars, fr_chars)
)

# Round-trip smoke test: show the ids, then decode them with the
# <bos>/<eos> markers left in.
print(eng_tokenizer.encode('Hii there!'))
print(
    eng_tokenizer.decode(
        eng_tokenizer.encode('Hii there!'),
        remove_specials=False,
    )
)

[2, 36, 63, 63, 4, 74, 62, 59, 72, 59, 5, 3]
<bos>Hii there!<eos>


In [55]:
# Encode every sentence to token ids (<bos> ... <eos>). The two lists stay
# aligned: position i in both is one translation pair.
eng_data = [eng_tokenizer.encode(sent) for sent in eng_text]
fr_data = [fr_tokenizer.encode(sent) for sent in french_text]
assert len(eng_data) == len(fr_data), "English/French sentence counts differ"

# The first rows shown by df.head() are the shortest sentences, which suggests
# the file is ordered by length (NOTE(review): confirm). A plain head/tail
# split would then leave only the longest sentences in validation, so shuffle
# the pair indices once. A dedicated, fixed-seed generator keeps the split
# identical on every run and independent of the global RNG state.
split_generator = torch.Generator().manual_seed(0)
shuffled_idx = torch.randperm(len(eng_data), generator=split_generator).tolist()

# 90% train / 10% validation, using the same indices for both languages so
# the pairs stay aligned.
train_len = int(0.9 * len(eng_data))
train_idx, val_idx = shuffled_idx[:train_len], shuffled_idx[train_len:]

eng_data_train = [eng_data[i] for i in train_idx]
eng_data_val = [eng_data[i] for i in val_idx]
fr_data_train = [fr_data[i] for i in train_idx]
fr_data_val = [fr_data[i] for i in val_idx]

In [61]:
def get_batch_and_padding(split='train'):
    """Sample a random batch of sentence pairs, padded to a fixed length.

    Args:
        split: 'train' draws from the training pairs; any other value draws
            from the validation pairs.

    Returns:
        (src_batch, tgt_batch, src_mask, tgt_mask). The batches are long
        tensors of shape (batch_size, context_length) holding English and
        French token ids. The masks are bool tensors of the same shape,
        True at real tokens and False at padding positions.
    """
    # Local names differ from the module-level eng_data/fr_data on purpose,
    # so the full encoded corpus is not shadowed inside the function.
    src_seqs = eng_data_train if split == 'train' else eng_data_val
    tgt_seqs = fr_data_train if split == 'train' else fr_data_val

    def pad(ids, max_len, pad_id):
        # Truncate to max_len, then right-pad with pad_id. A sentence longer
        # than max_len loses its tail, including the trailing <eos> id.
        ids = ids[:max_len]
        return ids + [pad_id] * (max_len - len(ids))

    # Each element is a whole sentence pair (not a position in a token
    # stream), so every index is a valid draw; subtracting context_length
    # from the bound would only make the last pairs unsampleable.
    # .tolist() yields plain ints for list indexing.
    idxs = torch.randint(len(src_seqs), (batch_size,)).tolist()

    # Pad every sampled sentence to context_length and stack into a batch;
    # the same indices are used on both sides to keep the pairs aligned.
    src_batch = torch.stack([
        torch.tensor(pad(src_seqs[i], context_length, eng_tokenizer.pad), dtype=torch.long)
        for i in idxs
    ])
    tgt_batch = torch.stack([
        torch.tensor(pad(tgt_seqs[i], context_length, fr_tokenizer.pad), dtype=torch.long)
        for i in idxs
    ])

    # Padding masks: True where the position holds a real token.
    src_mask = src_batch != eng_tokenizer.pad
    tgt_mask = tgt_batch != fr_tokenizer.pad

    return src_batch, tgt_batch, src_mask, tgt_mask

In [62]:
# Draw one random, padded batch from the training split together with its padding masks.
src_batch, tgt_batch, src_mask, tgt_mask = get_batch_and_padding(split='train')

In [79]:
# Decode the second English example of the batch back to text (pad/bos/eos ids are dropped by default).
print(eng_tokenizer.decode(src_batch[1].tolist()))

Somebody touched me.


In [78]:
# Decode the French example at the same batch position; it should be the translation of the English one.
print(fr_tokenizer.decode(tgt_batch[1].tolist()))

Quelqu'un m'a touché.
