# Importing relevant libraries

In [14]:
#__future__ to bring features from newer versions of Python 
# from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds

import string
from string import digits
import re
import time
import numpy as np
import pandas as pd
import collections
import unicodedata

import os

from bert import BertModelLayer
from bert.loader import StockBertConfig, load_stock_weights

Init Plugin
Init Graph Optimizer
Init Kernel


In [15]:
# Ask TensorFlow which GPU devices it can see. tf.test.is_gpu_available() is
# deprecated in favour of tf.config.list_physical_devices('GPU'); a non-empty
# device list means a GPU is available.
gpu_devices = tf.config.list_physical_devices('GPU')
print("GPU Available: ", bool(gpu_devices))

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
GPU Available: Metal device set to: Apple M1
 True


2022-07-30 04:22:51.568174: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-07-30 04:22:51.568617: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


# Cleaning the data

In [16]:
#reading data from NLD-ENG translation dataset
# The corpus contains non-ASCII characters (e.g. "Hé!"), so decode it explicitly
# as UTF-8 instead of relying on the platform's default encoding.
with open('nld-eng/nld.txt', 'r', encoding='utf-8') as f:
  data = f.read()

In [17]:
# Strip leading/trailing whitespace from the raw file contents, then split on
# newlines so that each list element is one line of the corpus.
uncleaned_data_list = data.strip().split('\n')
# Number of lines read from the file.
len(uncleaned_data_list)

68954

In [18]:
# Peek at one raw line (index 4) to see what a record looks like before cleaning.
print(uncleaned_data_list[4])

Hi.	Hai!	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #6117420 (Raizin)


In [19]:
# Show the tab-separated fields of the first line only
# (the loop exits after a single iteration).
for word in uncleaned_data_list:
    print(word.split('\t'))
    break

['Go.', 'Lopen!', 'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #7764436 (LinguisticFusion)']


In [20]:
# Same first line, with the last tab-separated field dropped by the [:-1] slice.
for word in uncleaned_data_list:
    print(word.split('\t')[:-1])
    break

['Go.', 'Lopen!']


In [21]:
#separating english and dutch words 
# Each corpus line has the form "<English>\t<Dutch>\t<attribution>";
# keep the first two fields and ignore the rest.
eng_word = []
nld_word = []
for line in uncleaned_data_list:
    # Split once per line instead of re-splitting for every field.
    fields = line.split('\t')
    # Read both fields before appending so a malformed line cannot leave the
    # two lists with different lengths.
    english, dutch = fields[0], fields[1]
    eng_word.append(english)
    nld_word.append(dutch)

In [22]:
#creating pandas df with english words and their dutch equivalents
# Build the two-column frame in a single constructor call; the dict's key order
# fixes the column order (English first, then Dutch).
data = pd.DataFrame({'English': eng_word, 'Dutch': nld_word})

In [23]:
# Persist the sentence pairs to disk; index=False keeps the row index out of the file.
data.to_csv('data.csv', index=False)

In [24]:
# Reload the sentence pairs from disk. keep_default_na=False stops pandas from
# converting sentences that look like missing-value markers (e.g. "NA", "None",
# "null") into NaN, so every cell stays a string.
data = pd.read_csv('data.csv', keep_default_na=False)
data.head()

Unnamed: 0,English,Dutch
0,Go.,Lopen!
1,Go.,Vooruit.
2,Hi.,Hoi.
3,Hi.,Hé!
4,Hi.,Hai!


In [25]:
# Inspect the last English sentence in the list.
eng_word[-1]

"If someone who doesn't know your background says that you sound like a native speaker, it means they probably noticed something about your speaking that made them realize you weren't a native speaker. In other words, you don't really sound like a native speaker."

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
# Hold out 20% of the sentence pairs for testing and train on the remaining 80%.
# A fixed random_state makes the generated split reproducible.
train_ex, test_ex = train_test_split(data,test_size=0.2, random_state=42)

In [28]:
# Pull each language column out of the train/test frames as an array of sentences.
eng_train = train_ex['English'].values
nld_train = train_ex['Dutch'].values
eng_test = test_ex['English'].values
nld_test = test_ex['Dutch'].values

In [29]:
# tf.data.Dataset.from_tensor_slices() slices the (English, Dutch) arrays in
# parallel, so each dataset element is one (English, Dutch) sentence pair.
# NOTE: train_ex / test_ex are rebound here from DataFrames to tf.data Datasets.
train_ex = tf.data.Dataset.from_tensor_slices((eng_train,nld_train))
test_ex = tf.data.Dataset.from_tensor_slices((eng_test,nld_test))

2022-07-30 04:22:55.298627: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-07-30 04:22:55.298651: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [30]:
# Print the first training pair. Each element's .numpy() value is passed through
# tf.compat.as_text(), which converts string-like input to unicode text.
for en, nld in train_ex.take(1):
  print(tf.compat.as_text(en.numpy()))
  print(tf.compat.as_text(nld.numpy()))

They walked.
Ze hebben gelopen.


# Preprocessing and Tokenization

In [49]:
def to_unicode(text):
    """Return `text` as a `str`, decoding `bytes` input as UTF-8.

    Args:
        text: A `str` (returned unchanged) or `bytes` (decoded as UTF-8;
            undecodable bytes are dropped because of the 'ignore' handler).

    Returns:
        The text as a `str`.

    Raises:
        ValueError: If `text` is neither `str` nor `bytes`.
    """
    if isinstance(text, bytes):
        return text.decode('utf-8', 'ignore')
    if not isinstance(text, str):
        raise ValueError("Unsupported string type: %s" % (type(text)))
    # Already text: nothing to convert.
    return text

In [47]:
#load vocab file into a dictionary
def load_dict(vocab_file):
    """Load a vocabulary file into an ordered token -> index mapping.

    The file is read as UTF-8 text with one token per line. Each line is
    stripped of surrounding whitespace and mapped to its zero-based line
    number. If the same token appears on several lines, the last line wins.

    Args:
        vocab_file: Path to the vocabulary file.

    Returns:
        A `collections.OrderedDict` mapping each token to its index.
    """
    vocab = collections.OrderedDict()
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(vocab_file, "r", encoding="utf-8") as reader:
        for index, line in enumerate(reader):
            vocab[line.strip()] = index
    return vocab

In [33]:
# removes whitespace from text and returns tokens
def remove_whitespace(text):
    """Split `text` into whitespace-separated tokens.

    Args:
        text: The string to split.

    Returns:
        A list of tokens; an empty list when `text` is empty or contains
        only whitespace.
    """
    stripped = text.strip()
    return stripped.split() if stripped else []

In [34]:
def convert_vocab(vocab,items):
    """Look up every element of `items` in `vocab`.

    Args:
        vocab: Mapping used for the lookup (token -> id or id -> token).
        items: Iterable of keys to translate.

    Returns:
        A list with `vocab[item]` for each item, in input order.

    Raises:
        KeyError: If an item is not present in `vocab`.
    """
    return [vocab[item] for item in items]

In [53]:
# for end-to-end tokenization
class FullTokenizer(object):
    """End-to-end tokenizer: basic tokenization followed by WordPiece splitting.

    Attributes:
        vocab: Ordered token -> id mapping loaded from the vocabulary file.
        invert_vocab: Reverse mapping, id -> token.
        basic_tokenizer: `BasicTokenizer` that produces the initial tokens.
        wordpiece_tokenizer: `WordpieceTokenizer` that splits those tokens
            into sub-tokens found in `vocab`.
    """

    def __init__(self, vocab_file, do_lower_case=True):
        """Load the vocabulary and build both tokenization stages.

        Args:
            vocab_file: Path to the vocabulary file, passed to `load_dict`.
            do_lower_case: Forwarded to `BasicTokenizer`.
        """
        self.vocab = load_dict(vocab_file)
        # Reverse lookup table for turning ids back into tokens.
        self.invert_vocab = dict(zip(self.vocab.values(), self.vocab.keys()))
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

    def tokenize(self, text):
        """Return the WordPiece sub-tokens of every basic token in `text`."""
        return [
            piece
            for word in self.basic_tokenizer.tokenize(text)
            for piece in self.wordpiece_tokenizer.tokenize(word)
        ]

    def convert_tokens_to_ids(self, tokens):
        """Map a sequence of tokens to their vocabulary ids."""
        return convert_vocab(self.vocab, tokens)

    def convert_ids_to_tokens(self, ids):
        """Map a sequence of vocabulary ids back to their tokens."""
        return convert_vocab(self.invert_vocab, ids)

In [51]:
# tokenizes based on whitespace and punctuation, also lower-cases the tokens
class BasicTokenizer(object):
    """Splits text on whitespace and punctuation, optionally lower-casing it."""

    def __init__(self, do_lower_case=True):
        """Store whether tokens should be lower-cased and accent-stripped."""
        self.do_lower_case = do_lower_case

    def tokenize(self, text):
        """Tokenize `text` into words and punctuation marks.

        The text is converted to `str`, cleaned, and CJK characters are
        surrounded by spaces. Each whitespace-separated token is then
        optionally lower-cased and accent-stripped before being split around
        punctuation.

        Args:
            text: `str` or `bytes` input.

        Returns:
            A list of token strings.
        """
        prepared = self._tokenize_chinese_chars(self.text_clean(to_unicode(text)))

        pieces = []
        for word in remove_whitespace(prepared):
            if self.do_lower_case:
                word = self._run_strip_accents(word.lower())
            pieces.extend(self._run_split_on_punc(word))

        # Re-join and re-split so that no empty or whitespace-only pieces remain.
        return remove_whitespace(" ".join(pieces))

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        # NFD decomposition separates base characters from their combining
        # marks (category "Mn"), which are then dropped.
        decomposed = unicodedata.normalize("NFD", text)
        return "".join(
            ch for ch in decomposed if unicodedata.category(ch) != "Mn"
        )

    def _run_split_on_punc(self, text):
        """Splits punctuation on a piece of text.

        Every punctuation character becomes its own piece; runs of
        non-punctuation characters between them are kept together.
        """
        groups = []
        begin_new_group = True
        for ch in text:
            if _is_punctuation(ch):
                groups.append([ch])
                begin_new_group = True
                continue
            if begin_new_group:
                groups.append([])
                begin_new_group = False
            groups[-1].append(ch)

        return ["".join(group) for group in groups]

    def _tokenize_chinese_chars(self, text):
        """Surround every CJK character in `text` with single spaces."""
        return "".join(
            " " + ch + " " if self._is_chinese_char(ord(ch)) else ch
            for ch in text
        )

    def _is_chinese_char(self, cp):
        """Return True when code point `cp` falls in one of the CJK ranges below."""
        cjk_ranges = (
            (0x4E00, 0x9FFF),
            (0x3400, 0x4DBF),
            (0x20000, 0x2A6DF),
            (0x2A700, 0x2B73F),
            (0x2B740, 0x2B81F),
            (0x2B820, 0x2CEAF),
            (0xF900, 0xFAFF),
            (0x2F800, 0x2FA1F),
        )
        return any(low <= cp <= high for low, high in cjk_ranges)

    def text_clean(self, text):
        """Drop invalid characters and normalise whitespace characters to ' '.

        NULL (U+0000), the replacement character (U+FFFD) and control
        characters are removed; every whitespace character is replaced by a
        single space.
        """
        kept = []
        for ch in text:
            code = ord(ch)
            if code == 0 or code == 0xfffd or _is_control(ch):
                continue
            kept.append(" " if _is_whitespace(ch) else ch)
        return "".join(kept)


In [37]:
# tokenization based on Wordpiece
class WordpieceTokenizer(object):
    """Splits words into WordPiece sub-tokens using a fixed vocabulary."""

    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
        """Store the lookup vocabulary and the fallback settings.

        Args:
            vocab: Container of known word pieces (membership is tested with `in`).
            unk_token: Token emitted for a word that cannot be represented
                with pieces from `vocab`.
            max_input_chars_per_word: Words longer than this many characters
                are mapped straight to `unk_token`.
        """
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """Tokenizes text into word pieces by greedy longest-match-first lookup.

        This method only applies an existing vocabulary; it does not build
        one. For every whitespace-separated word, the longest prefix found in
        the vocabulary is taken as the first piece, and the process repeats
        on the remainder of the word. Pieces that do not start the word are
        looked up with a leading '##', which marks them as a continuation of
        the preceding piece.

        For example (assuming all three pieces are in the vocabulary):
          input = "unaffable"
          output = ["un", "##aff", "##able"]

        A word is replaced by `unk_token` as a whole when it is longer than
        `max_input_chars_per_word` or when some remainder has no matching
        piece in the vocabulary.

        Args:
          text: A single token or whitespace separated tokens. This should have
            already been passed through `BasicTokenizer`.

        Returns:
          A list of wordpiece tokens.
        """
        pieces = []
        for word in remove_whitespace(to_unicode(text)):
            word_len = len(word)
            if word_len > self.max_input_chars_per_word:
                pieces.append(self.unk_token)
                continue

            word_pieces = []
            pos = 0
            while pos < word_len:
                match = None
                # Try the longest remaining substring first, shrinking from the right.
                for stop in range(word_len, pos, -1):
                    candidate = word[pos:stop]
                    if pos > 0:
                        candidate = "##" + candidate
                    if candidate in self.vocab:
                        match = candidate
                        break
                if match is None:
                    # Nothing matches at this position: the whole word is unknown.
                    pieces.append(self.unk_token)
                    break
                word_pieces.append(match)
                # Continue right after the piece that was just matched.
                pos = stop
            else:
                # The loop ended without a break, so the whole word was covered.
                pieces.extend(word_pieces)
        return pieces

In [38]:
# checks if char is whitespace 
def _is_whitespace(char):
    # \t, \n, and \r are technically controll characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    # returns category of char
    cat = unicodedata.category(char)
    # char of category space separator
    if cat == "Zs":
        return True
    return False


def _is_control(char):
    # These are technically control characters but we count them as whitespace
    # characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    # returns category of char
    cat = unicodedata.category(char)
    # char of category control, format, private use, or surrogate
    if cat.startswith("C"):
        return True
    return False

# checks if char is a punctuation
def _is_punctuation(char):
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    # returns category of char
    cat = unicodedata.category(char)
    # char of category punctuation
    if cat.startswith("P"):
        return True
    return False

### Create a custom subwords tokenizer from the training dataset for the decoder.
The encoder side (English) uses the BERT WordPiece tokenizer (`FullTokenizer`) defined above.

In [45]:
# Filename prefix of the Dutch subword vocabulary; the check below looks for
# the file "<prefix>.subwords".
vocab_file = 'vocab_nld'
# The vocabulary is "trained" on a corpus and all wordpieces are stored in a vocabulary file
if os.path.isfile(vocab_file + '.subwords'):
    # Invertible TextEncoder using word pieces with a byte-level fallback
    tokenizer_nld = tfds.deprecated.text.SubwordTextEncoder.load_from_file(vocab_file)
else:
    # No stored vocabulary: build it from the Dutch training sentences and
    # save it under the same prefix that the check above looks for.
    tokenizer_nld = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
        nld_train, target_vocab_size=2 ** 15)
    tokenizer_nld.save_to_file(vocab_file)

# Sanity check: encode a sample sentence and show what each id decodes back to.
sample_string = 'Mooie kale man.'
tokenized_string = tokenizer_nld.encode(sample_string)
for ts in tokenized_string:
    print('{} ----> {}'.format(ts, tokenizer_nld.decode([ts])))

17285 ----> Mooie 
14012 ----> kale 
580 ----> man
18097 ----> .


In [59]:
# english tokenizer using the custom FullTokenizer
tokenizer_en = FullTokenizer(
    vocab_file= 'vocab_en.txt',
    do_lower_case=True)

# Sanity check: tokenize the last English training sentence.
test_tokens = tokenizer_en.tokenize(eng_train[-1])
# Wrap the tokens in BERT's special tokens before converting them to ids:
# [CLS] is placed at the start of the input sequence and [SEP] marks the end of
# a sentence/segment, which lets the model tell one sentence from the next.
# In BERT, the final hidden state of [CLS] serves as the sequence-level
# representation for classification tasks such as next-sentence prediction.
# [MASK] is a separate token used for masked-word prediction; it is not added here.
test_ids = tokenizer_en.convert_tokens_to_ids(['[CLS]'] + test_tokens + ['[SEP]'])
print(test_ids)
print(tokenizer_en.convert_ids_to_tokens(test_ids))

[101, 2498, 2064, 2022, 2589, 1012, 102]
['[CLS]', 'nothing', 'can', 'be', 'done', '.', '[SEP]']


In [66]:
# Default target length (in token ids) to which `encode` pads shorter sequences.
MAX_SEQ_LENGTH = 50


def encode(en, nld, seq_length=MAX_SEQ_LENGTH):
    """Convert one (English, Dutch) sentence pair into two lists of token ids.

    The English sentence is tokenized with `tokenizer_en` and wrapped in
    '[CLS]' ... '[SEP]'. The Dutch sentence is encoded with `tokenizer_nld`
    and wrapped in the ids `vocab_size` and `vocab_size + 1`, the first two
    ids just past that tokenizer's own vocabulary.

    Sequences shorter than `seq_length` are right-padded with zeros.
    Longer sequences are returned as they are: nothing is truncated here.

    Args:
        en: Tensor holding the English sentence (must support `.numpy()`).
        nld: Tensor holding the Dutch sentence (must support `.numpy()`).
        seq_length: Length to pad shorter sequences to.

    Returns:
        A tuple `(english_ids, dutch_ids)` of id lists.
    """
    def pad(ids):
        # Right-pad with int32 zeros up to seq_length; leave longer inputs alone.
        missing = seq_length - len(ids)
        if missing > 0:
            ids = ids + list(np.zeros(missing, 'int32'))
        return ids

    # tf.compat.as_text converts the tensor's value to unicode text.
    en_tokens = tokenizer_en.tokenize(tf.compat.as_text(en.numpy()))
    en_ids = pad(tokenizer_en.convert_tokens_to_ids(['[CLS]'] + en_tokens + ['[SEP]']))

    first_id = tokenizer_nld.vocab_size
    nld_body = tokenizer_nld.encode(tf.compat.as_text(nld.numpy()))
    last_id = tokenizer_nld.vocab_size + 1
    nld_ids = pad([first_id] + nld_body + [last_id])

    return en_ids, nld_ids

In [64]:
def tf_encode(en, nld):
    """Wrap `encode` in `tf.py_function` so it can be used with `Dataset.map`.

    Args:
        en: English sentence tensor.
        nld: Dutch sentence tensor.

    Returns:
        A tuple `(result_en, result_nld)` of int32 tensors, each declared as
        1-D with unknown length.
    """
    # tf.py_function makes it possible to express control flow using Python
    # constructs (if, while, for, etc.) instead of TensorFlow control flow
    # constructs (tf.cond, tf.while_loop).
    encoded = tf.py_function(encode, [en, nld], [tf.int32, tf.int32])
    result_en, result_nld = encoded
    # Declare both outputs as rank-1 tensors of unknown length.
    for tensor in (result_en, result_nld):
        tensor.set_shape([None])

    return result_en, result_nld

In [68]:
def filter_max_length(x, y, max_length=MAX_SEQ_LENGTH):
    """Dataset filter predicate: keep a pair only if both sequences fit.

    Args:
        x: First id sequence of the pair.
        y: Second id sequence of the pair.
        max_length: Largest number of elements allowed in either sequence.

    Returns:
        A boolean tensor that is True when both `x` and `y` contain at most
        `max_length` elements.
    """
    x_fits = tf.size(x) <= max_length
    y_fits = tf.size(y) <= max_length
    # Element-wise AND of the two conditions.
    return tf.logical_and(x_fits, y_fits)

In [69]:
BUFFER_SIZE = 40000  # number of elements held in the shuffle buffer
BATCH_SIZE = 64

# Encode every sentence pair with tf_encode, then drop pairs where either
# sequence is longer than the filter's max_length.
train_dataset = train_ex.map(tf_encode)
train_dataset = train_dataset.filter(filter_max_length)

# cache the dataset to memory to get a speedup while reading from it.
train_dataset = train_dataset.cache()
# maintains a fixed-size buffer and chooses the next element uniformly at random from that buffer.
train_dataset = train_dataset.shuffle(BUFFER_SIZE).padded_batch(
    BATCH_SIZE, padded_shapes=([-1], [-1]), drop_remainder=True)
# creates a Dataset that prefetches elements from this dataset. This allows later elements to
# be prepared while the current element is being processed.
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

# The test pipeline reuses tf_encode (instead of a separate py_function lambda)
# so that both datasets are encoded identically and carry the same shape
# information. It is not shuffled, and the final partial batch is kept.
test_dataset = test_ex.map(tf_encode)
test_dataset = test_dataset.filter(filter_max_length)
test_dataset = test_dataset.padded_batch(BATCH_SIZE, padded_shapes=([-1], [-1]))

# Positional Encoding
A positional encoding vector is added to each token's embedding vector. Embeddings represent a token in a
d-dimensional space in which tokens with similar meanings lie close to each other, but they do
not encode the relative position of words in a sentence. After the positional encoding is added, words
lie close to each other in that d-dimensional space based on both the similarity of their meaning and
their position in the sentence.