In [1]:
import random
import tensorflow as tf
import numpy as np
import time
import json

def fix_random_seed(seed):
    """Seed NumPy, TensorFlow and Python's ``random`` module with one value.

    The libraries are seeded one after another. If a library's name is not
    bound (``NameError``), a warning is printed and the remaining libraries
    are still seeded.

    Args:
        seed: Value passed to ``np.random.seed``, ``tf.random.set_seed`` and
            ``random.seed``.
    """
    # Each entry pairs a seeding call with the warning shown when the
    # corresponding library name is undefined.
    seeders = (
        (
            lambda: np.random.seed(seed),
            "Warning: Numpy is not imported. Setting the seed for Numpy failed.",
        ),
        (
            lambda: tf.random.set_seed(seed),
            "Warning: TensorFlow is not imported. Setting the seed for TensorFlow failed.",
        ),
        (
            lambda: random.seed(seed),
            "Warning: random module is not imported. Setting the seed for random failed.",
        ),
    )

    for apply_seed, warning in seeders:
        try:
            apply_seed()
        except NameError:
            print(warning)
 
# Seed every library once, up front, so that later sampling is repeatable
random_seed = 4321
fix_random_seed(random_seed)

# Record which TensorFlow build is in use
print("TensorFlow version: {}".format(tf.__version__))

TensorFlow version: 2.18.0


In [2]:
# Without this setting, training failed with the following error:
#   _Derived_]RecvAsync is cancelled.
#   [[{{node gradient_tape/model_1/embedding_1/embedding_lookup/Reshape/_172}}]] [Op:__inference_train_function_31985]
# NOTE(review): presumably this makes TensorFlow grow its GPU memory on demand
# instead of reserving it all up front — confirm against the TensorFlow GPU guide.

%env TF_FORCE_GPU_ALLOW_GROWTH=true

env: TF_FORCE_GPU_ALLOW_GROWTH=true


In [3]:
import os
import requests
import zipfile

# The archive has to be downloaded by hand; stop right away if it is missing
if not os.path.exists(os.path.join('data','deu-eng.zip')):
    raise FileNotFoundError(
        "Uh oh! Did you download the deu-eng.zip from http://www.manythings.org/anki/deu-eng.zip manually and place it in the Ch11/data folder?"
    )

# Unpack only when the extracted text file is not already on disk
if os.path.exists(os.path.join('data', 'deu.txt')):
    print("The extracted data already exists")
else:
    with zipfile.ZipFile(os.path.join('data','deu-eng.zip'), 'r') as zip_ref:
        zip_ref.extractall('data')

The extracted data already exists


In [4]:
import pandas as pd

# Read the tab-separated sentence pairs (the file has no header row)
df = pd.read_csv(os.path.join('data', 'deu.txt'), delimiter='\t', header=None)
# Set column names
df.columns = ["EN", "DE", "Attribution"]
df = df[["EN", "DE"]]
print('df.shape = {}'.format(df.shape))

# Drop pairs whose German text contains a character in U+0080-U+00BF.
# In UTF-8 the byte 0xC2 occurs only as the lead byte of exactly those
# characters, so this is the same filter as testing each encoded sentence for
# b"\xc2", but done in one vectorised pass instead of a Python loop over
# df.iloc. Missing text (na=True) is treated as unclean and dropped.
has_c2_char = df["DE"].str.contains("[\u0080-\u00bf]", regex=True, na=True)
# Positional indices of the rows that are kept
clean_inds = [i for i, flagged in enumerate(has_c2_char) if not flagged]
df = df.iloc[clean_inds]

print('(clean) df.shape = {}'.format(df.shape))

df.shape = (277891, 2)
(clean) df.shape = (277205, 2)


In [5]:
# Display the last rows of the EN/DE frame as a quick sanity check
df.tail()

Unnamed: 0,EN,DE
277882,Remember that the purpose of the Tatoeba Proje...,"Es gilt zu bedenken, dass es das Anliegen des ..."
277883,"When I was younger, I hated going to weddings....","Als ich jünger war, hasste ich es, auf Hochzei..."
277884,If someone who doesn't know your background sa...,"Wenn jemand, der deine Herkunft nicht kennt, s..."
277885,If someone who doesn't know your background sa...,"Wenn jemand Fremdes dir sagt, dass du dich wie..."
277887,If someone who doesn't know your background sa...,"Wenn einem von jemandem, der nicht weiß, woher..."


In [6]:
n_samples = 50000

# Markers wrapped around every German sentence
start_token = 'sos'
end_token = 'eos'

# Down-sample reproducibly, then rewrite DE as "<start_token> sentence <end_token>"
df = df.sample(n=n_samples, random_state=random_seed).assign(
    DE=lambda sampled: start_token + ' ' + sampled["DE"] + ' ' + end_token
)

In [7]:
# Hold out one tenth of the sampled pairs as the test set
test_df = df.sample(n=int(n_samples/10), random_state=random_seed)
# Draw an equally sized validation set from the pairs that are not in the test set
valid_df = df[~df.index.isin(test_df.index)].sample(n=int(n_samples/10), random_state=random_seed)
# Whatever is in neither held-out set becomes the training data
train_df = df[~df.index.isin(test_df.index) & ~df.index.isin(valid_df.index)]

print(
    'test_df.shape = {}'.format(test_df.shape),
    'valid_df.shape = {}'.format(valid_df.shape),
    'train_df.shape = {}'.format(train_df.shape),
    sep='\n',
)

test_df.shape = (5000, 2)
valid_df.shape = (5000, 2)
train_df.shape = (40000, 2)


In [8]:
from collections import Counter

# Create a flattened list from English words.
# A nested comprehension is used rather than Series.sum(): summing lists
# concatenates (and therefore re-copies) the growing result for every row,
# which is quadratic in the corpus size. The resulting list is the same.
en_words = [word for sentence in train_df["EN"].str.split() for word in sentence]
# Create a flattened list of German words
de_words = [word for sentence in train_df["DE"].str.split() for word in sentence]

# Get the vocabulary size of words appearing more than or equal to 10 times
n=10

def get_vocabulary_size_greater_than(words, n, verbose=True):
    """Count the distinct words that occur at least ``n`` times.

    Args:
        words: Iterable of word tokens (one entry per occurrence).
        n: Minimum frequency (inclusive) for a word to be counted.
        verbose: When true, print the ten most frequent words and the
            resulting vocabulary size.

    Returns:
        Number of distinct words whose frequency is >= ``n``.
    """
    # word -> number of occurrences
    word_counts = Counter(words)
    tokens = list(word_counts.keys())
    frequencies = list(word_counts.values())

    # Frequencies indexed by word, most frequent first
    freq_ser = pd.Series(frequencies, index=tokens).sort_values(ascending=False)

    if verbose:
        # Show the most common words
        print(freq_ser.head(n=10))

    # Number of words meeting the frequency threshold
    n_vocab = (freq_ser >= n).sum()

    if verbose:
        print("\nVocabulary size (>={} frequent): {}".format(n, n_vocab))

    return n_vocab

# Report and keep the English vocabulary size
print("English corpus", '='*50, sep="\n")
en_vocab = get_vocabulary_size_greater_than(en_words, n)

# Same for the German side
print("\nGerman corpus", '='*50, sep="\n")
de_vocab = get_vocabulary_size_greater_than(de_words, n)

English corpus
Tom    9228
to     8700
I      8620
the    6766
you    6136
a      5741
is     4141
in     2639
of     2470
was    2380
dtype: int64

Vocabulary size (>=10 frequent): 2218

German corpus
sos      40000
eos      40000
Tom       9713
Ich       7964
ist       4735
nicht     4616
zu        3606
Sie       3441
du        3132
das       2987
dtype: int64

Vocabulary size (>=10 frequent): 2483


In [9]:
def print_sequence_length(str_ser):
    """Print length statistics, in space-separated tokens, for a Series of strings.

    Two summaries are printed: one over every entry, and one restricted to
    lengths at or above the 1% quantile and strictly below the 99% quantile.

    Args:
        str_ser: pandas Series of strings (one sentence per entry).
    """
    # Number of tokens per entry, splitting on single spaces
    token_counts = str_ser.str.split(' ').str.len()

    # Overall picture: median first, then the full describe() table
    print("\nSome summary statistics")
    print("Median length: {}\n".format(token_counts.median()))
    print(token_counts.describe())

    # Cut-offs used to set the extreme lengths aside
    print("\nComputing the statistics between the 1% and 99% quantiles (to ignore outliers)")
    lower_cut = token_counts.quantile(0.01)
    upper_cut = token_counts.quantile(0.99)

    # Lower bound is inclusive, upper bound is exclusive
    within_cuts = token_counts.ge(lower_cut) & token_counts.lt(upper_cut)
    print(token_counts[within_cuts].describe())

# Length statistics of the English training sentences
print("English corpus", '='*50, sep="\n")
print_sequence_length(train_df["EN"])

# Length statistics of the German training sentences
print("\nGerman corpus", '='*50, sep="\n")
print_sequence_length(train_df["DE"])

English corpus

Some summary statistics
Median length: 6.0

count    40000.000000
mean         6.294025
std          2.542850
min          1.000000
25%          5.000000
50%          6.000000
75%          8.000000
max         44.000000
Name: EN, dtype: float64

Computing the statistics between the 1% and 99% quantiles (to ignore outliers)
count    39584.000000
mean         6.184671
std          2.284073
min          2.000000
25%          5.000000
50%          6.000000
75%          7.000000
max         14.000000
Name: EN, dtype: float64

German corpus

Some summary statistics
Median length: 8.0

count    40000.000000
mean         8.332250
std          2.536094
min          3.000000
25%          7.000000
50%          8.000000
75%         10.000000
max         52.000000
Name: DE, dtype: float64

Computing the statistics between the 1% and 99% quantiles (to ignore outliers)
count    39227.000000
mean         8.253116
std          2.231582
min          5.000000
25%          7.000000
50%    

In [10]:
# Vocabulary sizes computed earlier from the training split
print(
    "EN vocabulary size: {}".format(en_vocab),
    "DE vocabulary size: {}".format(de_vocab),
    sep="\n",
)

# Define sequence lengths with some extra space for longer sequences
en_seq_length, de_seq_length = 19, 21

print(
    "EN max sequence length: {}".format(en_seq_length),
    "DE max sequence length: {}".format(de_seq_length),
    sep="\n",
)

EN vocabulary size: 2218
DE vocabulary size: 2483
EN max sequence length: 19
DE max sequence length: 21


In [11]:
import tensorflow as tf
import os
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K


class DecoderRNNAttentionWrapper(Layer):
    """
    This class implements Bahdanau attention (https://arxiv.org/pdf/1409.0473.pdf)
    around a step-wise RNN cell.

    Three weight matrices are introduced: W_a (applied to the encoder outputs),
    U_a (applied to the current decoder state) and V_a (reduces the combined
    projection to one score per encoder position).

    Args:
        cell: Step-wise RNN cell. It must expose ``units`` and be callable as
            ``cell(inputs, states)`` returning ``(output, new_states)``.
        units: Width of the attention projection (columns of W_a and U_a).
        **kwargs: Forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, cell, units, **kwargs):
        # Initialise the base layer before attaching the wrapped cell
        super().__init__(**kwargs)
        self._cell = cell
        self.units = units

    def build(self, input_shape):
        """
        Create the attention weights.

        input_shape: [encoder_output_shape, decoder_input_shape]; only the last
        dimension of the encoder output shape is used here.
        """

        assert isinstance(input_shape, list)
        # Create a trainable weight variable for this layer.
        # Weight matrices of the attention layer
        # W_a is a [encoder output size, self.units] sized matrix
        self.W_a = self.add_weight(name='W_a',
                                   shape=tf.TensorShape((input_shape[0][2], self.units)),
                                   initializer='uniform',
                                   trainable=True)
        # U_a is a [cell units, self.units] sized matrix
        self.U_a = self.add_weight(name='U_a',
                                   shape=tf.TensorShape((self._cell.units, self.units)),
                                   initializer='uniform',
                                   trainable=True)
        # V_a is a [self.units, 1] sized matrix
        self.V_a = self.add_weight(name='V_a',
                                   shape=tf.TensorShape((self.units, 1)),
                                   initializer='uniform',
                                   trainable=True)

    # Code listing 12.1
    def call(self, inputs, initial_state, training=False):
        """
        Run the wrapped cell over the decoder inputs, attending over the
        encoder outputs at every step.

        inputs: [encoder_output_sequence, decoder_input_sequence]
        initial_state: initial state handed to the RNN loop
        training: accepted for API compatibility; not used in this method

        Returns (attn_out, attn_energy): the per-step cell outputs and the
        per-step attention weights over the encoder positions.
        """

        encoder_outputs, decoder_inputs = inputs

        def _step(inputs, states):
            """ Step function for computing energy for a single decoder state
            inputs: (batchsize * de_in_dim)
            states: [(batchsize * de_latent_dim)] followed by the constants
            """

            assert_msg = "States must be an iterable. Got {} of type {}".format(states, type(states))
            assert isinstance(states, list) or isinstance(states, tuple), assert_msg

            # The encoder sequence is passed to K.rnn as a constant and is read
            # back here as the last entry of `states`
            encoder_full_seq = states[-1]

            # Computing S.Wa where S=[s0, s1, ..., si]
            # <= batch_size, en_seq_len, self.units
            W_a_dot_h = K.dot(encoder_full_seq, self.W_a)

            # Computing hj.Ua
            # <= batch_size, 1, self.units
            U_a_dot_s = K.expand_dims(K.dot(states[0], self.U_a), 1)

            # tanh(S.Wa + hj.Ua); the decoder term broadcasts over en_seq_len
            # <= batch_size, en_seq_len, self.units
            Wh_plus_Us = K.tanh(W_a_dot_h + U_a_dot_s)

            # softmax(va.tanh(S.Wa + hj.Ua))
            # <= batch_size, en_seq_len
            e_i = K.squeeze(K.dot(Wh_plus_Us, self.V_a), axis=-1)
            # <= batch_size, en_seq_len
            e_i = K.softmax(e_i)

            # Computing c_i using e_i and h_j (for all j): the attention-weighted
            # sum of the encoder outputs
            # <= batch_size, encoder output size
            c_i = K.sum(encoder_full_seq * K.expand_dims(e_i, -1), axis=1)

            # Concat the current input and c_i and feed it to the RNN
            s, states = self._cell(K.concatenate([inputs, c_i], axis=-1), states)

            return (s, e_i), states

        # Computing outputs: only the per-step outputs are needed, the last
        # output and the final states are discarded
        _, attn_outputs, _ = K.rnn(
            _step, decoder_inputs, [initial_state], constants=[encoder_outputs]
        )

        # attn_out => (batch_size, de_seq_len, de_hidden_size)
        # attn_energy => (batch_size, de_seq_len, en_seq_len)
        attn_out, attn_energy = attn_outputs

        return attn_out, attn_energy

    def compute_output_shape(self, input_shape):
        """ Outputs produced by the layer """
        # NOTE(review): the last dimension of the first shape is taken from the
        # decoder input (input_shape[1][2]); presumably the cell output width
        # (self._cell.units) is intended when the two differ — confirm.
        return [
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[1][2])),
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[0][1]))
        ]

    def get_config(self):
        """
        Return the constructor arguments together with the base layer config.

        The keys match the `__init__` parameter names ("cell", "units") so the
        layer can be rebuilt from the config; the wrapped cell is stored in
        serialized form.
        """
        config = super().get_config()
        config.update({
            "cell": tf.keras.layers.serialize(self._cell),
            "units": self.units,
        })
        return config

    @classmethod
    def from_config(cls, config):
        """ Rebuild the layer from the output of `get_config` """
        config = dict(config)
        # Configs written by the earlier get_config stored the cell under "_cell"
        if "_cell" in config:
            config["cell"] = config.pop("_cell")
        # Turn a serialized cell back into a layer object
        if isinstance(config["cell"], dict):
            config["cell"] = tf.keras.layers.deserialize(config["cell"])
        return super().from_config(config)