## In this tutorial, we'll use three different English translations of the same work, Homer's Iliad, and train a model to identify the translator given a single line of text.

In [None]:
%pip install tensorflow-gpu

In [None]:
import tensorflow as tf 
import os
import tensorflow_datasets as tfds

In [None]:
# Base URL of the hosted text files; each file name below is appended to it.
DIRECTORY_URL = (
    'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
)

# One text file per translator.
FILE_NAMES = [
    'cowper.txt',
    'derby.txt',
    'butler.txt',
]

In [None]:
# Fetch every file; `text_dir` ends up holding the local path of the last
# file returned by `get_file`.
for name in FILE_NAMES:
  text_dir = tf.keras.utils.get_file(fname=name, origin=DIRECTORY_URL + name)

# The directory containing that last file is used as the data directory.
parent_dir = os.path.dirname(text_dir)
print(parent_dir)

/root/.keras/datasets


# Load text into datasets
Iterate through the files, loading each one into its own dataset.

Each example needs to be individually labeled, so use `tf.data.Dataset.map` to apply a labeler function to each one. This will iterate over every example in the dataset, returning (`example, label`) pairs.

In [None]:
def labeler(example, label):
  """Pair an example with its label, casting the label to `tf.int64`.

  Args:
    example: The example to label; returned unchanged.
    label: The label value to attach; passed through `tf.cast`.

  Returns:
    An `(example, label)` tuple whose label is a `tf.int64` tensor.
  """
  int_label = tf.cast(label, tf.int64)
  return example, int_label

labeled_data_sets = []

# The position of each file in FILE_NAMES is used as the label for all of
# that file's lines.
for i, file_name in enumerate(FILE_NAMES):
  file_path = os.path.join(parent_dir, file_name)
  labeled_dataset = tf.data.TextLineDataset(file_path).map(
      lambda ex: labeler(ex, i))
  labeled_data_sets.append(labeled_dataset)

In [None]:
# Inspect the list of labeled datasets built for the files in FILE_NAMES.
print(labeled_data_sets)

[<MapDataset shapes: ((), ()), types: (tf.string, tf.int64)>, <MapDataset shapes: ((), ()), types: (tf.string, tf.int64)>, <MapDataset shapes: ((), ()), types: (tf.string, tf.int64)>]


Combine these labeled datasets into a single dataset, and shuffle it.

In [None]:
# Size of the buffer used when shuffling datasets.
BUFFER_SIZE = 50_000
# Number of examples per padded batch.
BATCH_SIZE = 64
# Number of examples taken from the front of the encoded data as the test split.
TAKE_SIZE = 5_000

In [None]:
# Chain the per-file datasets end to end, starting from the first one.
all_labeled_data = labeled_data_sets[0]
for labeled_dataset in labeled_data_sets[1:]:
  all_labeled_data = all_labeled_data.concatenate(labeled_dataset)

# Shuffle once and keep that order on every pass over the data
# (reshuffle_each_iteration=False), so anything derived from this dataset
# by position sees the same elements each time.
all_labeled_data = all_labeled_data.shuffle(
    buffer_size=BUFFER_SIZE,
    reshuffle_each_iteration=False,
)

In [None]:
# Peek at the first five (text, label) pairs of the shuffled dataset.
for text, label in all_labeled_data.take(5):
  print((text, label))

(<tf.Tensor: shape=(), dtype=string, numpy=b'The needful strength, or, scatheless yet, withdraw;'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'From his detested throat, but all around'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'occasion. Iris appears to Achilles by command of Juno, and orders him'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Companions of chill fear, from heaven infused,'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'pain and grant me strength both to cheer on the Lycians and to fight'>, <tf.Tensor: shape=(), dtype=int64, numpy=2>)


# Encode text lines as numbers
Machine learning models work on numbers, not words, so the string values need to be converted into lists of numbers. To do that, map each unique word to a unique integer.

# Build vocabulary
First, build a vocabulary by tokenizing the text into a collection of individual unique words. There are a few ways to do this in both TensorFlow and Python. For this tutorial:

1. Iterate over each example's `numpy` value.
2. Use `tfds.features.text.Tokenizer` to split it into tokens.
3. Collect these tokens into a Python set, to remove duplicates.
4. Get the size of the vocabulary for later use.

In [None]:
# Newer tensorflow_datasets releases expose the text utilities under
# `tfds.deprecated.text`; older releases only provide `tfds.features.text`.
# Resolve whichever one is available so the cell runs on both.
try:
  text_api = tfds.deprecated.text
except AttributeError:
  text_api = tfds.features.text

tokenizer = text_api.Tokenizer()

vocabulary_set = set()

# Iterate the dataset eagerly; only the text component is needed here.
for text_tensor, _ in all_labeled_data:
  some_tokens = tokenizer.tokenize(text_tensor.numpy())
  # A set keeps each distinct token exactly once.
  vocabulary_set.update(some_tokens)

In [None]:
# Show a small sorted sample instead of the entire set: the vocabulary holds
# thousands of tokens and printing all of them floods the cell output.
print(sorted(vocabulary_set)[:10])
print("Length of vocabulary set",len(vocabulary_set))

Length of vocabulary set 17178


# Encode vocabulary
Create an encoder from the vocabulary set, then use it to convert each line of text into a list of integers.

In [None]:
# Newer tensorflow_datasets releases expose the text encoders under
# `tfds.deprecated.text`; older releases only provide `tfds.features.text`.
# Resolve whichever one is available so the cell runs on both.
try:
  text_api = tfds.deprecated.text
except AttributeError:
  text_api = tfds.features.text

# Build the encoder from the collected vocabulary.
encoder = text_api.TokenTextEncoder(vocabulary_set)

In [None]:
def encode(text_tensor, label):
  """Encode one line of text with the module-level `encoder`.

  Args:
    text_tensor: Tensor holding the text; its `.numpy()` value is encoded.
    label: The label for this line; returned unchanged.

  Returns:
    An `(encoded_text, label)` tuple.
  """
  token_ids = encoder.encode(text_tensor.numpy())
  return token_ids, label

def encode_map_fn(text, label):
  """Wrap `encode` in `tf.py_function` so it can be used with `Dataset.map`.

  `encode` calls `.numpy()` on its input, so it is routed through
  `tf.py_function` rather than being called directly on the mapped tensors.

  Args:
    text: Scalar string tensor holding one line of text.
    label: Label tensor for that line.

  Returns:
    An `(encoded_text, label)` tuple of `tf.int64` tensors with static
    shapes `[None]` and `[]` respectively.
  """
  encoded_text, label = tf.py_function(
      encode, inp=[text, label], Tout=(tf.int64, tf.int64))

  # `tf.py_function` does not set static shapes on the tensors it returns.
  # Declare them explicitly so later stages see a variable-length vector of
  # ids and a scalar label.
  encoded_text.set_shape([None])
  label.set_shape([])

  return encoded_text, label

# Encode every (text, label) pair of the labeled dataset via encode_map_fn.
all_encoded_data = all_labeled_data.map(encode_map_fn)

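Split the dataset into training and test sets. Because the encoded lines have different lengths, use `padded_batch` so that all examples in a batch are padded to the same size.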

In [None]:
# Everything after the first TAKE_SIZE examples is used for training.
# Each batch is padded along the token dimension; labels are scalars.
train_data = (
    all_encoded_data
    .skip(TAKE_SIZE)
    .shuffle(BUFFER_SIZE)
    .padded_batch(BATCH_SIZE, padded_shapes=([-1], []))
)

# The first TAKE_SIZE examples form the test set, batched the same way.
test_data = (
    all_encoded_data
    .take(TAKE_SIZE)
    .padded_batch(BATCH_SIZE, padded_shapes=([-1], []))
)

Now calculate the vocabulary size, adding 1 to account for the padding value introduced by `padded_batch`.

In [None]:
# One slot per vocabulary token, plus one extra for the padding value.
# NOTE(review): the encoder may also emit an out-of-vocabulary id for words
# not in `vocabulary_set` - confirm this size still covers it before encoding
# text from outside this dataset.
vocab_size = 1 + len(vocabulary_set)

Build the model

In [None]:
# Model stem: an embedding of the token ids followed by a bidirectional LSTM.
# More layers are appended to this Sequential model afterwards.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=64)),
])

In [None]:
# One or more dense layers.
# Edit the list in the `for` line to experiment with layer sizes.
for units in [64, 64]:
  model.add(tf.keras.layers.Dense(units, activation='relu'))

# Output layer. The first argument is the number of labels: there is one
# label per entry in FILE_NAMES, so derive it from that list instead of
# hard-coding it.
model.add(tf.keras.layers.Dense(len(FILE_NAMES), activation='softmax'))

In [None]:
# Integer labels with a softmax output, so use the sparse categorical loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

Now train the model on the training data, using the test data for validation.

In [None]:
# Train for 10 epochs, reporting metrics on `test_data` after each epoch.
model.fit(train_data, validation_data=test_data, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f85102953c8>

In [None]:
# Final loss and accuracy on the test batches.
eval_loss, eval_acc = model.evaluate(test_data)

summary_template = '\nEval loss: {:.3f}, Eval accuracy: {:.3f}'
print(summary_template.format(eval_loss, eval_acc))


Eval loss: 0.793, Eval accuracy: 0.829
