In [1]:
import tensorflow as tf
import numpy as np, cv2, io, os, re, string, time, datetime
import seaborn as sns, sklearn
import matplotlib.pyplot as plt
from keras.layers import (TextVectorization)

In [2]:
#@ Downloading datasets:
!wget https://www.manythings.org/anki/fra-eng.zip

--2025-01-10 16:21:45--  https://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7943074 (7.6M) [application/zip]
Saving to: ‘fra-eng.zip’


2025-01-10 16:21:48 (3.71 MB/s) - ‘fra-eng.zip’ saved [7943074/7943074]



In [3]:
!unzip '/content/fra-eng.zip' -d '/content/dataset' # -d flag specifies directories

Archive:  /content/fra-eng.zip
  inflating: /content/dataset/_about.txt  
  inflating: /content/dataset/fra.txt  


#### Data Preprocessing

In [4]:
text_dataset=tf.data.TextLineDataset('/content/dataset/fra.txt') #each line is treated as separate string

In [5]:
for i in text_dataset.take(3):
  print(i)

tf.Tensor(b'Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)', shape=(), dtype=string)
tf.Tensor(b'Go.\tMarche.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8090732 (Micsmithel)', shape=(), dtype=string)
tf.Tensor(b'Go.\tEn route !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8267435 (felix63)', shape=(), dtype=string)


In [6]:
#@ Setting up the Parameters:
VOCAB_SIZE=20000 #unique tokens from dataset, setting value 20000 for efficiency
ENGLISH_SEQUENCE_LENGTH=32 #max length of i/p sequence[in tokens]
FRENCH_SEQUENCE_LENGTH=32 #max len of o/p sequence[in tokens]
EMBEDDINGS_DIM=512 #size of vectors to represent tokens(as per paper)
BATCH_SIZE=128 #for data size processed during training

In [7]:
#@ for english word:
english_vectorize_layer=TextVectorization(
                      standardize='lower_and_strip_punctuation',
                      max_tokens=VOCAB_SIZE,
                      output_mode='int', #mapping wrt to the integer index
                      output_sequence_length=ENGLISH_SEQUENCE_LENGTH
)

#@ for french word:
french_vectorize_layer=TextVectorization(
                       standardize='lower_and_strip_punctuation',
                       max_tokens=VOCAB_SIZE,
                       output_mode='int',
                       output_sequence_length=FRENCH_SEQUENCE_LENGTH
)

In [8]:
def seperator(input_text):
  split_text=tf.strings.split(input_text, '\t')
  return {
      'input_1':split_text[0:1],
      'input_2':'starttoken' + split_text[1:2]
      }, split_text[1:2]+' endtoken'

In [9]:
text='hello\tprijal'
seperator(text)


({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'hello'], dtype=object)>,
  'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttokenprijal'], dtype=object)>},
 <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'prijal endtoken'], dtype=object)>)

In [10]:
#@ Initializing dataset:
init_dataset=text_dataset.map(seperator)

In [11]:
for i in init_dataset.take(3):
  print(i)


({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttokenVa !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Va ! endtoken'], dtype=object)>)
({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttokenMarche.'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Marche. endtoken'], dtype=object)>)
({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttokenEn route !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'En route ! endtoken'], dtype=object)>)


### Vocab Creation

In [12]:
english_training_data=init_dataset.map(lambda x, y:x['input_1'])
english_vectorize_layer.adapt(english_training_data)

french_training_data=init_dataset.map(lambda x, y:y)
french_vectorize_layer.adapt(french_training_data)


In [20]:
#@ Grouping and  Vectorization for training:
def vectorizer(inputs, output):
  return {'input_1':english_vectorize_layer(inputs['input_1']),
          'input_2':french_vectorize_layer(inputs['input_2'])}, french_vectorize_layer(output)

In [21]:
init_dataset

<_MapDataset element_spec=({'input_1': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'input_2': TensorSpec(shape=(None,), dtype=tf.string, name=None)}, TensorSpec(shape=(None,), dtype=tf.string, name=None))>

In [22]:
dataset=init_dataset.map(vectorizer)

In [23]:
for i in init_dataset.take(3):
  print(i)

({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttokenVa !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Va ! endtoken'], dtype=object)>)
({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttokenMarche.'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Marche. endtoken'], dtype=object)>)
({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttokenEn route !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'En route ! endtoken'], dtype=object)>)


In [25]:
for i in dataset.take(1):
  print(i)

({'input_1': <tf.Tensor: shape=(1, 32), dtype=int64, numpy=
array([[45,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])>, 'input_2': <tf.Tensor: shape=(1, 32), dtype=int64, numpy=
array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])>}, <tf.Tensor: shape=(1, 32), dtype=int64, numpy=
array([[103,   2,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0]])>)


In [26]:
dataset=dataset.shuffle(2048).unbatch().batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [27]:
NUM_BATCHES=int(200000/BATCH_SIZE)

In [28]:
#@ Training and testing split
train_dataset=dataset.take(int(0.9*NUM_BATCHES))
val_dataset=dataset.skip(int(0.9*NUM_BATCHES))

### Model Architecture

In [30]:
#@ Positional Encoding:
def PositionalEncoding(d_model, SEQUENCE_LENGTH):
  output=[]
  for pos in range(SEQUENCE_LENGTH):
    PE=np.zeros(d_model)
    for i in range(d_model):
      if i % 2 == 0: #even position, sine formula is used
        PE[i]=np.sin(pos/(10000**(2*i/d_model)))
      else:
        PE[i]=np.cos(pos/(10000**(2*i/d_model)))
    output.append(tf.expand_dims(PE, axis=0))
  out=tf.concat(output, axis=0)
  out=tf.expand_dims(out, axis=0)
  return tf.cast(out, dtype=tf.float32)


In [31]:
print(PositionalEncoding(512, 32))

tf.Tensor(
[[[ 0.0000000e+00  1.0000000e+00  0.0000000e+00 ...  1.0000000e+00
    0.0000000e+00  1.0000000e+00]
  [ 8.4147096e-01  5.6969500e-01  8.0196178e-01 ...  1.0000000e+00
    1.0746079e-08  1.0000000e+00]
  [ 9.0929741e-01 -3.5089520e-01  9.5814437e-01 ...  1.0000000e+00
    2.1492157e-08  1.0000000e+00]
  ...
  [-6.6363388e-01 -9.5558822e-01  9.6020764e-01 ...  1.0000000e+00
    3.1163626e-07  1.0000000e+00]
  [-9.8803163e-01 -7.8659910e-01  3.4962672e-01 ...  1.0000000e+00
    3.2238236e-07  1.0000000e+00]
  [-4.0403765e-01  5.9345119e-02 -5.4249090e-01 ...  1.0000000e+00
    3.3312844e-07  1.0000000e+00]]], shape=(1, 32, 512), dtype=float32)
