<a href="https://colab.research.google.com/github/rakibulhaque9954/Machine_Learning_Translation/blob/main/machine_translation_with_rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
import tensorflow as tf### models
import numpy as np### math computations
import matplotlib.pyplot as plt### plotting bar chart
import sklearn### machine learning library
import cv2## image processing
from sklearn.metrics import confusion_matrix, roc_curve### metrics
import seaborn as sns### visualizations
import datetime
import pathlib
import io
import os
import re
import string
import time
from numpy import random
import tensorflow_datasets as tfds
import tensorflow_probability as tfp
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import (Dense,Flatten,SimpleRNN,InputLayer,Conv1D,Bidirectional,GRU,LSTM,BatchNormalization,Dropout,Input, Embedding,TextVectorization)
from tensorflow.keras.losses import BinaryCrossentropy,CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.metrics import Accuracy,TopKCategoricalAccuracy, CategoricalAccuracy, SparseCategoricalAccuracy
from tensorflow.keras.optimizers import Adam
from google.colab import drive
from google.colab import files
from tensorboard.plugins import projector

# Dataset Preparation and Preprocessing

## Dataset Download

### Small dataset manythings.org

In [2]:
# -nc (no-clobber): skip the download when fra-eng.zip is already present, so
# re-running this cell does not leave numbered copies (fra-eng.zip.1, ...).
!wget -nc https://www.manythings.org/anki/fra-eng.zip

--2023-10-21 11:49:30--  https://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7757635 (7.4M) [application/zip]
Saving to: ‘fra-eng.zip’


2023-10-21 11:49:33 (3.88 MB/s) - ‘fra-eng.zip’ saved [7757635/7757635]



In [3]:
# -o: overwrite previously extracted files without prompting, so the cell can
# be re-run non-interactively (Restart & Run All).
!unzip -o '/content/fra-eng.zip' -d '/content/dataset/'


Archive:  /content/fra-eng.zip
  inflating: /content/dataset/_about.txt  
  inflating: /content/dataset/fra.txt  


### Kaggle Dataset

In [None]:
# Kaggle CLI setup; expects kaggle.json to be in the current working directory.
# %pip installs into the environment of the running kernel.
%pip install -q kaggle
# -p: do not fail if ~/.kaggle already exists (cell re-run).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Owner-only permissions on the credentials file; uses the same ~ path as the
# lines above instead of a hard-coded /root home directory.
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d dhruvildave/en-fr-translation-dataset
# -o: overwrite previously extracted files without prompting on re-run.
!unzip -o "/content/en-fr-translation-dataset.zip" -d "/content/dataset/"

In [None]:
# Read the Kaggle CSV as a dataset of two-field records, both parsed as strings.
# NOTE(review): no `header=` argument is passed — confirm whether en-fr.csv
# begins with a header row that should be skipped.
# NOTE: the name `dataset` is assigned again further down, from
# `split_dataset.map(vectorizer)`.
dataset = tf.data.experimental.CsvDataset(
    filenames="/content/dataset/en-fr.csv",
    record_defaults=[tf.string, tf.string],
)

## Data Processing

In [64]:
# One dataset element per line of the extracted sentence-pair file.
text_dataset = tf.data.TextLineDataset(filenames='/content/dataset/fra.txt')



In [65]:
# Peek at the first five raw lines to see the field layout.
for raw_line in text_dataset.take(5):
  print(raw_line)



tf.Tensor(b'Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)', shape=(), dtype=string)
tf.Tensor(b'Go.\tMarche.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8090732 (Micsmithel)', shape=(), dtype=string)
tf.Tensor(b'Go.\tEn route !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8267435 (felix63)', shape=(), dtype=string)
tf.Tensor(b'Go.\tBouge !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #9022935 (Micsmithel)', shape=(), dtype=string)
tf.Tensor(b'Hi.\tSalut !\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #509819 (Aiji)', shape=(), dtype=string)


In [66]:
# Configuration constants for text vectorisation.
VOCAB_SIZE = 20_000  # passed as max_tokens to each TextVectorization layer
EMBEDDING_DIM = 300  # not referenced by the preprocessing cells that follow
# Every sentence is vectorised to a fixed number of token ids, set explicitly
# for the input (English) and output (French) sides.
ENGLISH_SEQUENCE_LENGTH = FRENCH_SEQUENCE_LENGTH = 64





In [67]:
# English-side text -> integer-id layer: lowercases and strips punctuation,
# keeps at most VOCAB_SIZE tokens, and emits ENGLISH_SEQUENCE_LENGTH ids per
# sentence. The vocabulary is empty until `.adapt()` is called on it.
english_vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    standardize='lower_and_strip_punctuation',
    output_mode='int',
    output_sequence_length=ENGLISH_SEQUENCE_LENGTH,
)




In [68]:
# French-side text -> integer-id layer, configured like the English one but
# with FRENCH_SEQUENCE_LENGTH ids per sentence.
# Punctuation stripping also removes the square brackets of the marker tokens,
# so '[starttoken]' / '[endtoken]' enter the vocabulary as 'starttoken' /
# 'endtoken'.
french_vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    standardize='lower_and_strip_punctuation',
    output_mode='int',
    output_sequence_length=FRENCH_SEQUENCE_LENGTH,
)

In [69]:
def selector(input_text):
  """Turn one raw tab-separated line into a training example.

  Args:
    input_text: a line whose first field is the English sentence and whose
      second field is the French sentence; any further fields are ignored.

  Returns:
    A pair ``(inputs, target)``:
      * ``inputs['input_1']`` - the English field,
      * ``inputs['input_2']`` - the French field prefixed with '[starttoken] ',
      * ``target`` - the French field suffixed with ' [endtoken]'.
    The fields are taken with the slices 0:1 and 1:2 rather than by index, so
    each value keeps a leading dimension instead of being a scalar.
  """
  fields = tf.strings.split(input_text, sep='\t')
  english = fields[0 : 1]
  french = fields[1 : 2]
  inputs = {'input_1' : english, 'input_2' : '[starttoken] ' + french}
  target = french + ' [endtoken]'
  return inputs, target





In [70]:
# Apply `selector` to every raw line to get (inputs dict, target) text examples.
split_dataset = text_dataset.map(map_func=selector)




In [71]:
# Show a single (inputs, target) example.
for example in split_dataset.take(1):
  print(example)

({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'[starttoken] Va !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Va ! [endtoken]'], dtype=object)>)


**The sample above shows how each line has been separated into two inputs and one output:**
- Input 1 - `'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>`
- Input 2 - `'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'[starttoken] Va !'], dtype=object)>`
- Output - `<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Va ! [endtoken]'], dtype=object)>`

***The inputs and output have been processed for training: `split_dataset` is used for training, while `init_dataset` (created below) is used only for vocabulary creation.***

In [72]:
def selector(input_text):
  """Build the (English, French) text pair used to create the vocabularies.

  The French text carries both markers, '[starttoken] ' in front and
  ' [endtoken]' behind, i.e. it combines the decoder input and the output.

  NOTE: this reuses the name of the `selector` defined earlier in the
  notebook; once this cell has run, any cell calling `selector` gets this
  version.

  Args:
    input_text: a tab-separated line; field 0 is English, field 1 is French.

  Returns:
    ``(english, french_with_markers)``, each taken as a one-field slice.
  """
  fields = tf.strings.split(input_text, sep='\t')
  english = fields[0 : 1]
  french = fields[1 : 2]
  return english, '[starttoken] ' + french + ' [endtoken]'

In [73]:
# (English, French-with-markers) text pairs; used below to fit the vocabularies.
init_dataset = text_dataset.map(map_func=selector)


In [74]:
# Show the first five vocabulary-building pairs.
for pair in init_dataset.take(5):
  print(pair)


(<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'[starttoken] Va ! [endtoken]'], dtype=object)>)
(<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'[starttoken] Marche. [endtoken]'], dtype=object)>)
(<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'[starttoken] En route ! [endtoken]'], dtype=object)>)
(<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'[starttoken] Bouge ! [endtoken]'], dtype=object)>)
(<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Hi.'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'[starttoken] Salut ! [endtoken]'], dtype=object)>)


In [75]:
# Keep only the first element of each pair (the English text) and build the
# English vocabulary from it.
english_training_data = init_dataset.map(lambda english, _french: english)
english_vectorize_layer.adapt(english_training_data)




In [76]:
# Keep only the second element of each pair (the French text with both marker
# tokens) and build the French vocabulary from it.
french_training_data = init_dataset.map(lambda _english, french: french)
french_vectorize_layer.adapt(french_training_data)

In [77]:
def vectorizer(inputs, output):
  """Convert one text example into integer token ids.

  Args:
    inputs: dict with the keys 'input_1' (English text) and 'input_2'
      (French text starting with the start marker).
    output: French text ending with the end marker.

  Returns:
    ``(features, target)``: ``features`` holds both inputs together under the
    same keys - 'input_1' passed through `english_vectorize_layer`, 'input_2'
    through `french_vectorize_layer` - and ``target`` is ``output`` passed
    through `french_vectorize_layer`.
  """
  encoder_ids = english_vectorize_layer(inputs['input_1'])
  decoder_ids = french_vectorize_layer(inputs['input_2'])
  target_ids = french_vectorize_layer(output)
  return {'input_1' : encoder_ids, 'input_2' : decoder_ids}, target_ids




In [78]:
# Replace the text in every training example with integer token ids.
dataset = split_dataset.map(map_func=vectorizer)





In [80]:
# Show one vectorised example.
for example in dataset.take(1):
  print(example)



({'input_1': <tf.Tensor: shape=(1, 64), dtype=int64, numpy=
array([[44,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])>, 'input_2': <tf.Tensor: shape=(1, 64), dtype=int64, numpy=
array([[  2, 103,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0]])>}, <tf.Tensor: shape=(1, 64), dtype=int64, numpy=
array([[103,   3,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,

In [82]:
# Map the ids seen in the vectorised sample back to tokens as a sanity check.
# Each vocabulary list is fetched once and then indexed.
english_vocab = english_vectorize_layer.get_vocabulary()
french_vocab = french_vectorize_layer.get_vocabulary()
print(f'ENG: {english_vocab[44]}')
for token_id in (2, 103, 3):
  print(f'FR: {french_vocab[token_id]}')


ENG: go
FR: starttoken
FR: va
FR: endtoken


# Modeling

## Seq2Seq Model
<h4>Model Diagram</h4>
<img src='https://miro.medium.com/max/942/1*KtWwvLK-jpGPSnj3tStg-Q.png'>