# Without Hugging Face Transformers

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import collections
import os
import re
import keras.backend as K
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [None]:
os.listdir('/kaggle/input/language-translation-englishfrench')

In [None]:
# !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip -q glove.6B.zip

In [None]:
# Load the sentence pairs, wrap every sentence in <SOS>/<EOS> markers,
# then shuffle the rows and renumber the index.
df = pd.read_csv("/kaggle/input/language-translation-englishfrench/eng_-french.csv")
for column in ("French words/sentences", "English words/sentences"):
    df[column] = "<SOS> " + df[column] + " <EOS>"
df = df.sample(frac=1).reset_index(drop=True)
df

In [None]:
# Whitespace-separated token count of every sentence (the <SOS>/<EOS> markers
# are included). Each column is computed from the language it is named after.
df["French word numbers"] = df["French words/sentences"].str.split().str.len()
df["English word numbers"] = df["English words/sentences"].str.split().str.len()

In [None]:
df.head()

In [None]:
# Box plots of the two per-sentence word-count columns, drawn side by side.
data_to_plot = df[["French word numbers", "English word numbers"]]
sns.boxplot(data=data_to_plot)

In [None]:
# Histogram of both word-count columns (15 bins); the x-axis is limited to 0-40.
plt.figure(figsize=(10, 6))
plt.hist(data_to_plot, bins=15, alpha=1)
plt.xlim(0, 40)

In [None]:
# Shorthand handles on the two sentence columns of df.
eng = df['English words/sentences']
fr = df['French words/sentences']

In [None]:
def _report_vocabulary(language, sentences):
    """Print word statistics for one corpus and return its word counter.

    Args:
        language: label inserted into the printed messages (e.g. 'English').
        sentences: iterable of strings; each is split on whitespace.

    Returns:
        collections.Counter mapping every word to its number of occurrences.
    """
    counter = collections.Counter(
        word for sentence in sentences for word in sentence.split()
    )
    # The total word count is the sum of the per-word counts, so the corpus
    # does not have to be flattened a second time.
    print('{} {} words.'.format(sum(counter.values()), language))
    print('{} unique {} words.'.format(len(counter), language))
    print('10 Most common words in the {} dataset:'.format(language))
    top_words = [word for word, _ in counter.most_common(10)]
    print('"' + '" "'.join(top_words) + '"')
    return counter


english_words_counter = _report_vocabulary('English', eng)
print()
french_words_counter = _report_vocabulary('French', fr)

In [None]:
def tokenize(x):
    """Fit a new ``Tokenizer`` on ``x`` and encode ``x`` with it.

    Returns:
        A ``(sequences, tokenizer)`` pair: the encoded texts and the fitted
        tokenizer that produced them.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(x)
    sequences = fitted.texts_to_sequences(x)
    return sequences, fitted

In [None]:
def pad(x, length=14):
    """Post-pad the sequences in ``x`` to a common length.

    Args:
        x: list of integer sequences.
        length: target length; pass ``None`` to use the longest sequence in ``x``.

    Returns:
        The result of ``pad_sequences`` with ``padding='post'``.
    """
    target_len = max(len(seq) for seq in x) if length is None else length
    return pad_sequences(x, maxlen=target_len, padding='post')

In [None]:
def clean_text(text):
    """Strip punctuation and symbols from every sentence in ``text``.

    Letters (including accented ones such as "é" or "ç"), digits and
    whitespace are kept; everything else, underscores included, is removed.

    Args:
        text: iterable of strings.

    Returns:
        A list with one cleaned string per input sentence.
    """
    # \w is Unicode-aware for str patterns, so accented French letters
    # survive; "_" is part of \w and therefore has to be removed explicitly.
    unwanted = re.compile(r'[^\w\s]|_')
    return [unwanted.sub('', sent) for sent in text]

In [None]:
def preprocess(x, y):
    """Tokenize and pad two parallel corpora.

    Each corpus gets its own tokenizer; both are padded with ``pad``'s
    default length.

    Returns:
        ``(padded_x, padded_y, x_tokenizer, y_tokenizer)``.
    """
    x_ids, x_tk = tokenize(x)
    y_ids, y_tk = tokenize(y)
    return pad(x_ids), pad(y_ids), x_tk, y_tk

In [None]:
# Tokenize and pad both corpora: English is passed as x, French as y.
preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer = preprocess(eng, fr)

In [None]:
preproc_english_sentences.shape,preproc_french_sentences.shape

In [None]:
preproc_english_sentences[0]

In [None]:
# Padded sequence lengths (second axis of the arrays) and vocabulary sizes
# (number of entries in each tokenizer's word index).
max_english_sequence_length = preproc_english_sentences.shape[1]
max_french_sequence_length = preproc_french_sentences.shape[1]
english_vocab_size = len(english_tokenizer.word_index)
french_vocab_size = len(french_tokenizer.word_index)

for label, value in (
    ("Max English sentence length:", max_english_sequence_length),
    ("Max French sentence length:", max_french_sequence_length),
    ("English vocabulary size:", english_vocab_size),
    ("French vocabulary size:", french_vocab_size),
):
    print(label, value)

In [None]:
class positional_encoding(tf.keras.layers.Layer):
    """Add a fixed sinusoidal position table to a batch of embeddings.

    The table is built once in ``__init__``: even feature indices hold
    ``sin(pos / 10000**(2*i / embedding_size))`` and odd feature indices hold
    the ``cos`` of the same angle.

    Args:
        max_sentence_len: number of positions in the table. ``call`` adds the
            whole table, so inputs must broadcast against
            ``(1, max_sentence_len, embedding_size)``.
        embedding_size: number of features per position (even or odd).
        **kwargs: forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, max_sentence_len, embedding_size, **kwargs):
        super().__init__(**kwargs)

        # Positions as a column vector, frequency indices as a row vector, so
        # that dividing them broadcasts to a (positions, frequencies) grid.
        self.pos = np.arange(max_sentence_len).reshape(-1, 1)
        # One frequency per even feature slot; an integer count is passed to
        # arange and the float dtype is requested explicitly.
        self.i = np.arange((embedding_size + 1) // 2, dtype=np.float64).reshape(1, -1)
        angles = self.pos / np.power(10000, (2 * self.i / embedding_size))

        self.pos_emb = np.empty((1, max_sentence_len, embedding_size))
        self.pos_emb[:, :, 0::2] = np.sin(angles)
        # With an odd embedding_size there is one fewer odd slot than even
        # slots, so the last frequency is dropped for the cosine half.
        self.pos_emb[:, :, 1::2] = np.cos(angles[:, :embedding_size // 2])
        self.positional_embedding = tf.cast(self.pos_emb, dtype=tf.float32)

    def call(self, inputs):
        """Return ``inputs`` plus the precomputed position table."""
        return inputs + self.positional_embedding

In [None]:
class paddding_mask(tf.keras.layers.Layer):
    """Turn token ids into a float mask: 1.0 where the id is non-zero, 0.0 where it is 0."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, inputs):
        """Return the mask with a new axis inserted at position 1."""
        is_token = tf.math.not_equal(inputs, 0)
        keep = tf.cast(is_token, tf.float32)
        return keep[:, tf.newaxis, :]

In [None]:
def test():
    """Print the internals and outputs of positional_encoding and paddding_mask."""
    encoding = positional_encoding(5, 10)
    for table in (encoding.pos, encoding.i, encoding.pos_emb):
        print(table)
    print(encoding(np.ones((1, 5, 10))))

    masker = paddding_mask()
    print(masker([[1, 2, 3, 4, 0, 0, 0, 1]]))


test()

In [None]:
paddding_mask()([[1,2,3,0]])

In [None]:
class create_look_ahead_mask(tf.keras.layers.Layer):
    """Build a lower-triangular matrix of ones of shape (1, n, n)."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, sequence_length):
        """Keep the diagonal and everything below it; zero the upper triangle."""
        ones = tf.ones((1, sequence_length, sequence_length))
        return tf.linalg.band_part(ones, -1, 0)

In [None]:
create_look_ahead_mask()(5)

In [None]:
class input_layer_encoder(tf.keras.layers.Layer):
    """Encoder front end: embeds token ids, adds positions, and builds the padding mask.

    Args:
        max_sentence_len: sequence length given to the embedding and the position table.
        embedding_size: width of the embedding vectors.
        vocab_size: number of rows in the embedding table.
    """

    def __init__(self, max_sentence_len, embedding_size, vocab_size, **kwargs):
        super().__init__(**kwargs)
        self.paddding_mask = paddding_mask()
        self.embedding = tf.keras.layers.Embedding(
            vocab_size,
            embedding_size,
            input_length=max_sentence_len,
            input_shape=(max_sentence_len,),
        )
        self.positional_encoding = positional_encoding(max_sentence_len, embedding_size)

    def call(self, inputs):
        """Return ``(position-encoded embeddings, padding mask)`` for ``inputs``."""
        padding = self.paddding_mask(inputs)
        embedded = self.embedding(inputs)
        return self.positional_encoding(embedded), padding

In [None]:
def test_e():
    """Print the encoder front end's output for one sample and its position table."""
    front_end = input_layer_encoder(5, 10, 10)
    sample = np.array([[6, 7, 8, 1, 4]])
    print(front_end(sample))
    print(front_end.positional_encoding.positional_embedding)


test_e()

In [None]:
class input_layer_decoder(tf.keras.layers.Layer):
    """Decoder front end: embeds token ids, adds positions, and builds the self-attention mask.

    The returned mask is the bitwise AND of the lower-triangular look-ahead
    mask and the padding mask, both cast to int8.

    Args:
        max_sentence_len: sequence length; also the side of the look-ahead mask.
        embedding_size: width of the embedding vectors.
        vocab_size: number of rows in the embedding table.
    """

    def __init__(self, max_sentence_len, embedding_size, vocab_size, **kwargs):
        super().__init__(**kwargs)
        self.paddding_mask = paddding_mask()
        self.embedding = tf.keras.layers.Embedding(
            vocab_size,
            embedding_size,
            input_length=max_sentence_len,
            input_shape=(max_sentence_len,),
        )
        self.positional_encoding = positional_encoding(max_sentence_len, embedding_size)
        self.look_ahead_mask = create_look_ahead_mask()
        self.max_sentence_len = max_sentence_len

    def call(self, inputs):
        """Return ``(position-encoded embeddings, combined mask)`` for ``inputs``."""
        padding = self.paddding_mask(inputs)
        embedded = self.positional_encoding(self.embedding(inputs))

        causal = self.look_ahead_mask(self.max_sentence_len)
        # A position stays visible only if both masks allow it.
        combined = tf.bitwise.bitwise_and(
            tf.cast(causal, dtype=np.int8),
            tf.cast(padding, dtype=np.int8),
        )
        return embedded, combined

In [None]:
def test_d():
    """Print the decoder front end's output for a single all-ones sequence."""
    front_end = input_layer_decoder(5, 10, 10)
    sample = np.ones((1, 5))
    print(front_end(sample))


test_d()

In [None]:
class Encoder_layer(tf.keras.layers.Layer):
    """One encoder block: self-attention and a feed-forward stack, each followed by add & norm.

    Args:
        embedding_size: feature width; used as the attention ``key_dim`` and
            as the output width of the feed-forward stack.
        heads_num: number of attention heads.
        dense_num: width of the three hidden feed-forward layers.
        dropout_rate: rate for the attention dropout and both Dropout layers.
    """

    def __init__(self, embedding_size, heads_num, dense_num, dropout_rate=0.0, **kwargs):
        super().__init__(**kwargs)

        self.multi_attention = tf.keras.layers.MultiHeadAttention(
            num_heads=heads_num,
            key_dim=embedding_size,
            dropout=dropout_rate,
        )
        self.Dropout = tf.keras.layers.Dropout(dropout_rate)

        # Three hidden ReLU layers, a ReLU projection back to embedding_size,
        # then dropout.
        hidden = [tf.keras.layers.Dense(dense_num, activation="relu") for _ in range(3)]
        self.ff = tf.keras.Sequential(hidden + [
            tf.keras.layers.Dense(embedding_size, activation="relu"),
            tf.keras.layers.Dropout(dropout_rate),
        ])

        self.add = tf.keras.layers.Add()
        self.norm1 = tf.keras.layers.LayerNormalization()
        self.norm2 = tf.keras.layers.LayerNormalization()

    def call(self, inputs, mask, training):
        """Apply the block to ``inputs`` using ``mask`` as the attention mask."""
        attended = self.multi_attention(inputs, inputs, inputs, mask)
        normed = self.norm1(self.add([inputs, attended]))

        transformed = self.Dropout(self.ff(normed), training=training)
        return self.norm2(self.add([transformed, normed]))

In [None]:
class Encoder(tf.keras.layers.Layer):
    """Encoder front end followed by ``num_of_encoders`` encoder blocks.

    After every block, the block's input is added back onto its output.
    """

    def __init__(self, max_sentence_len, embedding_size, vocab_size,
                 heads_num, dense_num, num_of_encoders, **kwargs):
        super().__init__(**kwargs)
        self.add = tf.keras.layers.Add()
        self.input_layer = input_layer_encoder(max_sentence_len, embedding_size, vocab_size)
        self.encoder_layer = [
            Encoder_layer(embedding_size, heads_num, dense_num)
            for _ in range(num_of_encoders)
        ]
        self.num_layers = num_of_encoders

    def call(self, inputs, training):
        """Return ``(encoded sequence, padding mask)`` for the token ids in ``inputs``."""
        hidden, mask = self.input_layer(inputs)
        for block in self.encoder_layer:
            # Skip connection around the whole block.
            hidden = self.add([hidden, block(hidden, mask, training)])
        return hidden, mask

In [None]:
class decoder_layer(tf.keras.layers.Layer):
    """One decoder block: masked self-attention, attention over the encoder output, feed-forward.

    Each of the three stages is followed by a residual add and layer
    normalisation.

    Args:
        embedding_size: feature width; used as the attention ``key_dim`` and
            as the output width of the feed-forward stack.
        heads_num: number of attention heads.
        dense_num: width of the three hidden feed-forward layers.
        dropout_rate: rate for the attention dropout and both Dropout layers.
    """

    def __init__(self, embedding_size, heads_num, dense_num, dropout_rate=0.0, **kwargs):
        super().__init__(**kwargs)

        attention_config = dict(
            num_heads=heads_num,
            key_dim=embedding_size,
            dropout=dropout_rate,
        )
        self.masked_mha = tf.keras.layers.MultiHeadAttention(**attention_config)
        self.multi_attention = tf.keras.layers.MultiHeadAttention(**attention_config)

        hidden = [tf.keras.layers.Dense(dense_num, activation="relu") for _ in range(3)]
        self.ff = tf.keras.Sequential(hidden + [
            tf.keras.layers.Dense(embedding_size, activation="relu"),
            tf.keras.layers.Dropout(dropout_rate),
        ])

        self.Dropout = tf.keras.layers.Dropout(dropout_rate)
        self.add = tf.keras.layers.Add()

        self.norm1 = tf.keras.layers.LayerNormalization()
        self.norm2 = tf.keras.layers.LayerNormalization()
        self.norm3 = tf.keras.layers.LayerNormalization()

    def call(self, inputs, encoder_output, enc_mask, look_head_mask, training):
        """Apply the block; the attention scores are requested but not used."""
        self_attended, _ = self.masked_mha(
            inputs, inputs, inputs, look_head_mask, return_attention_scores=True)
        query = self.norm1(self.add([inputs, self_attended]))

        cross_attended, _ = self.multi_attention(
            query, encoder_output, encoder_output, enc_mask, return_attention_scores=True)
        context = self.norm2(self.add([query, cross_attended]))

        transformed = self.Dropout(self.ff(context), training=training)
        return self.norm3(self.add([transformed, context]))

In [None]:
class Decoder(tf.keras.layers.Layer):
    """Decoder front end followed by ``num_of_decoders`` decoder blocks.

    After every block, the block's input is added back onto its output.
    """

    def __init__(self, max_sentence_len, embedding_size, vocab_size,
                 heads_num, dense_num, num_of_decoders, **kwargs):
        super().__init__(**kwargs)
        self.add = tf.keras.layers.Add()
        self.input_layer = input_layer_decoder(max_sentence_len, embedding_size, vocab_size)
        self.decoder_layer = [
            decoder_layer(embedding_size, heads_num, dense_num)
            for _ in range(num_of_decoders)
        ]
        self.num_layers = num_of_decoders

    def call(self, inputs, encoder_output, enc_mask, training):
        """Return the decoded sequence for the target token ids in ``inputs``."""
        hidden, look_head_mask = self.input_layer(inputs)
        for block in self.decoder_layer:
            block_out = block(hidden, encoder_output, enc_mask, look_head_mask, training)
            # Skip connection around the whole block.
            hidden = self.add([hidden, block_out])
        return hidden

In [None]:
class transformer(tf.keras.Model):
    """Encoder-decoder model that returns a probability distribution over the target vocabulary.

    Args:
        max_sentence_len_1: padded length of the encoder (source) sequences.
        max_sentence_len_2: padded length of the decoder (target) sequences.
        embedding_size: embedding width shared by encoder and decoder.
        vocab_size1: source vocabulary size (encoder embedding rows).
        vocab_size2: target vocabulary size (decoder embedding rows and
            width of the output distribution).
        heads_num: number of attention heads per block.
        dense_num: hidden width of the feed-forward stacks.
        num_of_encoders_decoders: number of encoder blocks and of decoder blocks.
    """

    def __init__(self,
                 max_sentence_len_1=None, max_sentence_len_2=None, embedding_size=None,
                 vocab_size1=None, vocab_size2=None,
                 heads_num=None, dense_num=None, num_of_encoders_decoders=None):
        super().__init__()

        self.Encoder = Encoder(max_sentence_len_1, embedding_size, vocab_size1,
                               heads_num, dense_num, num_of_encoders_decoders)
        self.Decoder = Decoder(max_sentence_len_2, embedding_size, vocab_size2,
                               heads_num, dense_num, num_of_encoders_decoders)
        # Plain linear projection: a ReLU here would clip every negative logit
        # to zero, so the softmax could never push an unlikely token below the
        # probability of a zero logit.
        self.Final_layer = tf.keras.layers.Dense(vocab_size2)
        self.softmax = tf.keras.layers.Softmax(axis=-1)

    def call(self, inputs, training=None):
        """Run the model.

        Args:
            inputs: pair ``(source_token_ids, target_token_ids)``.
            training: forwarded to the encoder and decoder so their dropout
                layers receive the training flag explicitly.

        Returns:
            Softmax probabilities over ``vocab_size2`` for every target position.
        """
        input_sentence, output_sentence = inputs
        enc_output, enc_mask = self.Encoder(input_sentence, training=training)
        dec_output = self.Decoder(output_sentence, enc_output, enc_mask, training=training)
        return self.softmax(self.Final_layer(dec_output))

In [None]:
# French -> English model: the encoder uses the French vocabulary (vocab_size1),
# the decoder and output layer use the English one (vocab_size2).
# max_sentence_len_2 is 13 because the decoder is fed the 14-token English
# sequences with their last position removed.
# NOTE(review): the +1 on the vocabulary sizes presumably leaves room for
# index 0, the value pad() fills with - confirm.
tran=transformer(max_sentence_len_1=14,
                     max_sentence_len_2=13,
                     embedding_size=300,
                     vocab_size1=french_vocab_size+1,
                     vocab_size2=english_vocab_size+1,
                     heads_num=5,
                     dense_num=512,
                     num_of_encoders_decoders=2)

In [None]:
# Forward pass on one pair: French tokens for the encoder, English tokens
# without the last position for the decoder.
tran((preproc_french_sentences[:1],preproc_english_sentences[:1,:-1]))

In [None]:
# The model outputs softmax probabilities, so the loss keeps its default of
# expecting probabilities rather than logits.
tran.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
             optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
             metrics=["accuracy"])

In [None]:
# Teacher forcing: the decoder input is the English sequence without its last
# token, and the target is the same sequence shifted left by one position
# (with a trailing axis added).
tran.fit((preproc_french_sentences,preproc_english_sentences[:,:-1]),
         preproc_english_sentences[:,1:,tf.newaxis],
         epochs=10, verbose = True,
         batch_size=1024)

In [None]:
preproc_french_sentences[0][np.newaxis,...].shape

In [None]:
# The fitted French tokenizer is bound to `french_tokenizer`; no `fr_tk` name
# exists at module level.
french_tokenizer

In [None]:
# The encoder takes a padded French sequence; the decoder takes an English
# prefix padded to 13 positions. texts_to_sequences needs a list of texts -
# a bare string would be iterated character by character.
tran.predict((preproc_french_sentences[[0]],pad(english_tokenizer.texts_to_sequences(["<SOS>"]),13)))

In [None]:
a = Encoder_layer(10,3,10)

In [None]:
# Save as a SavedModel directory. A path ending in ".h5" makes Keras use the
# HDF5 writer, which does not support subclassed models such as `tran`.
tf.keras.models.save_model(tran,'/kaggle/working/Transformer_en_fr',save_format="tf")

In [None]:
def prepare_pred(sent):
    """Encode ``sent`` with the English tokenizer and post-pad it to 13 positions."""
    token_ids = english_tokenizer.texts_to_sequences(sent)
    return pad(token_ids, 13)

In [None]:
def pred(i):
    """Greedily decode the English translation of French training sentence ``i``.

    The decoder input starts as the padded "<SOS>" marker. At every step the
    most probable token at the current position is appended and written into
    the next decoder slot. Decoding stops at the end marker or when the
    decoder input is full.

    Args:
        i: row index into ``preproc_french_sentences``.

    Returns:
        A one-element list holding the decoded words joined by spaces,
        starting with "<SOS>".
    """
    source = preproc_french_sentences[[i]]
    decoder_tokens = prepare_pred(["<SOS>"])

    # Look up the end marker's id through the tokenizer so the comparison
    # follows whatever normalisation the tokenizer applies.
    eos_ids = english_tokenizer.texts_to_sequences(["<EOS>"])[0]
    eos_id = eos_ids[0] if eos_ids else None

    words = ["<SOS>"]
    # Token ids are kept in the decoder array rather than re-tokenizing the
    # growing string, so an empty decoded word cannot shift later positions.
    for step in range(decoder_tokens.shape[1] - 1):
        probabilities = tran.predict((source, decoder_tokens), verbose=0)
        token_id = int(np.argmax(probabilities[0, step]))
        words.append(english_tokenizer.index_word.get(token_id, ""))
        if token_id == eos_id:
            break
        decoder_tokens[0, step + 1] = token_id
    return [" ".join(words)]

In [None]:
french_tokenizer.texts_to_sequences(["comment allez-vous"])

In [None]:
# Greedy decoding of one hand-written French sentence, seeded with "hello".
sent=["hello"]
# Encode the source once and reuse it at every step, so the whole translation
# is conditioned on the same sentence.
source_tokens=pad(french_tokenizer.texts_to_sequences(["je ne peux pas garder ceci"]))
for j in range(12):
    french_token=prepare_pred(sent)
    word=np.argmax(tran.predict((source_tokens,french_token),verbose=0),-1)[0,j]
    decoded_word=english_tokenizer.sequences_to_texts(np.array([[word]]))[0]
    sent[0]=sent[0]+ " "+decoded_word
    # Stop as soon as the end marker is produced, including on the first step.
    if decoded_word=="eos":
        break

In [None]:
french_tokenizer.sequences_to_texts([preproc_french_sentences[100]])

In [None]:
english_tokenizer.sequences_to_texts([preproc_english_sentences[100]])

In [None]:
sent

In [None]:
import random

In [None]:
def show():
    """Print a random French sentence, the model's translation, and the reference English."""
    # Draw the index from the actual number of rows so it is always valid.
    i=random.randrange(len(preproc_french_sentences))
    print("french sent : ",french_tokenizer.sequences_to_texts(preproc_french_sentences[[i]]))
    print("predict sent : ",pred(i))
    print("true sent : ",english_tokenizer.sequences_to_texts(preproc_english_sentences[[i]]))

In [None]:
# Print ten randomly chosen examples, each followed by a separator line.
for i in range(10):
    show()
    print("----------------")

# With Hugging Face Transformers

In [None]:
import pandas as pd

In [None]:
data = pd.read_csv('/kaggle/input/language-translation-englishfrench/eng_-french.csv')

In [None]:
data.head()

In [None]:
data = data.sample(frac=1).reset_index(drop=True)
data.head()


In [None]:
# Work with the first 1000 rows only: English is the source, French the target.
input_texts = data['English words/sentences'][:1000]
target_texts = data['French words/sentences'][:1000]

In [None]:
import tensorflow as tf
from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small")

In [None]:
# Tokenize sources and targets separately into TensorFlow tensors, with
# padding and truncation enabled.
input_encodings = tokenizer(list(input_texts), return_tensors="tf", padding=True, truncation=True)
target_encodings = tokenizer(list(target_texts), return_tensors="tf", padding=True, truncation=True)

In [None]:
input_encodings

In [None]:
target_encodings

In [None]:
decoder_input_ids = target_encodings["input_ids"]

In [None]:
decoder_input_ids

In [None]:
# Each dataset element is (dict of encoder inputs, target token ids).
train_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(input_encodings), decoder_input_ids)
)


In [None]:
train_dataset

In [None]:
batch_size = 32
train_dataset = train_dataset.batch(batch_size)

# Training settings
num_epochs = 3
learning_rate = 1e-4

optimizer = tf.keras.optimizers.Adam(learning_rate)

# Training loop.
# The targets are passed as `labels` so the model derives its decoder inputs
# by shifting them one position to the right. Feeding the targets directly as
# decoder inputs would show the decoder the very token it has to predict.
for epoch in range(num_epochs):
    total_loss = 0.0
    num_batches = 0
    for inputs, target_ids in train_dataset:
        # Padding positions are labelled -100 so the model's built-in loss
        # leaves them out.
        labels = tf.where(
            tf.equal(target_ids, tokenizer.pad_token_id),
            tf.constant(-100, dtype=target_ids.dtype),
            target_ids,
        )

        with tf.GradientTape() as tape:
            # training=True turns dropout on for the forward pass.
            outputs = model(inputs, labels=labels, training=True)
            # reduce_mean gives a scalar whether the library returns the loss
            # already reduced or per token.
            loss = tf.reduce_mean(outputs.loss)

        # Update model weights
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        total_loss += float(loss)
        num_batches += 1

    average_loss = total_loss / max(num_batches, 1)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {average_loss}")

# Save the trained model and tokenizer
model.save_pretrained("translation_model_tf")
tokenizer.save_pretrained("translation_model_tf")

In [None]:
input_text = " want you to wear this one."

# Encode the sentence as TensorFlow tensors.
input_encoding = tokenizer(
    input_text,
    return_tensors="tf",
    padding=True,
    truncation=True,
)

# Generate on the CPU; change the device string to run elsewhere.
with tf.device('/CPU:0'):
    output_ids = model.generate(input_encoding["input_ids"])

# Turn the first generated sequence back into text without special tokens.
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print(f"Input Text: {input_text}")
print(f"Generated Translation: {output_text}")

In [None]:
input_texts