In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m46.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m78.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.15.1 tokenizers-0.13.3 transformers-4.29.2


In [2]:
import string
import numpy as np
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, Dense, Embedding, LSTM
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
from transformers import TFAutoModel




In [3]:

path_to_data = '/content/ara_eng.txt'

translation_file = open(path_to_data, "r", encoding='utf-8')
raw_data = translation_file.read()
translation_file.close()

raw_data = raw_data.split('\n')
pairs = [sentence.split('\t') for sentence in raw_data]
pairs = pairs[1000:11000]

def clean_sentence(sentence):
    lower_case_sent = sentence.lower()
    string_punctuation = string.punctuation + "¡" + '¿'
    clean_sentence = lower_case_sent.translate(str.maketrans('', '', string_punctuation))

    return clean_sentence

def tokenize(sentences):
    text_tokenizer = Tokenizer()
    text_tokenizer.fit_on_texts(sentences)
    return text_tokenizer.texts_to_sequences(sentences), text_tokenizer

english_sentences = [clean_sentence(pair[0]) for pair in pairs]
arabic_sentences = [clean_sentence(pair[1]) for pair in pairs]

eng_text_tokenized, eng_text_tokenizer = tokenize(english_sentences)
ara_text_tokenized, ara_text_tokenizer = tokenize(arabic_sentences)

print('Maximum length English sentence: {}'.format(len(max(eng_text_tokenized, key=len))))
print('Maximum length Arabic sentence: {}'.format(len(max(ara_text_tokenized, key=len))))

english_vocab = len(eng_text_tokenizer.word_index) + 1
arabic_vocab = len(ara_text_tokenizer.word_index) + 1
print("English vocabulary size: {}".format(english_vocab))
print("Arabic vocabulary size: {}".format(arabic_vocab))

max_english_len = int(len(max(eng_text_tokenized, key=len)))
max_arabic_len = int(len(max(ara_text_tokenized, key=len)))

eng_pad_sentence = pad_sequences(eng_text_tokenized, max_english_len, padding="post")
ara_pad_sentence = pad_sequences(ara_text_tokenized, max_arabic_len, padding="post")

eng_pad_sentence = eng_pad_sentence.reshape(*eng_pad_sentence.shape, 1)
ara_pad_sentence = ara_pad_sentence.reshape(*ara_pad_sentence.shape, 1)

input_sequence = Input(shape=(max_english_len,), dtype='int32')
embedding = Embedding(input_dim=english_vocab, output_dim=128)(input_sequence)

transformer_model = TFAutoModel.from_pretrained("bert-base-uncased")

encoder_outputs = transformer_model(input_sequence)[0]

decoder_inputs = Input(shape=(max_arabic_len,), dtype='int32')
decoder_embedding = Embedding(input_dim=arabic_vocab, output_dim=128)(decoder_inputs)

encoder_lstm = LSTM(64, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(embedding)

decoder_lstm = LSTM(64, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])

decoder_dense = Dense(arabic_vocab, activation='softmax')
output = decoder_dense(decoder_outputs)

enc_dec_model = Model([input_sequence, decoder_inputs], output)
optimizer = Adam(learning_rate=0.001)
enc_dec_model.compile(optimizer=optimizer, loss=sparse_categorical_crossentropy, metrics=['accuracy'])
enc_dec_model.summary()

batch_size = 32
num_batches = len(eng_pad_sentence) // batch_size

val_size = 1000
eng_pad_val = eng_pad_sentence[-val_size:]
ara_pad_val = ara_pad_sentence[-val_size:]

num_epochs = 20
for epoch in range(num_epochs):
    total_loss = 0
    total_accuracy = 0
    
    for batch in range(num_batches):
        start_index = batch * batch_size
        end_index = (batch + 1) * batch_size
        eng_batch = eng_pad_sentence[start_index:end_index]
        ara_batch = ara_pad_sentence[start_index:end_index]

        loss, accuracy = enc_dec_model.train_on_batch([eng_batch, ara_batch], ara_batch)

        total_loss += loss
        total_accuracy += accuracy
    
    # Calculate validation loss and accuracy
    val_loss, val_accuracy = enc_dec_model.evaluate([eng_pad_val, ara_pad_val], ara_pad_val, verbose=0)
    avg_loss = total_loss / num_batches
    avg_accuracy = total_accuracy / num_batches

    print("Epoch: {}/{} - Avg. Loss: {:.4f} - Avg. Accuracy: {:.4f} - Val Loss: {:.4f} - Val Accuracy: {:.4f}".format(
        epoch + 1, num_epochs, avg_loss, avg_accuracy, val_loss, val_accuracy))


Maximum length English sentence: 20
Maximum length Arabic sentence: 17
English vocabulary size: 4086
Arabic vocabulary size: 11891


Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/536M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 17)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 20, 128)      523008      ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 17, 128)      1522048     ['input_2[0][0]']                
                                                                                              

In [6]:
  # Function to convert logits to a sentence
def logits_to_sentence(logits, tokenizer):
    index_to_words = {idx: word for word, idx in tokenizer.word_index.items()}
    index_to_words[0] = ''
    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

# Example sentence translation
index = 980
print("The English sentence is: {}".format(english_sentences[index]))
print("The Arabic sentence is: {}".format(arabic_sentences[index]))
print('The predicted Arabic sentence is:')
predicted_sentence = logits_to_sentence(
    enc_dec_model.predict([eng_pad_sentence[index:index + 1], ara_pad_sentence[index:index + 1]])[0],
    ara_text_tokenizer)
print(predicted_sentence)

The English sentence is: the check please
The Arabic sentence is: الفاتورة من فضلك
The predicted Arabic sentence is:
الفاتورة من فضلك              
