# Using Pre-Trained Model

In [None]:
!pip install -q transformers sentencepiece nltk rouge-score

from transformers import MarianMTModel, MarianTokenizer
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

english_sentences = [
    "hello", "how are you", "i am fine", "what is your name", "nice to meet you",
    "i love machine learning", "do you like pizza", "good morning", "thank you", "see you later"
]
french_sentences = [
    "bonjour", "comment ça va", "je vais bien", "quel est ton nom", "ravi de vous rencontrer",
    "j'aime l'apprentissage automatique", "aimes-tu la pizza", "bonjour", "merci", "à plus tard"
]

model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

results = []
for i, src in enumerate(english_sentences):
    inputs = tokenizer(src, return_tensors="pt", padding=True)
    translated = model.generate(**inputs)
    tgt = tokenizer.decode(translated[0], skip_special_tokens=True)

    reference = [nltk.word_tokenize(french_sentences[i])]
    prediction = nltk.word_tokenize(tgt)
    bleu = sentence_bleu(reference, prediction)
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2'], use_stemmer=True).score(
        ' '.join(reference[0]), ' '.join(prediction))

    results.append({
        "English": src,
        "French (Reference)": french_sentences[i],
        "Predicted French": tgt,
        "BLEU": round(bleu, 4),
        "ROUGE-1": round(rouge['rouge1'].fmeasure, 4),
        "ROUGE-2": round(rouge['rouge2'].fmeasure, 4)
    })

import pandas as pd
pd.DataFrame(results)

# Using model from scratch with small datset

In [None]:
# Step 1: Install and import libraries
!pip install -q datasets nltk rouge-score

from datasets import load_dataset
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import tensorflow as tf
import nltk
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
nltk.download('punkt')

# Step 2: Prepare a small dataset (English-French)
english_sentences = [
    "hello", "how are you", "i am fine", "what is your name", "nice to meet you",
    "i love machine learning", "do you like pizza", "good morning", "thank you", "see you later"
]

french_sentences = [
    "bonjour", "comment ça va", "je vais bien", "quel est ton nom", "ravi de vous rencontrer",
    "j'aime l'apprentissage automatique", "aimes-tu la pizza", "bonjour", "merci", "à plus tard"
]

# Step 3: Tokenization
def tokenize(sentences, num_words=10000):
    tokenizer = Tokenizer(num_words=num_words, oov_token='<OOV>')
    tokenizer.fit_on_texts(sentences)
    tensor = tokenizer.texts_to_sequences(sentences)
    return tokenizer, pad_sequences(tensor, padding='post')

en_tokenizer, en_tensor = tokenize(english_sentences)
fr_tokenizer, fr_tensor = tokenize(french_sentences)
input_vocab_size = len(en_tokenizer.word_index) + 1
target_vocab_size = len(fr_tokenizer.word_index) + 1

# Step 4: Define transformer model
class Transformer(tf.keras.Model):
    def __init__(self, input_vocab, target_vocab, d_model, num_heads, dff, pe_input, pe_target):
        super().__init__()
        self.encoder_embedding = tf.keras.layers.Embedding(input_vocab, d_model)
        self.decoder_embedding = tf.keras.layers.Embedding(target_vocab, d_model)
        self.pos_encoding_input = self.positional_encoding(pe_input, d_model)
        self.pos_encoding_target = self.positional_encoding(pe_target, d_model)

        self.enc_layer = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.dec_layer = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.dense_proj = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model)
        ])
        self.final_dense = tf.keras.layers.Dense(target_vocab)

    def positional_encoding(self, max_len, dm):
        pos = np.arange(max_len)[:, np.newaxis]
        i = np.arange(dm)[np.newaxis, :]
        angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(dm))
        angle_rads = pos * angle_rates
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
        return tf.cast(angle_rads, dtype=tf.float32)

    def call(self, inputs, training=False):
        inp, tar = inputs
        enc = self.encoder_embedding(inp) + self.pos_encoding_input[:tf.shape(inp)[1]]
        dec = self.decoder_embedding(tar) + self.pos_encoding_target[:tf.shape(tar)[1]]
        enc_output = self.enc_layer(enc, enc)
        dec_output = self.dec_layer(dec, enc_output)
        final = self.dense_proj(dec_output)
        return self.final_dense(final)

# Step 5: Compile and train
model = Transformer(input_vocab=input_vocab_size, target_vocab=target_vocab_size,
                    d_model=64, num_heads=4, dff=128, pe_input=en_tensor.shape[1], pe_target=fr_tensor.shape[1])
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
model.fit([en_tensor, fr_tensor[:, :-1]], fr_tensor[:, 1:], epochs=20, batch_size=32)

# Step 6: Translation function
def translate(sentence):
    seq = en_tokenizer.texts_to_sequences([sentence])
    padded = pad_sequences(seq, maxlen=en_tensor.shape[1], padding='post')
    decoder_input = np.zeros((1, fr_tensor.shape[1]), dtype=np.int32)
    decoder_input[0][0] = fr_tokenizer.word_index.get('<sos>', 1)

    for i in range(1, fr_tensor.shape[1]):
        output = model([padded, decoder_input[:, :-1]], training=False)
        pred_id = tf.argmax(output[0, i-1]).numpy()
        decoder_input[0][i] = pred_id
        if pred_id == fr_tokenizer.word_index.get('<eos>'):
            break

    return ' '.join([fr_tokenizer.index_word.get(idx, '') for idx in decoder_input[0] if idx != 0])

# Step 7: Evaluation
results = []
for i in range(10):
    ref = [nltk.word_tokenize(french_sentences[i].replace('<sos>', '').replace('<eos>', '').strip())]
    pred = nltk.word_tokenize(translate(english_sentences[i]))
    bleu = sentence_bleu(ref, pred)
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2'], use_stemmer=True).score(' '.join(ref[0]), ' '.join(pred))
    results.append((english_sentences[i], french_sentences[i].replace('<sos>', '').replace('<eos>', '').strip(), ' '.join(pred), bleu, rouge['rouge1'].fmeasure, rouge['rouge2'].fmeasure))

import pandas as pd
pd.DataFrame(results, columns=["English", "French (Reference)", "Predicted French", "BLEU", "ROUGE-1", "ROUGE-2"])

# Using model from scratch with big datset

In [None]:
# Step 1: Install and import libraries
!pip install -q datasets nltk rouge-score

from datasets import load_dataset
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import tensorflow as tf
import nltk
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
nltk.download('punkt')

# Step 2: Load a real translation dataset (English-French)
data = load_dataset("opus_books", "en-fr", split='train[:10000]')
english_sentences = [f"{x['translation']['en']}" for x in data]
french_sentences = [f"<sos> {x['translation']['fr']} <eos>" for x in data]

# Step 3: Tokenization
def tokenize(sentences, num_words=10000):
    tokenizer = Tokenizer(num_words=num_words, oov_token='<OOV>')
    tokenizer.fit_on_texts(sentences)
    tensor = tokenizer.texts_to_sequences(sentences)
    return tokenizer, pad_sequences(tensor, padding='post')

en_tokenizer, en_tensor = tokenize(english_sentences)
fr_tokenizer, fr_tensor = tokenize(french_sentences)
input_vocab_size = len(en_tokenizer.word_index) + 1
target_vocab_size = len(fr_tokenizer.word_index) + 1

# Step 4: Define transformer model
class Transformer(tf.keras.Model):
    def __init__(self, input_vocab, target_vocab, d_model, num_heads, dff, pe_input, pe_target):
        super().__init__()
        self.encoder_embedding = tf.keras.layers.Embedding(input_vocab, d_model)
        self.decoder_embedding = tf.keras.layers.Embedding(target_vocab, d_model)
        self.pos_encoding_input = self.positional_encoding(pe_input, d_model)
        self.pos_encoding_target = self.positional_encoding(pe_target, d_model)

        self.enc_layer = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.dec_layer = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.dense_proj = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model)
        ])
        self.final_dense = tf.keras.layers.Dense(target_vocab)

    def positional_encoding(self, max_len, dm):
        pos = np.arange(max_len)[:, np.newaxis]
        i = np.arange(dm)[np.newaxis, :]
        angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(dm))
        angle_rads = pos * angle_rates
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
        return tf.cast(angle_rads, dtype=tf.float32)

    def call(self, inputs, training=False):
        inp, tar = inputs
        enc = self.encoder_embedding(inp) + self.pos_encoding_input[:tf.shape(inp)[1]]
        dec = self.decoder_embedding(tar) + self.pos_encoding_target[:tf.shape(tar)[1]]
        enc_output = self.enc_layer(enc, enc)
        dec_output = self.dec_layer(dec, enc_output)
        final = self.dense_proj(dec_output)
        return self.final_dense(final)

# Step 5: Compile and train
model = Transformer(input_vocab=input_vocab_size, target_vocab=target_vocab_size,
                    d_model=64, num_heads=4, dff=128, pe_input=en_tensor.shape[1], pe_target=fr_tensor.shape[1])
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
model.fit([en_tensor, fr_tensor[:, :-1]], fr_tensor[:, 1:], epochs=20, batch_size=32)

# Step 6: Translation function
def translate(sentence):
    seq = en_tokenizer.texts_to_sequences([sentence])
    padded = pad_sequences(seq, maxlen=en_tensor.shape[1], padding='post')
    decoder_input = np.zeros((1, fr_tensor.shape[1]), dtype=np.int32)
    decoder_input[0][0] = fr_tokenizer.word_index.get('<sos>', 1)

    for i in range(1, fr_tensor.shape[1]):
        output = model([padded, decoder_input[:, :-1]], training=False)
        pred_id = tf.argmax(output[0, i-1]).numpy()
        decoder_input[0][i] = pred_id
        if pred_id == fr_tokenizer.word_index.get('<eos>'):
            break

    return ' '.join([fr_tokenizer.index_word.get(idx, '') for idx in decoder_input[0] if idx != 0])

# Step 7: Evaluation
results = []
for i in range(10):
    ref = [nltk.word_tokenize(french_sentences[i].replace('<sos>', '').replace('<eos>', '').strip())]
    pred = nltk.word_tokenize(translate(english_sentences[i]))
    bleu = sentence_bleu(ref, pred)
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2'], use_stemmer=True).score(' '.join(ref[0]), ' '.join(pred))
    results.append((english_sentences[i], french_sentences[i].replace('<sos>', '').replace('<eos>', '').strip(), ' '.join(pred), bleu, rouge['rouge1'].fmeasure, rouge['rouge2'].fmeasure))

import pandas as pd
pd.DataFrame(results, columns=["English", "French (Reference)", "Predicted French", "BLEU", "ROUGE-1", "ROUGE-2"])

In [14]:
# Step 1: Install and import libraries
!pip install -q datasets nltk rouge-score

from datasets import load_dataset
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import tensorflow as tf
import nltk
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
nltk.download('punkt')

# Step 2: Prepare a small dataset (English-French)
english_sentences = [
    "hello", "how are you", "i am fine", "what is your name", "nice to meet you",
    "i love machine learning", "do you like pizza", "good morning", "thank you", "see you later"
]

french_sentences = [
    "bonjour", "comment ça va", "je vais bien", "quel est ton nom", "ravi de vous rencontrer",
    "j'aime l'apprentissage automatique", "aimes-tu la pizza", "bonjour", "merci", "à plus tard"
]

# Step 3: Tokenization
def tokenize(sentences, num_words=10000):
    tokenizer = Tokenizer(num_words=num_words, oov_token='<OOV>')
    tokenizer.fit_on_texts(sentences)
    tensor = tokenizer.texts_to_sequences(sentences)
    return tokenizer, pad_sequences(tensor, padding='post')

en_tokenizer, en_tensor = tokenize(english_sentences)
fr_tokenizer, fr_tensor = tokenize(french_sentences)
input_vocab_size = len(en_tokenizer.word_index) + 1
target_vocab_size = len(fr_tokenizer.word_index) + 1

# Step 4: Define transformer model
class Transformer(tf.keras.Model):
    def __init__(self, input_vocab, target_vocab, d_model, num_heads, dff, pe_input, pe_target):
        super().__init__()
        self.encoder_embedding = tf.keras.layers.Embedding(input_vocab, d_model)
        self.decoder_embedding = tf.keras.layers.Embedding(target_vocab, d_model)
        self.pos_encoding_input = self.positional_encoding(pe_input, d_model)
        self.pos_encoding_target = self.positional_encoding(pe_target, d_model)

        self.enc_layer = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.dec_layer = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.dense_proj = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model)
        ])
        self.final_dense = tf.keras.layers.Dense(target_vocab)

    def positional_encoding(self, max_len, dm):
        pos = np.arange(max_len)[:, np.newaxis]
        i = np.arange(dm)[np.newaxis, :]
        angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(dm))
        angle_rads = pos * angle_rates
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
        return tf.cast(angle_rads, dtype=tf.float32)

    def call(self, inputs, training=False):
        inp, tar = inputs
        enc = self.encoder_embedding(inp) + self.pos_encoding_input[:tf.shape(inp)[1]]
        dec = self.decoder_embedding(tar) + self.pos_encoding_target[:tf.shape(tar)[1]]
        enc_output = self.enc_layer(enc, enc)
        dec_output = self.dec_layer(dec, enc_output)
        final = self.dense_proj(dec_output)
        return self.final_dense(final)

# Step 5: Compile and train
model = Transformer(input_vocab=input_vocab_size, target_vocab=target_vocab_size,
                    d_model=64, num_heads=4, dff=128, pe_input=en_tensor.shape[1], pe_target=fr_tensor.shape[1])
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
model.fit([en_tensor, fr_tensor[:, :-1]], fr_tensor[:, 1:], epochs=20, batch_size=32)

# Step 6: Translation function
def translate(sentence):
    seq = en_tokenizer.texts_to_sequences([sentence])
    padded = pad_sequences(seq, maxlen=en_tensor.shape[1], padding='post')
    decoder_input = np.zeros((1, fr_tensor.shape[1]), dtype=np.int32)
    decoder_input[0][0] = fr_tokenizer.word_index.get('<sos>', 1)

    for i in range(1, fr_tensor.shape[1]):
        output = model([padded, decoder_input[:, :-1]], training=False)
        pred_id = tf.argmax(output[0, i-1]).numpy()
        decoder_input[0][i] = pred_id
        if pred_id == fr_tokenizer.word_index.get('<eos>'):
            break

    return ' '.join([fr_tokenizer.index_word.get(idx, '') for idx in decoder_input[0] if idx != 0])

# Step 7: Evaluation
results = []
for i in range(10):
    ref = [nltk.word_tokenize(french_sentences[i].replace('<sos>', '').replace('<eos>', '').strip())]
    pred = nltk.word_tokenize(translate(english_sentences[i]))
    bleu = sentence_bleu(ref, pred)
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2'], use_stemmer=True).score(' '.join(ref[0]), ' '.join(pred))
    results.append((english_sentences[i], french_sentences[i].replace('<sos>', '').replace('<eos>', '').strip(), ' '.join(pred), bleu, rouge['rouge1'].fmeasure, rouge['rouge2'].fmeasure))

import pandas as pd
pd.DataFrame(results, columns=["English", "French (Reference)", "Predicted French", "BLEU", "ROUGE-1", "ROUGE-2"])

Epoch 1/20


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7s/step - accuracy: 0.0000e+00 - loss: 3.3420
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.4333 - loss: 3.2983
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.4333 - loss: 3.2217
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step - accuracy: 0.4333 - loss: 3.1042
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.4333 - loss: 2.9288
Epoch 6/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.4333 - loss: 2.7229
Epoch 7/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.4333 - loss: 2.6827
Epoch 8/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step - accuracy: 0.4333 - loss: 2.7719
Epoch 9/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61

Unnamed: 0,English,French (Reference),Predicted French,BLEU,ROUGE-1,ROUGE-2
0,hello,bonjour,< OOV >,0,0.0,0.0
1,how are you,comment ça va,< OOV >,0,0.0,0.0
2,i am fine,je vais bien,< OOV >,0,0.0,0.0
3,what is your name,quel est ton nom,< OOV >,0,0.0,0.0
4,nice to meet you,ravi de vous rencontrer,< OOV >,0,0.0,0.0
5,i love machine learning,j'aime l'apprentissage automatique,< OOV >,0,0.0,0.0
6,do you like pizza,aimes-tu la pizza,< OOV >,0,0.0,0.0
7,good morning,bonjour,< OOV >,0,0.0,0.0
8,thank you,merci,< OOV >,0,0.0,0.0
9,see you later,à plus tard,< OOV >,0,0.0,0.0
