## Neural Machine Translation (English to French)

### Data Loading and Preprocessing

1. Importing necessary libraries

In [None]:
import os
import random
import json
import pickle
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from tqdm import tqdm

# Download NLTK's 'punkt' tokenizer package (a no-op if it is already present).
# NOTE(review): no NLTK tokenizer call appears in the cells shown here —
# confirm a later cell actually needs this download.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

2. Loading fra-eng dataset

In [None]:
# Read the tab-separated corpus in one go; only the text is needed after the
# file is closed.
with open('fra.txt', encoding='utf-8') as f:
    raw_text = f.read()

# Outer whitespace of the whole file is trimmed before splitting into rows.
lines = raw_text.strip().split('\n')

# Keep the first two tab-separated columns of every row that has a tab;
# rows without a tab are dropped and any further columns are ignored.
pairs = []
for row in lines:
    if '\t' not in row:
        continue
    columns = row.split('\t')
    pairs.append(columns[:2])

print(f"Loaded {len(pairs):,} sentence pairs.")

Loaded 237,838 sentence pairs.


3. Shuffling, truncating, lowercasing & adding `<sos>`/`<eos>` markers

In [None]:
# Draw a reproducible random subset of the corpus.
#
# The original cell shuffled with the unseeded global RNG, so every
# Restart & Run All produced a different 200k subset and therefore different
# vocabularies, token ids and saved artifacts. A private seeded generator
# makes a fresh-kernel run deterministic without touching global `random`
# state, and `sample` returns a new list instead of shuffling the loaded
# list in place.
SHUFFLE_SEED = 42
MAX_PAIRS = 200000  # upper bound on the number of sentence pairs kept

# With k == len(pairs) this is a full random permutation; min() keeps the
# cell working on corpora smaller than MAX_PAIRS.
pairs = random.Random(SHUFFLE_SEED).sample(pairs, min(MAX_PAIRS, len(pairs)))

# p[0] is the English sentence (encoder input), p[1] the French one.
eng_sentences = [p[0].lower() for p in pairs]
# Decoder input is the French sentence prefixed with the start marker ...
fra_sentences_input = ['<sos> ' + p[1].lower() for p in pairs]
# ... and the decoder target is the same sentence suffixed with the end marker.
fra_sentences_target = [p[1].lower() + ' <eos>' for p in pairs]

4. Tokenization

In [None]:
def _new_tokenizer():
    """Build a Tokenizer with the settings shared by both languages.

    `filters=''` disables character filtering, so the '<' and '>' of the
    `<sos>` / `<eos>` markers (and all punctuation) stay part of the tokens.
    Words not seen during fitting map to the '<unk>' token.
    """
    return Tokenizer(filters='', lower=True, oov_token='<unk>')


eng_tokenizer = _new_tokenizer()
fra_tokenizer = _new_tokenizer()

# English vocabulary comes from the encoder sentences only.
eng_tokenizer.fit_on_texts(eng_sentences)
# French vocabulary is fitted on both decoder views so that it contains
# `<sos>` as well as `<eos>`.
fra_tokenizer.fit_on_texts(fra_sentences_input + fra_sentences_target)  # full vocab

# Convert every sentence to its list of integer token ids.
eng_sequences = eng_tokenizer.texts_to_sequences(eng_sentences)
fra_input_sequences, fra_target_sequences = (
    fra_tokenizer.texts_to_sequences(fra_sentences_input),
    fra_tokenizer.texts_to_sequences(fra_sentences_target),
)

5. Padding

In [None]:
# Longest tokenized sentence on each side decides the padded width.
max_len_eng = max(map(len, eng_sequences))
max_len_fra = max(
    max(map(len, fra_input_sequences)),
    max(map(len, fra_target_sequences)),
)

# Right-pad every sequence to its side's maximum length.
encoder_input_data = pad_sequences(eng_sequences, padding='post', maxlen=max_len_eng)
decoder_input_data = pad_sequences(fra_input_sequences, padding='post', maxlen=max_len_fra)
decoder_target_data = pad_sequences(fra_target_sequences, padding='post', maxlen=max_len_fra)

# Append a trailing axis of size 1: (samples, max_len_fra) -> (samples, max_len_fra, 1).
decoder_target_data = decoder_target_data[..., None]

6. Saving tokenizers, config, and padded data

In [None]:
# Pickle both fitted tokenizers, one file per language.
for _pkl_path, _fitted_tokenizer in (
    ('eng_tokenizer.pkl', eng_tokenizer),
    ('fra_tokenizer.pkl', fra_tokenizer),
):
    with open(_pkl_path, 'wb') as f:
        pickle.dump(_fitted_tokenizer, f)

# Vocabulary sizes are word_index length + 1 (index 0 is not in word_index),
# stored together with the padded sequence lengths.
config = dict(
    eng_vocab_size=len(eng_tokenizer.word_index) + 1,
    fra_vocab_size=len(fra_tokenizer.word_index) + 1,
    max_len_eng=max_len_eng,
    max_len_fra=max_len_fra,
)
with open('config.json', 'w') as f:
    f.write(json.dumps(config))

# Store the three padded arrays in a single compressed archive.
_arrays = {
    'encoder_input_data': encoder_input_data,
    'decoder_input_data': decoder_input_data,
    'decoder_target_data': decoder_target_data,
}
np.savez_compressed('data.npz', **_arrays)

7. Embedding Matrix

In [None]:
EMBEDDING_DIM = 100

# Characters stripped from the edges of a token before retrying a failed
# GloVe lookup. '<' and '>' are deliberately NOT included so that the
# '<unk>' token is never rewritten into the ordinary word "unk".
_EDGE_PUNCT = '.,!?;:"\'()'

# Parse the GloVe text file: each line is "<token> <v1> ... <vN>".
glove_index = {}
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in tqdm(f, desc="Reading GloVe"):
        values = line.split()
        if not values:
            # A blank line would otherwise raise IndexError on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        glove_index[word] = coefs


def _glove_vector(token):
    """Return the GloVe vector for `token`, or None if there is none.

    The English tokenizer is built with filters='', so punctuation stays
    attached to words (e.g. "type."). Such tokens miss an exact lookup, so a
    second attempt is made with leading/trailing punctuation removed.
    """
    vector = glove_index.get(token)
    if vector is None:
        stripped = token.strip(_EDGE_PUNCT)
        if stripped and stripped != token:
            vector = glove_index.get(stripped)
    return vector


# Row i holds the vector of the word with tokenizer index i; row 0 and the
# rows of words without a GloVe vector stay all-zero.
en_embedding_matrix = np.zeros((len(eng_tokenizer.word_index) + 1, EMBEDDING_DIM))
n_found = 0
for word, i in eng_tokenizer.word_index.items():
    vector = _glove_vector(word)
    if vector is not None:
        en_embedding_matrix[i] = vector
        n_found += 1

# Report coverage so a silently near-empty matrix is noticed immediately.
print(f"GloVe coverage: {n_found:,}/{len(eng_tokenizer.word_index):,} English tokens")

with open('en_embedding_matrix.pkl', 'wb') as f:
    pickle.dump(en_embedding_matrix, f)

Reading GloVe: 400000it [00:09, 41631.83it/s]


8. Checking output

In [None]:
# Sanity check: show the first English sample at each preprocessing stage,
# followed by the shapes of the three model arrays.
_report_rows = (
    ("Sample English:", eng_sentences[0]),
    ("Tokenized:", eng_sequences[0]),
    ("Padded:", encoder_input_data[0]),
    ("Encoder Input Shape:", encoder_input_data.shape),
    ("Decoder Input Shape:", decoder_input_data.shape),
    ("Decoder Target Shape:", decoder_target_data.shape),
)
for _label, _value in _report_rows:
    print(_label, _value)

Sample English: you're my type.
Tokenized: [37, 18, 3444]
Padded: [  37   18 3444    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0]
Encoder Input Shape: (200000, 55)
Decoder Input Shape: (200000, 57)
Decoder Target Shape: (200000, 57, 1)
