In [1]:
import tensorflow as tf

# Gathering data using TensorFlow's utility function
text_file = tf.keras.utils.get_file(
    fname='fra-eng.zip',
    origin="http://storage.googleapis.com/download.tensorflow.org/data/fra-eng.zip",
    extract=True
)

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/fra-eng.zip
[1m3423204/3423204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [2]:
# Importing necessary libraries
import pathlib
import unicodedata
import re

# Defining the path to the text file
text_file = pathlib.Path(text_file).parent / 'fra.txt'

def normalize(line):
    # Normalize unicode characters, strip leading/trailing whitespace, convert to lowercase
    line = unicodedata.normalize("NFKC", line.strip().lower())
    # Handle special characters and add start and end tokens for the target language (French)
    line = re.sub(r"^([^ \w])(?!\s)", r"\1", line)
    line = re.sub(r"(\s[^ \w])(?!\s)", r"\1", line)
    line = re.sub(r"(?!\s)([^ \w])$", r"\1", line)
    line = re.sub(r"(?!\s)([^ \w]\s)", r"\1", line)
    eng, fre = line.split("\t")
    fre = '[start] ' + fre + ' [end]'
    return eng, fre

# Read and normalize the text pairs
with open(text_file) as fp:
    text_pairs = [normalize(line) for line in fp]


In [3]:
# Tokenization and Statistics

# Initialize sets to store unique tokens for English and French
eng_tokens, fre_tokens = set(), set()
# Initialize variables to store maximum sequence lengths
eng_maxlen, fre_maxlen = 0, 0

# Iterate through text pairs to tokenize and compute statistics
for eng, fre in text_pairs:
    eng_token, fre_token = eng.split(), fre.split()
    eng_maxlen = max(eng_maxlen, len(eng_token))
    fre_maxlen = max(fre_maxlen, len(fre_token))
    eng_tokens.update(eng_token)
    fre_tokens.update(fre_token)

# Print statistics
print(f"Total tokens in English: {len(eng_tokens)}")
print(f"Total tokens in French: {len(fre_tokens)}")
print(f"Maximum length of English sequence: {eng_maxlen}")
print(f"Maximum length of French sequence: {fre_maxlen}")


Total tokens in English: 25365
Total tokens in French: 42027
Maximum length of English sequence: 47
Maximum length of French sequence: 56


In [4]:
import pickle

# Serialize preprocessed data for future use
with open("text_pairs.pickle", 'wb') as fp:
    pickle.dump(text_pairs, fp)
