In [1]:
"""
* This is the first notebook in a series for my sequence-to-sequence Neural Machine Translation (NMT) project.
* The primary goal of this particular notebook is to explore the data and perform preprocessing.
* The dataset I used was obtained from ManyThings.org (Tatoeba Project). It can be downloaded on https://www.manythings.org/corpus/.
* To run this notebook, download the data provided with my post, upload it to your Google Drive, and modify the path in the Main function.
* The next notebook in this series will be "2.NMT with simple LSTM".
"""

# Import all necessary packages for this notebook
import os
import re
import numpy as np
import unicodedata
import pickle
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Mount Google Drive
def mount_drive():
  from google.colab import drive
  drive.mount('/content/gdrive')

# Load raw parallel data
def load_raw_data(filepath):
  with open(filepath, 'r', encoding='utf8') as f:
      data = f.readlines()
  return data

# English preprocessing
def preprocess_english(sentence):
  sentence = sentence.lower().strip() # just keep everything in lower case
  sentence = unicodedata.normalize('NFC', sentence) # standardize characters
  sentence = re.sub(r"([?.!,])", r" \1 ", sentence) # add spaces around punctuations
  sentence = re.sub(r"[^a-z0-9?.!,'-]+", " ", sentence) # keep only valid elements
  sentence = re.sub(r'[" "]+', " ", sentence) # remove extra spaces
  return sentence.strip()

# Japanese preprocessing
def preprocess_japanese(sentence):
  sentence = sentence.strip()
  sentence = unicodedata.normalize("NFKC", sentence)
  sentence = re.sub(r"([?.!,?。!、])", r" \1 ", sentence)
  sentence = re.sub(r'[" "]+', " ", sentence)
  return sentence.strip()

# Tokenize and pad English data
def tokenize_and_pad_english(sequences):
  """
  Process the English sequences by adding <start> and <end> tags and tokenizing them with paddings.
  Returns:
    tagged (list of str): sentences with <start> and <end> tags added.
    padded (np.ndarray): tokenized and padded sequences.
    tokenzier (Tokenizer): tensorflow tokenizer fitted to the data.
    max_len (int): maximum sequence length after tokenization.
  """
  tagged = [f"<start> {s} <end>" for s in sequences] # add indicator tags to the sentences
  tokenizer = Tokenizer(split=' ', char_level=False) # get tokenizer
  tokenizer.fit_on_texts(tagged) # fit tokenzier on the corpus
  tokenized = tokenizer.texts_to_sequences(tagged) # tokenize the data
  max_len = max(len(seq) for seq in tokenized) # get maximum sequence length out of all data
  padded = pad_sequences(tokenized, maxlen=max_len, padding='post') # add paddings to ensure all sequences have the same length
  return tagged, padded, tokenizer, max_len

# Tokenize and pad Japanese data with MeCab
def tokenize_and_pad_japanese(sequences):
  """
  Process the Japanese sequences by breaking them down into the smallest meaningful Japanese units. Add tags and tokenize the data with paddings.
  Returns:
    tagged (list of str): sentences with <start> and <end> tags added.
    padded (np.ndarray): tokenized and padded sequences.
    tokenzier (Tokenizer): tensorflow tokenizer fitted to the data.
    max_len (int): maximum sequence length after tokenization.
  """
  import MeCab # use MeCab to break down the Japanese characters into the smallest meaningful units (a.k.a. Morphological Units in Linguistics)
  mecab = MeCab.Tagger("-Owakati") # get MeCab to return wakati (morphological japanese separated by space)
  wakati = [mecab.parse(s).strip() for s in sequences] # apply MeCab
  tagged = [f"<start> {s} <end>" for s in wakati]
  tokenizer = Tokenizer(split=' ', char_level=False)
  tokenizer.fit_on_texts(tagged)
  tokenized = tokenizer.texts_to_sequences(tagged)
  max_len = max(len(seq) for seq in tokenized)
  padded = pad_sequences(tokenized, maxlen=max_len, padding='post')
  return tagged, padded, tokenizer, max_len

# Save data and tokenizers
def save_preprocessed_data(padded_en, padded_ja, en_tok, ja_tok, save_dir):
  os.makedirs(save_dir, exist_ok=True)
  np.save(os.path.join(save_dir, 'padded_english.npy'), padded_en) # save the padded sequences
  np.save(os.path.join(save_dir, 'padded_japanese.npy'), padded_ja)
  # save the tokenizers using Pickle
  with open(os.path.join(save_dir, 'english_tokenizer.pkl'), 'wb') as f:
      pickle.dump(en_tok, f)
  with open(os.path.join(save_dir, 'japanese_tokenizer.pkl'), 'wb') as f:
      pickle.dump(ja_tok, f)

def main():
  # Intro
  print("="*80)
  print("Running Notebook No.1 for Data Prep.")
  print("="*80)

  # Mount Google Drive
  print("\n1. Mounting Google Drive...")
  mount_drive()

  # Access the data and print an example
  path = '/content/gdrive/My Drive/NMT_Data/jpn.txt'
  print(f"\n2. Reading data from {path}...")
  data = load_raw_data(path)
  print(f"Done. Loaded {len(data)} rows of data.")
  print(f'An example of the data: {data[0]}')

  # Split and preprocess
  print("3. Preocessing the data...")
  english_raw = []
  japanese_raw = []
  for line in data:
      en, ja = line.split('\t') # the sequences are split by tabs
      english_raw.append(preprocess_english(en))
      japanese_raw.append(preprocess_japanese(ja))
  print("Done.")

  # Add <start> and <end> tags, tokenize, and pad English
  print("\n4. Tagging, tokenizing, and padding the English data...")
  tagged_en, padded_en, en_tok, max_len_en = tokenize_and_pad_english(english_raw)
  print(f"""
  Original Sentence: {english_raw[0]}
  Tagged Sentence: {tagged_en[0]}
  Padded Sentence: {padded_en[0]}
  Max Length: {max_len_en}
  Vocab Size: {len(en_tok.word_index)}
  """)

  # Install MeCab
  print("\n5. Installing MeCab for the Japanese data...")
  os.system('apt install -y mecab mecab-ipadic-utf8')
  os.system('pip install mecab-python3 unidic-lite')
  print("Done.")

  # Add <start> and <end> tags, tokenize, and pad Japanese
  print("\n6. Tagging, tokenizing, and padding the Japanese data...")
  tagged_ja, padded_ja, ja_tok, max_len_ja = tokenize_and_pad_japanese(japanese_raw)
  print(f"""
  Original Sentence: {japanese_raw[0]}
  Tagged Sentence: {tagged_ja[0]}
  Padded Sentence: {padded_ja[0]}
  Max Length: {max_len_ja}
  Vocab Size: {len(ja_tok.word_index)}
  """)

  # Save to files
  print("\n7. Saving Data and Tokenizers to Google Dirve...")
  save_dir = os.path.dirname(path)
  save_preprocessed_data(padded_en, padded_ja, en_tok, ja_tok, save_dir)
  print("Done.")

  # Indicate the end
  print("\n" + "=" * 80)
  print("You've hit the end of this notebook. Go to the next one and start building LSTM with me. 😊")
  print("="*80)

if __name__ == '__main__':
  main()

Running Notebook No.1 for Data Prep.

1. Mounting Google Drive...
Mounted at /content/gdrive

2. Reading data from /content/gdrive/My Drive/NMT_Data/jpn.txt...
Done. Loaded 178348 rows of data.
An example of the data: ﻿Let's try something.	何かしてみましょう。

3. Preocessing the data...
Done.

4. Tagging, tokenizing, and padding the English data...

  Original Sentence: let's try something .
  Tagged Sentence: <start> let's try something . <end>
  Padded Sentence: [  1 157 256 171   2   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0]
  Max Length: 57
  Vocab Size: 15650
  

5. Installing MeCab for the Japanese data...
Done.

6. Tagging, tokenizing, and padding the Japanese data...

  Original Sentence: 何かしてみましょう 。
  Tagged Sentence: <start> 何 か し て み ましょう 。 <end>
  Padded Sentence: [  1  35  18  12   9 204 186   3   2   0   0   0   0  