In [15]:
import shutil
import os
import xml.etree.ElementTree as ET
import warnings
warnings.filterwarnings('ignore')
import random
import nagisa
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

Kaggle dataset link: https://www.kaggle.com/datasets/team-ai/japaneseenglish-bilingual-corpus?resource=download  
The English translations in this dataset were written by human translators and then checked.

In [2]:
# Unpack the downloaded Kaggle archive into the local data folder.
# Note the naming: `target_dir` holds the path of the zip file itself, while
# `data_folder_name` is the directory the archive is extracted into.
data_folder_name = r'D:\chronicles\eng-jap-data'
target_dir = r'D:\chronicles\archive.zip'
shutil.unpack_archive(filename=target_dir, extract_dir=data_folder_name)

This particular dataset is stored as XML.  
The `<j>` tag holds the Japanese sentence and the `<e>` tag holds the English sentence.  
I have only considered the English sentences that have been checked, rather than the ones that were merely translated into English.

In [3]:
# Delete one specific corpus file before the extraction cell walks the folder.
# NOTE(review): presumably this file cannot be parsed as XML - confirm and
# record the actual reason here.
# The path is a raw string so its backslashes are never interpreted as escape
# sequences, and the existence check keeps this cell re-runnable after the
# file has already been deleted.
excluded_xml_path = r'D:\chronicles\eng-jap-data\wiki_corpus_2.01\GNM\GNM00155.xml'
if os.path.exists(excluded_xml_path):
    os.remove(excluded_xml_path)

In the cell below I extract the text of the `<j>` and `<e>` tags and write each English/Japanese pair as one tab-separated line of a plain text file (`eng-to-jap`).

In [4]:
# Walk every category folder of the corpus, read the <tit> elements that are
# direct children of each XML root, and write one "english<TAB>japanese" line
# per element to the `eng-to-jap` text file.
directory = r'D:\chronicles\eng-jap-data\wiki_corpus_2.01'
output_path = 'eng-to-jap'

# Tab separates the two fields and newline separates records in the output
# file, so any of these characters inside a sentence is replaced by a space.
field_breaks = str.maketrans({"\t": " ", "\n": " ", "\r": " "})

# Keep only real sub-folders (the corpus root also contains plain files) and
# sort them so the output file has the same order on every run.
category_folders = [
    os.path.join(directory, name) for name in sorted(os.listdir(directory))
]
category_folders = [path for path in category_folders if os.path.isdir(path)]

# The file is opened once in write mode: re-running this cell regenerates the
# file instead of appending a second copy of every pair to it.
with open(output_path, 'w', encoding="utf-8") as f:
    for item in category_folders:
        print("In {} folder".format(item))
        for xml_file in sorted(os.listdir(item)):
            if not xml_file.lower().endswith(".xml"):
                continue
            xml_file_path = os.path.join(item, xml_file)
            try:
                root = ET.parse(xml_file_path).getroot()
            except ET.ParseError as err:
                # Report and skip a malformed file rather than abort the run.
                print("Skipping unparsable file {}: {}".format(xml_file_path, err))
                continue
            for sen in root.findall("tit"):
                jap_node = sen.find("j")
                # Only the English variant marked type="check" is used.
                eng_node = sen.find("e[@type='check']")
                if jap_node is None or eng_node is None:
                    continue
                if not jap_node.text or not eng_node.text:
                    # An empty tag would otherwise produce a "None"/blank field.
                    continue
                eng_sentence = eng_node.text.translate(field_breaks)
                jap_sentence = jap_node.text.translate(field_breaks)
                f.write(eng_sentence + "\t" + jap_sentence + "\n")
print("Extracting done!")

In D:\chronicles\eng-jap-data\wiki_corpus_2.01\BDS folder
In D:\chronicles\eng-jap-data\wiki_corpus_2.01\BLD folder
In D:\chronicles\eng-jap-data\wiki_corpus_2.01\CLT folder
In D:\chronicles\eng-jap-data\wiki_corpus_2.01\EPR folder
In D:\chronicles\eng-jap-data\wiki_corpus_2.01\FML folder
In D:\chronicles\eng-jap-data\wiki_corpus_2.01\GNM folder
In D:\chronicles\eng-jap-data\wiki_corpus_2.01\HST folder
In D:\chronicles\eng-jap-data\wiki_corpus_2.01\kyoto_lexicon.csv folder
In D:\chronicles\eng-jap-data\wiki_corpus_2.01\LTT folder
In D:\chronicles\eng-jap-data\wiki_corpus_2.01\PNM folder
In D:\chronicles\eng-jap-data\wiki_corpus_2.01\readme.pdf folder
In D:\chronicles\eng-jap-data\wiki_corpus_2.01\RLW folder
In D:\chronicles\eng-jap-data\wiki_corpus_2.01\ROD folder
In D:\chronicles\eng-jap-data\wiki_corpus_2.01\SAT folder
In D:\chronicles\eng-jap-data\wiki_corpus_2.01\SCL folder
In D:\chronicles\eng-jap-data\wiki_corpus_2.01\SNT folder
In D:\chronicles\eng-jap-data\wiki_corpus_2.01\TTL 

In [5]:
# Load the tab-separated file into a list of (english, japanese) tuples.
# Each Japanese sentence is wrapped in "[start]" / "[end]" marker tokens.
text_pairs = []
with open('eng-to-jap', encoding="utf-8") as f:
    # Iterating the file streams it line by line and, unlike
    # read().split("\n")[:-1], does not lose the final record when the file
    # has no trailing newline.
    for line in f:
        line = line.rstrip("\n")
        if not line:
            # A blank line carries no pair; skip it instead of failing to unpack.
            continue
        # Split on the first tab only, so the line always yields two fields.
        eng, jap = line.split("\t", 1)
        jap = "[start] " + jap + " [end]"
        text_pairs.append((eng, jap))
    

In [6]:
# Print a few randomly chosen (english, japanese) pairs as a sanity check.
preview_count = 3
for _preview_index in range(preview_count):
    sampled_pair = random.choice(text_pairs)
    print(sampled_pair)

('Zoroku HAMAMURA, the first', '[start] 浜村蔵六 (初世) [end]')
('Kyoto Prefectural Insho-Domoto Museum of Fine Arts', '[start] 京都府立堂本印象美術館 [end]')
('MINAMOTO no Noriyori', '[start] 源範頼 [end]')


In [7]:
# Split the pairs into 70% training / 15% validation / 15% test.
#
# The shuffle runs on a copy with a dedicated, seeded generator, so the split
# is the same on every run and re-executing this cell never moves examples
# between the splits. `text_pairs` itself is left in its original order.
SPLIT_SEED = 42
shuffled_pairs = list(text_pairs)
random.Random(SPLIT_SEED).shuffle(shuffled_pairs)

num_val_samples = int(0.15 * len(shuffled_pairs))
# Validation and test get the same size; training takes whatever remains.
num_train_samples = len(shuffled_pairs) - 2 * num_val_samples
train_pairs = shuffled_pairs[:num_train_samples]
val_pairs = shuffled_pairs[num_train_samples:num_train_samples + num_val_samples]
test_pairs = shuffled_pairs[num_train_samples + num_val_samples:]

# The three slices together must cover every pair exactly once.
assert len(train_pairs) + len(val_pairs) + len(test_pairs) == len(text_pairs)

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

28220 total pairs
19754 training pairs
4233 validation pairs
4233 test pairs


In [12]:
# Hyper-parameters shared by the text vectorizers and the tf.data pipeline.
vocab_size = 15_000       # passed as max_tokens to both vectorizers
sequence_length = 10      # tokens kept per English sentence
batch_size = 64           # sentence pairs per dataset batch

def custom_standardization(input_string):
    """Return the result of passing `input_string` through `nagisa.filter`.

    NOTE(review): nothing visible in this notebook calls this function. It is
    presumably meant as a `standardize` callback for the Japanese vectorizer -
    confirm that `nagisa.filter` can accept the tensors such a callback
    receives before wiring it in.
    """
    filtered = nagisa.filter(input_string)
    return filtered

# Both vectorizers share the vocabulary cap and integer output; they differ
# only in how many token positions they emit per sentence.
_shared_vectorizer_options = dict(max_tokens=vocab_size, output_mode="int")

eng_vectorization = TextVectorization(
    output_sequence_length=sequence_length,
    **_shared_vectorizer_options,
)
# One extra position on the Japanese side: format_datasets slices this output
# into a decoder input (all but the last token) and a target (all but the first).
# NOTE(review): no custom `standardize`/`split` is set, so the layer's default
# whitespace splitting applies - Japanese text without spaces will presumably
# not be segmented into words; confirm and consider pre-tokenizing.
jap_vectorization = TextVectorization(
    output_sequence_length=sequence_length + 1,
    **_shared_vectorizer_options,
)

# Build each vocabulary from the training split only.
train_eng_texts = [english for english, _ in train_pairs]
train_jap_texts = [japanese for _, japanese in train_pairs]

for _vectorizer, _texts in (
    (eng_vectorization, train_eng_texts),
    (jap_vectorization, train_jap_texts),
):
    _vectorizer.adapt(_texts)

In [16]:
def format_datasets(eng,jap):
    """Vectorize a batch of sentence pairs into (model inputs, targets).

    Args:
        eng: batch of English strings, fed to `eng_vectorization`.
        jap: batch of Japanese strings, fed to `jap_vectorization`.

    Returns:
        A 2-tuple: a dict with the English token ids and the Japanese token
        ids minus their last position, and the Japanese token ids minus their
        first position (i.e. the decoder input shifted by one).
    """
    source_ids = eng_vectorization(eng)
    target_ids = jap_vectorization(jap)
    decoder_in = target_ids[:, :-1]
    shifted_target = target_ids[:, 1:]
    # NOTE(review): "encoder_inputes" looks like a misspelling of
    # "encoder_inputs"; the key is kept as-is because it presumably has to
    # match an input name defined where the model is built - confirm.
    model_inputs = {"encoder_inputes": source_ids, "decoder_inputs": decoder_in}
    return (model_inputs, shifted_target)

def make_datasets(pairs):
    """Build a batched, vectorized tf.data pipeline from sentence pairs.

    Args:
        pairs: non-empty sequence of (english, japanese) string tuples.

    Returns:
        A `tf.data.Dataset` yielding the `(inputs, targets)` batches produced
        by `format_datasets`, `batch_size` pairs per batch.

    Raises:
        ValueError: if `pairs` is empty.
    """
    if not pairs:
        raise ValueError("make_datasets() needs at least one (english, japanese) pair")
    eng_texts, jap_texts = zip(*pairs)
    dataset = tf.data.Dataset.from_tensor_slices((list(eng_texts), list(jap_texts)))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_datasets)
    # Cache the vectorized batches first, then shuffle and prefetch on top of
    # the cache. Caching after the shuffle would store one fixed batch order
    # and replay it every epoch, and prefetching into a cache buys nothing.
    return dataset.cache().shuffle(2048).prefetch(16)

# Build the input pipelines for the training and validation splits.
train_dataset, val_dataset = (
    make_datasets(split_pairs) for split_pairs in (train_pairs, val_pairs)
)