In [10]:
import torch
from torch import nn
import numpy as np
import json

In [11]:
def tokenizer_ja(text):
    """Split *text* into tokens on runs of whitespace.

    The input is expected to be pre-segmented (tokens separated by
    whitespace); no morphological analysis is performed here.
    """
    # A no-argument split() already ignores leading/trailing whitespace.
    tokens = text.split()
    return tokens

def tokenizer_en(text):
    """Lowercase *text* and split it into tokens on runs of whitespace."""
    # A no-argument split() already ignores leading/trailing whitespace.
    lowered = text.lower()
    return lowered.split()

def text_iterator_ja(file_stream):
    """Yield every item of *file_stream* unchanged, one at a time."""
    yield from file_stream

def text_iterator_en(file_stream):
    """Yield each line of *file_stream* wrapped in ``<bos>`` / ``<eos>`` markers.

    The markers are separated from the line by a single space; the line
    itself (including any trailing newline) is left untouched.
    """
    for raw_line in file_stream:
        yield ' '.join(('<bos>', raw_line, '<eos>'))

def text_iterator_vocab(file_stream, tokenizer):
    """Lazily yield ``tokenizer(line)`` for each line of *file_stream*."""
    yield from map(tokenizer, file_stream)

In [14]:
from torch.utils.data import Dataset,DataLoader
class MyDataset(Dataset):
    """Parallel Japanese/English sentence dataset yielding padded token-id tensors.

    Args:
        data: Mapping-like object indexable by ``'text_ja'`` and ``'text_en'``,
            each giving an iterable of sentences (strings) of equal length.
        vocab_en: English vocabulary; must support ``len()`` and
            ``lookup_indices(list_of_tokens)``.
        vocab_ja: Japanese vocabulary with the same interface.

    Each item is a ``(ja_ids, en_ids)`` pair of ``torch.long`` tensors. Sentences
    shorter than the configured ``max_len`` are right-padded with ``'<pad>'``;
    sentences that are already longer are returned as-is (not truncated).
    """

    def __init__(self, data, vocab_en, vocab_ja):
        self.data = data
        self.sentences_ja = list(data['text_ja'])
        self.sentences_en = list(data['text_en'])
        self.vocab_en = vocab_en
        self.vocab_ja = vocab_ja
        self.vocab_size_ja = len(vocab_ja)
        self.vocab_size_en = len(vocab_en)
        # Filled lazily by get_max_len() so construction never touches disk.
        self._max_len = None

    def __len__(self):
        # Count sentences rather than len(self.data): for a plain dict the
        # latter would be the number of columns, not the number of rows.
        return len(self.sentences_ja)

    def tokenizer_ja(self, text):
        """Split whitespace-separated Japanese text into tokens."""
        return text.strip().split()

    def tokenizer_en(self, text):
        """Lowercase English text and split it into whitespace-separated tokens."""
        return text.strip().lower().split()

    def text_iterator_ja(self, file_stream):
        """Yield every line of *file_stream* unchanged."""
        for line in file_stream:
            yield line

    def text_iterator_en(self, file_stream):
        """Yield every line of *file_stream* wrapped in ``<bos>`` / ``<eos>``."""
        for line in file_stream:
            yield '<bos>' + ' ' + line + ' ' + '<eos>'

    def text_iterator_vocab(self, file_stream, tokenizer):
        """Yield ``tokenizer(line)`` for every line of *file_stream*."""
        for line in file_stream:
            yield tokenizer(line)

    def get_max_len(self):
        """Return ``max_len`` from ``../data/config.json``.

        The file is read on the first call only and the value is cached on the
        instance; previously it was re-read for every padding step.

        Raises:
            OSError: if the config file cannot be opened.
            KeyError: if the config has no ``"max_len"`` entry.
        """
        # getattr keeps this working for instances created without __init__.
        if getattr(self, '_max_len', None) is None:
            with open('../data/config.json', 'r', encoding='utf-8') as file:
                config = json.load(file)
            self._max_len = config["max_len"]
        return self._max_len

    def _encode(self, sentence, tokenizer, vocab):
        """Tokenize one sentence, right-pad it to max_len, and map it to an id tensor."""
        tokens = tokenizer(sentence)
        # A non-positive repeat count yields an empty list, so over-long
        # sentences pass through unchanged.
        tokens.extend(['<pad>'] * (self.get_max_len() - len(tokens)))
        return torch.tensor(vocab.lookup_indices(tokens), dtype=torch.long)

    def __getitem__(self, idx):
        """Return ``(ja_ids, en_ids)`` tensors for the sentence pair at *idx*."""
        # Only the requested pair is tokenized; the whole corpus used to be
        # re-tokenized on every access.
        sentence_id_ja = self._encode(
            self.sentences_ja[idx], self.tokenizer_ja, self.vocab_ja
        )
        sentence_id_en = self._encode(
            self.sentences_en[idx], self.tokenizer_en, self.vocab_en
        )
        return sentence_id_ja, sentence_id_en