In [100]:
import requests
import pandas as pd
from tqdm import tqdm
import numpy as np

In [103]:
def download_file(url, local_filename, timeout=30):
    """Stream the resource at `url` into `local_filename`.

    The body is fetched in 8 KiB chunks so the whole file is never held in
    memory; tqdm counts the chunks as they are written.

    Args:
        url: Address of the file to download.
        local_filename: Destination path; opened in binary write mode, so an
            existing file is overwritten.
        timeout: Seconds to wait for the server (connect, and between bytes
            received) before giving up. Without a timeout a stalled server
            would block the cell indefinitely.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within `timeout`.
    """
    with requests.get(url, stream=True, timeout=timeout) as r:
        # Fail before creating the output file if the request was rejected.
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in tqdm(r.iter_content(chunk_size=8192)):
                f.write(chunk)
# Where the KFTT archive is stored locally, and where it is fetched from.
local_filename = "../data/kftt-data-1.0.tar.gz"
url = "https://www.phontron.com/kftt/download/kftt-data-1.0.tar.gz"
# One-off setup steps, kept disabled so re-running the notebook does not
# download the archive again.
# NOTE(review): as written, tar extracts into the current directory, while the
# paths cell below reads from ../data/kftt-data-1.0/ — confirm the extraction
# target (e.g. add `-C ../data`) before re-enabling.
#download_file(url, local_filename)
#!tar -xvzf ../data/kftt-data-1.0.tar.gz

12116it [00:27, 437.59it/s]


In [107]:
# Directory with the tokenized corpus files, relative to the notebook.
folder_path = '../data/kftt-data-1.0/data/tok/'

# Japanese / English sides of the parallel corpus.
# NOTE(review): the variables are named train_* but point at the kyoto-dev
# files — confirm this is intentional.
train_file_ja_path, train_file_en_path = (
    folder_path + 'kyoto-dev.ja',
    folder_path + 'kyoto-dev.en',
)

In [108]:
def tokenizer_ja(text):
    """Split `text` into whitespace-separated tokens.

    Leading and trailing whitespace (including the line break) is removed
    first; case is left untouched.
    """
    trimmed = text.strip()
    return trimmed.split()

def tokenizer_en(text):
    """Lower-case `text` and split it into whitespace-separated tokens.

    Leading and trailing whitespace (including the line break) is removed
    before lower-casing.
    """
    normalized = text.strip().lower()
    return normalized.split()

def text_iterator_ja(file_stream):
    """Yield every line of `file_stream` unchanged (line breaks included)."""
    yield from file_stream

def text_iterator_en(file_stream):
    """Yield each line of `file_stream` wrapped in `<bos>` / `<eos>` markers.

    The markers are joined to the line with single spaces; the line itself,
    including its trailing line break, is not modified.
    """
    for raw_line in file_stream:
        yield ' '.join(('<bos>', raw_line, '<eos>'))

def text_iterator_vocab(file_stream,tokenizer):
    """Lazily apply `tokenizer` to each line of `file_stream`.

    Yields one tokenizer result per input line, in order.
    """
    yield from map(tokenizer, file_stream)

In [109]:
from sklearn.model_selection import train_test_split
# Read both sides of the parallel corpus into memory.
# The encoding is given explicitly: the files contain Japanese text, and the
# platform default encoding is not UTF-8 everywhere.
with open(train_file_ja_path, "r", encoding="utf-8") as f:
  # Japanese lines are kept as-is.
  sentences_ja = np.array(list(text_iterator_ja(f)))
with open(train_file_en_path, "r", encoding="utf-8") as f:
  # English lines get <bos>/<eos> markers added by text_iterator_en.
  sentences_en = np.array(list(text_iterator_en(f)))

# One row per sentence pair; pandas raises ValueError if the two files do not
# have the same number of lines.
text_df = pd.DataFrame({'text_en': sentences_en, 'text_ja': sentences_ja})

In [110]:
import torchtext
from torchtext import data
from torchtext import datasets
from torchtext.vocab import FastText
import torch

In [111]:
# Build one vocabulary per language from the tokenized corpus files.
# The four special tokens are listed first in `specials`.
# Files are opened as UTF-8 explicitly so that decoding the Japanese text does
# not depend on the platform's default encoding.
with open(train_file_ja_path, "r", encoding="utf-8") as f:
    vocab_ja = torchtext.vocab.build_vocab_from_iterator(
        iterator=text_iterator_vocab(f, tokenizer_ja),
        specials=['<pad>','<unk>','<eos>','<bos>']
    )

with open(train_file_en_path, "r", encoding="utf-8") as f:
    vocab_en = torchtext.vocab.build_vocab_from_iterator(
        iterator=text_iterator_vocab(f, tokenizer_en),
        specials=['<pad>','<unk>','<eos>','<bos>']
    )

In [112]:
# A vocabulary has a single default index, returned for any token that is not
# in it. Each call to set_default_index replaces the previous one, so the
# earlier sequence of four calls left '<eos>' as the fallback; unknown words
# must map to '<unk>' instead.
vocab_ja.set_default_index(vocab_ja['<unk>'])
vocab_en.set_default_index(vocab_en['<unk>'])

In [113]:
def _encode_sentences(sentences, tokenizer, vocab):
    """Tokenize each sentence and look its tokens up in `vocab`."""
    return [vocab.lookup_indices(tokenizer(sentence)) for sentence in sentences]

# One list of vocabulary indices per sentence, for each language.
sentences_id_ja = _encode_sentences(sentences_ja, tokenizer_ja, vocab_ja)
sentences_id_en = _encode_sentences(sentences_en, tokenizer_en, vocab_en)

In [114]:
# Token count of the longest encoded sentence in each language.
max_length_ja = np.array(list(map(len, sentences_id_ja))).max()
max_length_en = np.array(list(map(len, sentences_id_en))).max()

In [115]:
from torch.utils.data import Dataset,DataLoader
class MyDataset(Dataset):
  """Parallel-sentence dataset yielding padded index tensors.

  Each item is a `(japanese_ids, english_ids)` pair of 1-D `torch.long`
  tensors. Sentences are tokenized on whitespace (English is lower-cased),
  right-padded with '<pad>' up to the padding length, and converted to
  indices with the corresponding vocabulary. Sentences already longer than
  the padding length are returned unpadded and untruncated.

  Args:
    data: Table with 'text_ja' and 'text_en' columns (one sentence pair
      per row).
    vocab_en: English vocabulary; must provide `lookup_indices` and `len`.
    vocab_ja: Japanese vocabulary; same requirements.
    max_len: Padding length. When omitted, the module-level `MAX_LEN` is
      read at item-access time, which keeps existing three-argument callers
      working.
  """
  def __init__(self,data,vocab_en,vocab_ja,max_len=None):
    self.data = data
    self.sentences_ja = list(data['text_ja'])
    self.sentences_en = list(data['text_en'])
    self.vocab_en = vocab_en
    self.vocab_ja = vocab_ja
    self.vocab_size_ja = len(vocab_ja)
    self.vocab_size_en = len(vocab_en)
    self.max_len = max_len
  def __len__(self):
    """Number of sentence pairs."""
    return len(self.data)
  def tokenizer_ja(self,text):
    """Split Japanese text on whitespace."""
    return text.strip().split()
  def tokenizer_en(self,text):
    """Lower-case English text and split it on whitespace."""
    return text.strip().lower().split()
  def text_iterator_ja(self,file_stream):
    """Yield each line of `file_stream` unchanged."""
    for line in file_stream:
        yield line
  def text_iterator_en(self,file_stream):
    """Yield each line of `file_stream` wrapped in <bos>/<eos> markers."""
    for line in file_stream:
        yield '<bos>'+' '+line+' '+'<eos>'
  def text_iterator_vocab(self,file_stream,tokenizer):
    """Yield `tokenizer(line)` for each line of `file_stream`."""
    for line in file_stream:
        yield tokenizer(line)

  def _padding_length(self):
    """Return the padding length, falling back to the global MAX_LEN."""
    # getattr: instances created before `max_len` existed lack the attribute.
    max_len = getattr(self, 'max_len', None)
    if max_len is None:
      max_len = MAX_LEN
    # int(): the notebook computes MAX_LEN with numpy; use a plain int for
    # the list arithmetic below.
    return int(max_len)

  def _encode(self,tokens,vocab,max_len):
    """Right-pad `tokens` with '<pad>' and convert them to a long tensor."""
    # A non-positive count yields no padding, so long sentences pass through.
    padded = tokens + ['<pad>'] * (max_len - len(tokens))
    return torch.tensor(vocab.lookup_indices(padded),dtype=torch.long)

  def __getitem__(self,idx):
    """Return the `(japanese_ids, english_ids)` tensors for pair `idx`."""
    max_len = self._padding_length()
    # Tokenize only the requested pair; tokenizing the whole corpus on every
    # access made one pass over the data quadratic in its size.
    tokens_ja = self.tokenizer_ja(self.sentences_ja[idx])
    tokens_en = self.tokenizer_en(self.sentences_en[idx])
    sentence_id_ja = self._encode(tokens_ja,self.vocab_ja,max_len)
    sentence_id_en = self._encode(tokens_en,self.vocab_en,max_len)
    return sentence_id_ja,sentence_id_en
# Hold out 20% of the sentence pairs for testing. The seed is fixed so that
# re-running the notebook produces the same split.
train_df,test_df = train_test_split(text_df,test_size=0.2,random_state=42)
# Keep the DataFrames under their own names instead of rebinding
# train_data/test_data from a DataFrame to a Dataset within one cell.
train_data = MyDataset(train_df,vocab_en,vocab_ja)
test_data = MyDataset(test_df,vocab_en,vocab_ja)

In [116]:
# Persist both vocabularies; the relative paths resolve against the current
# working directory.
torch.save(obj=vocab_ja, f='vocab_ja.pth')
torch.save(obj=vocab_en, f='vocab_en.pth')

In [117]:
# Persist the wrapped train/test datasets as whole objects.
# NOTE(review): these are pickled MyDataset instances, so loading them
# presumably requires the MyDataset class to be importable — confirm in the
# consumer.
torch.save(obj=train_data, f='../data/train_data.pth')
torch.save(obj=test_data, f='../data/test_data.pth')

In [97]:
# Vocabulary sizes: Japanese is the source side, English the target side.
tgt_vocab_size = len(vocab_en)
src_vocab_size = len(vocab_ja)
# Shared padding length: the longer of the two per-language maxima.
# MyDataset.__getitem__ reads MAX_LEN as a global, so this cell has to run
# before any dataset item is fetched.
MAX_LEN = np.array([max_length_ja, max_length_en]).max()

In [81]:
import json

# Values stored in config.json; converted to plain ints so they are JSON
# serializable.
config_data = dict(
    max_len=int(np.max([max_length_ja, max_length_en])),
    src_vocab_size=int(src_vocab_size),
    tgt_vocab_size=int(tgt_vocab_size),
)

# Serialize first, then write the text out in one go.
config_text = json.dumps(config_data, indent=4)
with open('../data/config.json','w') as json_file:
    json_file.write(config_text)

In [118]:
# Both loaders share the same batching options.
# NOTE(review): the test loader also shuffles and drops the final incomplete
# batch, so some test pairs are never yielded — confirm this is intended.
loader_options = {'batch_size': 16, 'shuffle': True, 'drop_last': True}
train_loader = DataLoader(train_data, **loader_options)
test_loader = DataLoader(test_data, **loader_options)

In [119]:
# Sanity check: pull a single batch from the training loader and print its
# size, shape, one encoded sentence and the first 30 vocabulary entries for
# each language.
tmp = iter(train_loader)
# Placeholders; overwritten by the first batch below.
ja_text = 0
en_text = 0
for batch in tmp:
  # MyDataset returns (japanese_ids, english_ids), so the batch unpacks in
  # that order.
  ja_text,en_text = batch
  print("文章の量は{}".format(len(en_text)))
  print("en_textのshapeは{}".format(en_text.shape))
  # NOTE(review): the label reads "first encoded sentence", but index 10
  # (the 11th sentence of the batch) is what gets printed — confirm.
  print("最初のencoding文は{}".format(en_text[10][0:30]))
  print("辞書の最初の30文字は{}".format(vocab_en.lookup_tokens(range(30))))
  print("文の最大長さは{}".format(len(en_text[10])))
  print("------------------------------------------------")
  print("文章の量は{}".format(len(ja_text)))
  print("ja_textのshapeは{}".format(ja_text.shape))
  print("最初のencoding文は{}".format(ja_text[10][0:30]))
  print("辞書の最初の30文字は{}".format(vocab_ja.lookup_tokens(range(30))))
  print("文の最大長さは{}".format(len(ja_text[10])))
  print("------------------------------------------------")
  # Only the first batch is inspected.
  break

文章の量は16
en_textのshapeはtorch.Size([16, 159])
最初のencoding文はtensor([   3,  101,    6, 2623,    8,  950,    2,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0])
辞書の最初の30文字は['<pad>', '<unk>', '<eos>', '<bos>', 'the', ',', 'of', '.', 'and', 'in', '(', ')', 'to', 'was', 'a', '"', 'is', 'as', "'s", 'that', 'by', 'kyoto', 'for', 'it', 'his', 'university', 'with', 'he', 'emperor', '-']
文の最大長さは159
------------------------------------------------
文章の量は16
ja_textのshapeはtorch.Size([16, 159])
最初のencoding文はtensor([4458,  501,  190,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0])
辞書の最初の30文字は['<pad>', '<unk>', '<eos>', '<bos>', 'の', '、', 'に', '。', 'は', 'を', 'る', 'た', 'て', 'と', 'し', '（', '）', 'が', 'い', '年', 'で', 'な', 'あ', 'っ', 'れ', '・', 'さ', 'り', '-', '京都']
文の最大長さは159
----