In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
import os
import tensorflow_datasets

In [10]:
# Load the 'ted_hrlr_translate/pt_to_en' dataset together with its metadata.
# With as_supervised=True each example is a 2-tuple rather than a feature dict.
examples, info = tensorflow_datasets.load(
    'ted_hrlr_translate/pt_to_en',
    with_info=True,
    as_supervised=True,
)
train_data = examples['train']
val_data = examples['validation']
print(info)



tfds.core.DatasetInfo(
    name='ted_hrlr_translate',
    version=0.0.1,
    description='Data sets derived from TED talk transcripts for comparing similar language pairs
where one is high resource and the other is low resource.
',
    urls=['https://github.com/neulab/word-embeddings-for-nmt'],
    features=Translation({
        'en': Text(shape=(), dtype=tf.string),
        'pt': Text(shape=(), dtype=tf.string),
    }),
    total_num_examples=54781,
    splits={
        'test': 1803,
        'train': 51785,
        'validation': 1193,
    },
    supervised_keys=('pt', 'en'),
    citation="""@inproceedings{Ye2018WordEmbeddings,
      author  = {Ye, Qi and Devendra, Sachan and Matthieu, Felix and Sarguna, Padmanabhan and Graham, Neubig},
      title   = {When and Why are pre-trained word embeddings useful for Neural Machine Translation},
      booktitle = {HLT-NAACL},
      year    = {2018},
      }""",
    redistribution_info=,
)



In [13]:
# Build one subword vocabulary (target size 2**13) from each element of the
# training pairs: `en_tokenizer` from the first element, `pt_tokenizer` from
# the second.
# NOTE(review): the dataset info printed earlier in this notebook reports
# supervised_keys=('pt', 'en'), so the first element is presumably the
# Portuguese sentence. If so, `en_tokenizer` is fitted on Portuguese text and
# `pt_tokenizer` on English text. The preprocessing cell applies them by
# position in the same way, so confirm and rename everywhere at once rather
# than changing this cell alone.
en_tokenizer = tensorflow_datasets.features.text.SubwordTextEncoder.build_from_corpus(
    (first.numpy() for first, _ in train_data),
    target_vocab_size=2 ** 13,
)
pt_tokenizer = tensorflow_datasets.features.text.SubwordTextEncoder.build_from_corpus(
    (second.numpy() for _, second in train_data),
    target_vocab_size=2 ** 13,
)


In [24]:
# Data preprocessing settings
max_length = 40      # pairs with a sequence longer than this are filtered out
batch_size = 64      # number of sentence pairs per padded batch
buffer_size = 20000  # shuffle buffer size used for the training set

def encode_to_subword(en_sentence, pt_sentence):
    """Turn a sentence pair into two lists of subword ids with start/end markers.

    Each sentence is encoded with its tokenizer and wrapped as
    ``[vocab_size] + ids + [vocab_size + 1]``: the two ids just past the
    tokenizer's vocabulary are used as the start and end markers.

    Args:
        en_sentence: object exposing ``.numpy()``; encoded with ``en_tokenizer``.
        pt_sentence: object exposing ``.numpy()``; encoded with ``pt_tokenizer``.

    Returns:
        Tuple ``(en_sequence, pt_sequence)`` of id lists.

    NOTE(review): the dataset info printed earlier in this notebook reports
    supervised_keys=('pt', 'en'), so when this is mapped over the dataset the
    first argument is presumably the Portuguese sentence despite its name.
    Confirm before relying on the ``en``/``pt`` labels.
    """
    def _with_markers(tokenizer, sentence):
        # Start marker, subword ids, end marker.
        start_id = tokenizer.vocab_size
        body_ids = tokenizer.encode(sentence.numpy())
        end_id = tokenizer.vocab_size + 1
        return [start_id] + body_ids + [end_id]

    en_sequence = _with_markers(en_tokenizer, en_sentence)
    pt_sequence = _with_markers(pt_tokenizer, pt_sentence)
    return en_sequence, pt_sequence

def filter_by_max_length(pt, en):
    """Tell whether both sequences of a pair fit within ``max_length``.

    Args:
        pt: first sequence of the pair.
        en: second sequence of the pair.

    Returns:
        Boolean tensor that is True only when ``tf.size`` of each argument is
        less than or equal to the module-level ``max_length``.
    """
    pt_fits = tf.size(pt) <= max_length
    en_fits = tf.size(en) <= max_length
    return tf.logical_and(pt_fits, en_fits)
    
def tf_encode_to_subword(en_sentence, pt_sentence):
    """Wrap ``encode_to_subword`` in ``tf.py_function`` for use in ``Dataset.map``.

    Args:
        en_sentence: first sentence tensor, forwarded to ``encode_to_subword``.
        pt_sentence: second sentence tensor, forwarded to ``encode_to_subword``.

    Returns:
        The two outputs of ``tf.py_function``, both declared as ``tf.int64``.
    """
    sentence_pair = [en_sentence, pt_sentence]
    output_dtypes = [tf.int64, tf.int64]
    return tf.py_function(encode_to_subword, sentence_pair, output_dtypes)
    
# Training pipeline: tokenize every pair, then drop pairs in which either
# sequence is longer than max_length.
train_dataset = train_data.map(tf_encode_to_subword)
train_dataset = train_dataset.filter(filter_by_max_length)
# Cache the tokenized, filtered pairs so the Python-side tokenizer
# (tf.py_function) is not executed again on every epoch. The cache sits
# before shuffle, so each epoch is still shuffled afresh.
train_dataset = train_dataset.cache()
# Shuffle, then pad every group of batch_size pairs to a common length.
train_dataset = train_dataset.shuffle(buffer_size).padded_batch(
    batch_size, padded_shapes=([-1], [-1]))
# Prepare upcoming batches while the current one is being consumed.
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

# Validation pipeline: same tokenization and length filter, no shuffling.
val_dataset = val_data.map(tf_encode_to_subword)
val_dataset = val_dataset.filter(filter_by_max_length)
# Pad every group of batch_size pairs to a common length.
val_dataset = val_dataset.padded_batch(batch_size, padded_shapes=([-1], [-1]))

In [25]:
# Sanity check: print the shapes of the two padded tensors in the first five
# validation batches.
# NOTE(review): the dataset info printed earlier reports
# supervised_keys=('pt', 'en'), so the first tensor (named `en` here)
# presumably holds the Portuguese ids — confirm.
for en,pt in val_dataset.take(5):
    print(en.shape,pt.shape)

(64, 40) (64, 40)
(64, 38) (64, 40)
(64, 40) (64, 40)
(64, 39) (64, 39)
(64, 37) (64, 38)
