In [1]:
from d2l import torch as d2l
import collections
import re

下载《时间机器》（The Time Machine）一书的文本数据集，并将其读取为文本行列表；每一行只保留英文字母并转换为小写。

In [2]:
# Register the book under the key 'time_machine' as a (URL, hash-string) pair;
# read_book() later passes the same key to d2l.download().
# NOTE(review): the 40-hex-digit string is presumably a SHA-1 checksum - confirm.
d2l.DATA_HUB['time_machine'] = (
    d2l.DATA_URL + 'timemachine.txt',
    '090b5e7e70c295757f55df93cb0a180b9691891a',
)

def read_book():
    """Load the 'time_machine' text as a list of cleaned lines.

    Every run of non-letter characters in a line is replaced by one space,
    then the line is stripped of surrounding whitespace and lower-cased.
    Lines that contain no letters become empty strings.

    Returns:
        list[str]: one cleaned string per line of the downloaded file.
    """
    # An explicit encoding keeps the result independent of the platform's
    # locale default. Undecodable bytes are replaced instead of raising: the
    # replacement character is not a letter, so the regex below turns it into
    # a space exactly like any other non-letter.
    with open(d2l.download('time_machine'), 'r',
              encoding='utf-8', errors='replace') as f:
        return [re.sub('[^A-Za-z]+', ' ', line).strip().lower() for line in f]

# Load the cleaned text once, then print two sample lines (indices 0 and 10).
lines = read_book()
for sample_idx in (0, 10):
    print(lines[sample_idx])

the time machine by h g wells
twinkled and his usually pale face was flushed and animated the


词元化：tokenize函数将文本行列表（lines）作为输入，列表中的每个元素是一个文本序列（一行文本）。tokenize函数将每个文本序列拆分为若干词元（token），词元是文本的基本单位（可以是单词或字符）。最后函数返回一个列表（list），其中每个元素是对应文本行的词元列表。

In [3]:
def tokenize(lines, token='word'):
    """Split each text line into a list of tokens.

    Args:
        lines: Iterable of strings, one per text line.
        token: 'word' splits each line on whitespace; 'char' splits each
            line into its individual characters.

    Returns:
        list[list[str]]: one token list per input line, in input order.

    Raises:
        ValueError: If ``token`` is neither 'word' nor 'char'.
    """
    if token == 'word':
        return [line.split() for line in lines]
    if token == 'char':
        return [list(line) for line in lines]
    # Fail loudly: printing and implicitly returning None would only surface
    # later as a confusing error far away from the real cause.
    raise ValueError('Error Token Type:' + str(token))

# Word-level tokenization of every line; print the first 22 token lists as a sample.
tokens = tokenize(lines, token='word')
for line_no in range(22):
    print(tokens[line_no])

['the', 'time', 'machine', 'by', 'h', 'g', 'wells']
[]
[]
[]
[]
['i']
[]
[]
['the', 'time', 'traveller', 'for', 'so', 'it', 'will', 'be', 'convenient', 'to', 'speak', 'of', 'him']
['was', 'expounding', 'a', 'recondite', 'matter', 'to', 'us', 'his', 'grey', 'eyes', 'shone', 'and']
['twinkled', 'and', 'his', 'usually', 'pale', 'face', 'was', 'flushed', 'and', 'animated', 'the']
['fire', 'burned', 'brightly', 'and', 'the', 'soft', 'radiance', 'of', 'the', 'incandescent']
['lights', 'in', 'the', 'lilies', 'of', 'silver', 'caught', 'the', 'bubbles', 'that', 'flashed', 'and']
['passed', 'in', 'our', 'glasses', 'our', 'chairs', 'being', 'his', 'patents', 'embraced', 'and']
['caressed', 'us', 'rather', 'than', 'submitted', 'to', 'be', 'sat', 'upon', 'and', 'there', 'was', 'that']
['luxurious', 'after', 'dinner', 'atmosphere', 'when', 'thought', 'roams', 'gracefully']
['free', 'of', 'the', 'trammels', 'of', 'precision', 'and', 'he', 'put', 'it', 'to', 'us', 'in', 'this']
['way', 'marking', 'the

构建词汇表类：词元的类型为字符串，而模型需要的输入为数字，因此词元不能直接输入模型进行训练，需要先将词元映射到从 0 开始的数字索引。首先将所有文本合并到一起，统计每个唯一词元的出现频率，得到的统计结果称为语料（corpus）；然后按照出现频率从高到低为每个唯一词元分配一个数字索引。出现频率过低的词元会被删除以降低复杂性。语料中不存在的词元或已被删除的词元都会被映射到未知词元 `<unk>`。此外，还可以额外传入一个保留词元列表，例如序列开始词元 `<bos>` 表示一个句子的开始，序列结束词元 `<eos>` 表示一个句子的结束。

In [18]:
class Vocab:
    """Vocabulary mapping tokens to integer indices and back.

    Index 0 is always the unknown token '<unk>'. Reserved tokens follow it,
    then the corpus tokens ordered from most to least frequent.

    Attributes set by ``__init__``:
        token_freq: list of (token, count) pairs, most frequent first.
        unk: index returned for tokens missing from the vocabulary (0).
        idx_to_token: list whose position i holds the token with index i.
        token_to_idx: dict mapping each token to its index.
    """

    def __init__(self, tokens=None, mini_freq=0, reserved_token=None):
        """Build the vocabulary from a token corpus.

        Args:
            tokens: Flat list of tokens or a list of token lists; None is
                treated as an empty corpus.
            mini_freq: Corpus tokens occurring fewer than this many times
                are left out of the vocabulary.
            reserved_token: Extra tokens placed right after '<unk>'; None
                means no reserved tokens.
        """
        if tokens is None:
            tokens = []
        if reserved_token is None:
            reserved_token = []
        # Count occurrences, then order tokens from most to least frequent.
        counter = corpus_counter(tokens)
        self.token_freq = sorted(counter.items(), key=lambda x: x[1], reverse=True)

        self.unk = 0
        uniq_tokens = ['<unk>'] + list(reserved_token)
        # Set membership keeps the duplicate check O(1) per token instead of
        # rescanning the growing list for every corpus token.
        already_listed = set(uniq_tokens)
        uniq_tokens += [token for token, freq in self.token_freq
                        if freq >= mini_freq and token not in already_listed]

        self.token_to_idx = dict()  # token -> index
        self.idx_to_token = []      # index -> token
        for token in uniq_tokens:
            self.idx_to_token.append(token)
            self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        """Return the number of entries in the vocabulary."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Return the index of a token, or a list of indices for a list/tuple.

        Tokens that are not in the vocabulary map to ``self.unk``. Nested
        lists/tuples are converted recursively.
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_token(self, indices):
        """Return the token for an index, or a list of tokens for a list/tuple.

        Nested lists/tuples are converted recursively.

        Raises:
            IndexError: If an index is outside the vocabulary.
        """
        if not isinstance(indices, (list, tuple)):
            # idx_to_token is a list, so it is indexed directly (no .get()).
            return self.idx_to_token[indices]
        return [self.to_token(idx) for idx in indices]
               
def corpus_counter(tokens):
    """Count how often each token occurs.

    ``tokens`` is either a flat list of tokens or a list of token lists.
    When it is empty, or its first element is a list, the elements of every
    inner line are gathered into one flat list before counting.

    Returns:
        collections.Counter: mapping from token to number of occurrences.
    """
    needs_flattening = len(tokens) == 0 or isinstance(tokens[0], list)
    if not needs_flattening:
        return collections.Counter(tokens)
    flat = []
    for line in tokens:
        flat.extend(line)
    return collections.Counter(flat)

In [14]:
# Build a vocabulary from `tokens` and print its first ten (token, index) pairs.
vocab = Vocab(tokens)
print([(token, idx) for token, idx in vocab.token_to_idx.items()][:10])

[('<unk>', 0), ('the', 1), ('i', 2), ('and', 3), ('of', 4), ('a', 5), ('to', 6), ('was', 7), ('in', 8), ('that', 9)]


In [16]:
# Echo the Vocab instance; the recorded output is the default object repr.
vocab

<__main__.Vocab at 0x17b93959b50>

将文本行转为数字索引列表

In [15]:
# Show two sample lines as tokens and as the corresponding vocabulary indices.
for line_no in (0, 10):
    print('word:', tokens[line_no])
    print('index:', vocab[tokens[line_no]])

word: ['the', 'time', 'machine', 'by', 'h', 'g', 'wells']
index: [1, 19, 50, 40, 2183, 2184, 400]
word: ['twinkled', 'and', 'his', 'usually', 'pale', 'face', 'was', 'flushed', 'and', 'animated', 'the']
index: [2186, 3, 25, 1044, 362, 113, 7, 1421, 3, 1045, 1]


In [20]:
def load_corpus_time_machine(max_token=-1):
    """Return the character-level vocabulary and index corpus of the book.

    The book lines are tokenized into characters, a Vocab is built over
    them, and every character is replaced by its vocabulary index in one
    flat list.

    Args:
        max_token: When positive, only that many leading indices are kept;
            otherwise the whole corpus is returned.

    Returns:
        tuple: (vocabu, corpus) - the Vocab and the flat list of indices.
    """
    char_lines = tokenize(read_book(), 'char')
    vocabu = Vocab(char_lines)

    corpus = []
    for chars in char_lines:
        corpus.extend(vocabu[ch] for ch in chars)

    if max_token > 0:
        return vocabu, corpus[:max_token]
    return vocabu, corpus

# Build the character-level vocabulary and corpus, then display both sizes.
vocabu, corpus = load_corpus_time_machine(max_token=-1)
(len(vocabu), len(corpus))

(28, 170580)