### 文本预处理

In [46]:
# 将文本作为字符串加载到内存
# 将字符串拆分为词元（如单词和字符）
# 建立一个词汇表，将拆分的词元映射到数字索引
# 将文本转换为数字索引序列，方便模型操作

#### 1、读取数据集

In [47]:
import collections
import re

In [48]:
def read_time_mechaine():
    with open('./The Echo of a Dying Star.txt','r') as f:
        lines = f.readlines()
    return [re.sub('[^A-Za-z]+',' ',line).strip().lower() for line in lines]

In [49]:
lines = read_time_mechaine()
print(f'# text lines:{len(lines)}' )
print(lines[0])
print(lines[10])

# text lines:71
title the echo of a dying star
initiate the sequence aris s voice was calm belying the tremor in her hands on the viewscreen the asteroid hangar bay doors retracted revealing the inky blackness dotted with pinpricks of light


#### 2、词元化

In [50]:
def tokenize(lines,token='word'):
    if token == 'word':
        return [line.split() for line in lines]
    elif token=='char':
        return [list(line) for line in lines]
    else:
        print('错误，未知词元类型:'+token)

In [51]:
tokens = tokenize(lines,token='char')
for i in range(2):
    print(tokens[i])

['t', 'i', 't', 'l', 'e', ' ', 't', 'h', 'e', ' ', 'e', 'c', 'h', 'o', ' ', 'o', 'f', ' ', 'a', ' ', 'd', 'y', 'i', 'n', 'g', ' ', 's', 't', 'a', 'r']
[]


#### 3、构建词汇表

In [52]:
#统计词元频率
def count_conpus(tokens):
    if len(tokens) == 0 or isinstance(tokens[0],list):
        #把词元列表展平成使用词元填充的一个列表
        tokens = [token for line in tokens for token in line]
    return collections.Counter(tokens)


In [53]:
len(count_conpus(tokens))

27

In [54]:
#python自带排序方法
sorted(count_conpus(tokens).items(),key=lambda x: x[1],reverse=True)

[(' ', 1063),
 ('e', 677),
 ('t', 499),
 ('a', 435),
 ('i', 386),
 ('n', 344),
 ('s', 342),
 ('r', 330),
 ('o', 304),
 ('h', 253),
 ('l', 213),
 ('d', 186),
 ('c', 163),
 ('u', 119),
 ('g', 117),
 ('w', 109),
 ('f', 105),
 ('p', 102),
 ('m', 92),
 ('y', 87),
 ('b', 63),
 ('v', 56),
 ('k', 46),
 ('q', 6),
 ('x', 5),
 ('j', 5),
 ('z', 2)]

In [55]:
#词汇表类
class Vocab:
    def __init__(self,tokens=None,min_freq=0,reserved_token=None):
        if tokens is None:
            tokens = []
        if reserved_token is None:
            reserved_token = []
        #按照出现的频率进行排序
        counter = count_conpus(tokens)
        self.token_freqs = sorted(counter.items(),key=lambda x: x[1],reverse=True)
        #未知的词元索引为0
        self.unk,uniq_tokens = 0,['<unk>'] + reserved_token
        uniq_tokens += [token for token,freq in self.token_freqs if freq >= min_freq and tokens not in uniq_tokens]
        self.idx_to_token,self.token_to_idx = [],dict()
        for token in uniq_tokens:
            self.idx_to_token.append(token)
            self.token_to_idx[token]=len(self.idx_to_token) - 1
    def __len__(self):
        return len(self.idx_to_token)
    def __getitem__(self,tokens):
        if not isinstance(tokens,(list,tuple)):
            return self.token_to_idx.get(tokens,self.unk)
        return [self.__getitem__(token) for token in tokens]
    def to_tokens(self,indices):
        if not isinstance(indices,(list,tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

In [56]:
vocab = Vocab(tokens)

In [57]:
len(vocab)

28

In [58]:
vocab.token_to_idx

{'<unk>': 0,
 ' ': 1,
 'e': 2,
 't': 3,
 'a': 4,
 'i': 5,
 'n': 6,
 's': 7,
 'r': 8,
 'o': 9,
 'h': 10,
 'l': 11,
 'd': 12,
 'c': 13,
 'u': 14,
 'g': 15,
 'w': 16,
 'f': 17,
 'p': 18,
 'm': 19,
 'y': 20,
 'b': 21,
 'v': 22,
 'k': 23,
 'q': 24,
 'x': 25,
 'j': 26,
 'z': 27}

In [59]:
for i in [0,10]:
    print('words:',tokens[i])
    print('indices:',vocab[tokens[i]])


words: ['t', 'i', 't', 'l', 'e', ' ', 't', 'h', 'e', ' ', 'e', 'c', 'h', 'o', ' ', 'o', 'f', ' ', 'a', ' ', 'd', 'y', 'i', 'n', 'g', ' ', 's', 't', 'a', 'r']
indices: [3, 5, 3, 11, 2, 1, 3, 10, 2, 1, 2, 13, 10, 9, 1, 9, 17, 1, 4, 1, 12, 20, 5, 6, 15, 1, 7, 3, 4, 8]
words: ['i', 'n', 'i', 't', 'i', 'a', 't', 'e', ' ', 't', 'h', 'e', ' ', 's', 'e', 'q', 'u', 'e', 'n', 'c', 'e', ' ', 'a', 'r', 'i', 's', ' ', 's', ' ', 'v', 'o', 'i', 'c', 'e', ' ', 'w', 'a', 's', ' ', 'c', 'a', 'l', 'm', ' ', 'b', 'e', 'l', 'y', 'i', 'n', 'g', ' ', 't', 'h', 'e', ' ', 't', 'r', 'e', 'm', 'o', 'r', ' ', 'i', 'n', ' ', 'h', 'e', 'r', ' ', 'h', 'a', 'n', 'd', 's', ' ', 'o', 'n', ' ', 't', 'h', 'e', ' ', 'v', 'i', 'e', 'w', 's', 'c', 'r', 'e', 'e', 'n', ' ', 't', 'h', 'e', ' ', 'a', 's', 't', 'e', 'r', 'o', 'i', 'd', ' ', 'h', 'a', 'n', 'g', 'a', 'r', ' ', 'b', 'a', 'y', ' ', 'd', 'o', 'o', 'r', 's', ' ', 'r', 'e', 't', 'r', 'a', 'c', 't', 'e', 'd', ' ', 'r', 'e', 'v', 'e', 'a', 'l', 'i', 'n', 'g', ' ', 't', '

In [60]:
# 整合所有的功能
def load_corpus_time_machine(max_tokens=-1):
    """返回时光机器文本数据集中的词元索引和词汇表"""
    lines = read_time_mechaine()
    tokens = tokenize(lines,'char')
    vocab = Vocab(tokens)
    #把所有文本行展平到一个列表
    corpus = [vocab[token] for line in tokens for token in line]
    if max_tokens > 0:
        corpus = corpus[:max_tokens]
    return corpus,vocab

In [61]:
corpus,vocab = load_corpus_time_machine()

In [62]:
len(corpus)

6109

In [63]:
len(vocab)

28

#### 4、序列数据采样


In [65]:
import random
import torch

In [98]:
#随机采样
def seq_data_iter_random(corpus,batch_size,num_steps):
    #考虑标签，所以-1
    num_subseqs = (len(corpus) -1) // num_steps
    #序列的起始索引
    initial_indices = list(range(random.randint(0,5),num_subseqs * num_steps,num_steps))
    print(initial_indices)
    #为了随机的效果，打乱initial_indices
    random.shuffle(initial_indices)
    def data(pos):
        return corpus[pos:pos+num_steps]
    num_batches = num_subseqs // batch_size
    for i in range(0,batch_size * num_batches,batch_size):
        initial_indices_per_batch = initial_indices[i:i+batch_size]
        #取数据
        x = [data(j) for j in initial_indices_per_batch]
        y = [data(j+1) for j in initial_indices_per_batch]
        yield torch.tensor(x),torch.tensor(y)

In [224]:
my_seq = list(range(35))
for x,y in seq_data_iter_random(my_seq,batch_size=2,num_steps=5):
    print('x:',x,'\ny:',y)

[1, 6, 11, 16, 21, 26]
x: tensor([[21, 22, 23, 24, 25],
        [ 1,  2,  3,  4,  5]]) 
y: tensor([[22, 23, 24, 25, 26],
        [ 2,  3,  4,  5,  6]])
x: tensor([[11, 12, 13, 14, 15],
        [16, 17, 18, 19, 20]]) 
y: tensor([[12, 13, 14, 15, 16],
        [17, 18, 19, 20, 21]])
x: tensor([[ 6,  7,  8,  9, 10],
        [26, 27, 28, 29, 30]]) 
y: tensor([[ 7,  8,  9, 10, 11],
        [27, 28, 29, 30, 31]])


In [202]:
#顺序采样
def seq_data_iter_sequential(corpus,batch_size,num_steps):
    #有效tokens长度
    index = random.randint(0,num_steps)
    num_tokens = ((len(corpus) - index -1) // batch_size) * batch_size
    xs = torch.tensor(corpus[index:index + num_tokens])
    ys = torch.tensor(corpus[index + 1:index + num_tokens + 1])
    #print(xs,ys)
    xs,ys = xs.reshape(batch_size,-1),ys.reshape(batch_size,-1)
    
    num_batches = xs.shape[1] // num_steps
    for i in range(0,num_steps * num_batches,num_steps):
        x = xs[:,i:i+num_steps]
        y = ys[:,i:i+num_steps]
        yield x,y

In [194]:
random.randint(0,5)

2

In [204]:
my_seq = list(range(35))
for x,y in seq_data_iter_sequential(my_seq,batch_size=2,num_steps=5):
    print('x:',x,'\ny:',y)

x: tensor([[ 5,  6,  7,  8,  9],
        [19, 20, 21, 22, 23]]) 
y: tensor([[ 6,  7,  8,  9, 10],
        [20, 21, 22, 23, 24]])
x: tensor([[10, 11, 12, 13, 14],
        [24, 25, 26, 27, 28]]) 
y: tensor([[11, 12, 13, 14, 15],
        [25, 26, 27, 28, 29]])


In [225]:
#把两个采样函数包装到类中，方便后续使用
class SeqDataLoader:
    def __init__(self,batch_size,num_steps,use_random_iter,max_tokens):
        if use_random_iter:
            self.data_iter_fn = seq_data_iter_random
        else:
            self.data_iter_fn = seq_data_iter_sequential
        self.corpus,self.vocab = load_corpus_time_machine(max_tokens)
        self.batch_size,self.num_steps = batch_size,num_steps
    def __iter__(self):
        return self.data_iter_fn(self.corpus,self.batch_size,self.num_steps)

In [226]:
def load_data_time_matchine(batch_size,num_steps,use_random_iter=False,max_tokens=10000):
    data_iter = SeqDataLoader(batch_size,num_steps,use_random_iter,max_tokens)
    return data_iter,data_iter.vocab

In [227]:
batch_size,num_steps = 2,10

In [228]:
loader,vocab = load_data_time_matchine(batch_size,num_steps=num_steps)

In [229]:
for i in loader:
    print(i)
    break

(tensor([[ 2, 13, 10,  9,  1,  9, 17,  1,  4,  1],
        [ 5,  6,  2,  1,  4,  6, 12,  1, 16,  2]]), tensor([[13, 10,  9,  1,  9, 17,  1,  4,  1, 12],
        [ 6,  2,  1,  4,  6, 12,  1, 16,  2,  1]]))
