In [21]:
import collections
import hashlib
import requests
import re
from dltool import dataprocess, train
import logging
import os

# 读取数据

In [22]:
# Base URL under which the dataset files are hosted.
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'

# Registry of downloadable datasets: name -> (download URL, expected SHA-1 hex digest).
DATA_HUB = {
    'time_machine': (DATA_URL + 'timemachine.txt',
                     '090b5e7e70c295757f55df93cb0a180b9691891a'),
}

In [23]:
def download(name, cache_dir=os.path.join('..','data')):
  """Download a file registered in DATA_HUB and return its local file name.

  The file is stored in ``cache_dir`` under the last path component of its
  URL. If a file with that name already exists and its SHA-1 digest matches
  the one registered in DATA_HUB, it is reused and nothing is downloaded.

  Args:
    name: key into DATA_HUB.
    cache_dir: directory used to store downloaded files; created if missing.

  Returns:
    Path of the local file.

  Raises:
    AssertionError: if ``name`` is not registered in DATA_HUB.
    requests.HTTPError: if the server answers with a 4xx/5xx status.
  """
  assert name in DATA_HUB, f"{name} 不存在于 {DATA_HUB}"
  url, sha1_hash = DATA_HUB[name]
  os.makedirs(cache_dir, exist_ok=True)
  fname = os.path.join(cache_dir, url.split('/')[-1])
  chunk_size = 1048576  # hash and download in 1 MiB pieces to bound memory use
  if os.path.exists(fname):
    sha1 = hashlib.sha1()
    with open(fname, 'rb') as f:
      for chunk in iter(lambda: f.read(chunk_size), b''):
        sha1.update(chunk)
    if sha1.hexdigest() == sha1_hash:
      return fname  # cache hit
  print(f'正在从{url}下载{fname}...')
  with requests.get(url, stream=True, verify=True) as r:
    # Fail before touching the cache file, so an HTTP error page is never
    # saved as if it were the dataset.
    r.raise_for_status()
    with open(fname, 'wb') as f:
      # Actually stream the body instead of buffering it all via r.content.
      for chunk in r.iter_content(chunk_size=chunk_size):
        f.write(chunk)
  return fname

In [24]:


def read_time_machine():
  """Load the Time Machine dataset as a list of cleaned text lines.

  Every run of characters other than ASCII letters is replaced by a single
  space; each line is then stripped of surrounding whitespace and lowercased.
  Lines that contain no letters therefore become empty strings.

  Returns:
    A list of str, one entry per line of the downloaded file.
  """
  # Pin the encoding: without it open() uses the platform locale, which makes
  # decoding differ between machines.
  with open(download('time_machine'), 'r', encoding='utf-8') as f:
    lines = f.readlines()
  return [re.sub('[^A-Za-z]+',' ',line).strip().lower() for line in lines]

# Load the cleaned text lines once; later cells reuse the global `lines`.
lines = read_time_machine()
# Report the total number of lines, then show two sample lines (index 0 and 10).
print(f'# 文本总行数: {len(lines)}')
print(lines[0])
print(lines[10])

# 文本总行数: 3221
the time machine by h g wells
twinkled and his usually pale face was flushed and animated the


# 词元化

In [25]:
def tokenize(lines, token='word'):
  """Split each text line into a list of tokens.

  Args:
    lines: iterable of strings.
    token: 'word' splits each line on whitespace; 'char' splits each line
      into individual characters.

  Returns:
    A list with one token list per input line. For any other ``token`` value
    an error message is printed and None is returned.
  """
  if token == 'char':
    return [list(text) for text in lines]
  if token == 'word':
    return [text.split() for text in lines]
  # Unknown token type: report it and fall through to an implicit None result.
  print('错误：未知词元类型：' + token)
  return None

# Word-level tokenization of every line; later cells reuse the global `tokens`.
tokens = tokenize(lines)
# Preview the first 11 tokenized lines (lines without letters show up as []).
for i in range(11):
  print(tokens[i])

['the', 'time', 'machine', 'by', 'h', 'g', 'wells']
[]
[]
[]
[]
['i']
[]
[]
['the', 'time', 'traveller', 'for', 'so', 'it', 'will', 'be', 'convenient', 'to', 'speak', 'of', 'him']
['was', 'expounding', 'a', 'recondite', 'matter', 'to', 'us', 'his', 'grey', 'eyes', 'shone', 'and']
['twinkled', 'and', 'his', 'usually', 'pale', 'face', 'was', 'flushed', 'and', 'animated', 'the']


# 词表
词表把字符串形式的词元映射为数字索引，因为数字更方便模型使用。
先统计语料中的唯一词元及其出现频率，再按频率从高到低依次分配索引（索引 0 保留给未知词元 `<unk>`）；出现次数少于 `min_freq` 的词元直接剔除，以降低复杂性。

In [26]:
class Vocab:
  """Vocabulary that maps text tokens to integer indices and back.

  Index 0 is always the unknown token '<unk>', followed by any reserved
  tokens, followed by corpus tokens in order of decreasing frequency.

  Attributes:
    idx_to_token: list where position i holds the token with index i.
    token_to_idx: dict mapping each token to its index.
  """
  def __init__(self, tokens=None, min_freq=0,reserved_tokens=None):
    """Build the vocabulary.

    Args:
      tokens: flat list of tokens or list of token lists; None means empty.
      min_freq: tokens occurring fewer than this many times are left out.
      reserved_tokens: tokens placed right after '<unk>' regardless of
        frequency; None means no reserved tokens.
    """
    if tokens is None:
      tokens = []
    if reserved_tokens is None:
      reserved_tokens = []
    # Sort (token, count) pairs by count, most frequent first.
    counter = count_corpus(tokens)
    self._token_freqs = sorted(counter.items(), key=lambda x: x[1], reverse=True)
    # The unknown token has index 0; list() lets callers pass a tuple as well.
    self.idx_to_token = ['<unk>'] + list(reserved_tokens)
    self.token_to_idx = {token: idx
                         for idx, token in enumerate(self.idx_to_token)}
    for token, freq in self._token_freqs:
      if freq < min_freq:
        # Frequencies are sorted in descending order, so all remaining
        # tokens are below the threshold too.
        break
      if token not in self.token_to_idx:
        self.idx_to_token.append(token)
        self.token_to_idx[token] = len(self.idx_to_token) - 1

  def __len__(self):
    """Return the number of entries, including '<unk>' and reserved tokens."""
    return len(self.idx_to_token)

  def __getitem__(self, tokens):
    """Return the index of a token, or a list of indices for a list/tuple.

    Tokens that are not in the vocabulary map to the unknown-token index.
    """
    if not isinstance(tokens, (list, tuple)):
      # unk is a method, so it must be called; passing the bound method as
      # the default would return a method object instead of index 0.
      return self.token_to_idx.get(tokens, self.unk())
    return [self.__getitem__(token) for token in tokens]

  def to_tokens(self, indices):
    """Return the token for an index, or a list of tokens for a list/tuple."""
    if not isinstance(indices, (list, tuple)):
      return self.idx_to_token[indices]
    return [self.idx_to_token[index] for index in indices]

  def unk(self):
    """Return the index of the unknown token, which is always 0."""
    return 0

  def token_freqs(self):
    """Return the (token, count) pairs sorted by decreasing count."""
    return self._token_freqs
  
def count_corpus(tokens):
  """Count how many times each token occurs.

  Args:
    tokens: either a flat list of tokens or a list of token lists. The input
      is treated as nested when it is empty or its first element is a list.

  Returns:
    A collections.Counter mapping each token to its number of occurrences.
  """
  is_nested = len(tokens) == 0 or isinstance(tokens[0], list)
  if is_nested:
    # Flatten the per-line token lists into a single list.
    flat = []
    for line in tokens:
      flat.extend(line)
    tokens = flat
  return collections.Counter(tokens)

In [27]:
# Build a word-level vocabulary from the tokenized lines.
vocab = Vocab(tokens)
# Show the first 10 (token, index) pairs; '<unk>' is at index 0.
print(list(vocab.token_to_idx.items())[:10])

[('<unk>', 0), ('the', 1), ('i', 2), ('and', 3), ('of', 4), ('a', 5), ('to', 6), ('was', 7), ('in', 8), ('that', 9)]


In [30]:
def load_corpus_time_machine(max_tokens=-1):
  """Return the token-index list and the vocabulary of the Time Machine dataset.

  The text is tokenized at character level.

  Args:
    max_tokens: if positive, keep only the first ``max_tokens`` indices;
      otherwise keep the whole corpus.

  Returns:
    A ``(corpus, vocab)`` pair: the flat list of token indices and the Vocab
    built from the character tokens.
  """
  char_lines = tokenize(read_time_machine(), 'char')
  vocab = Vocab(char_lines)
  # A text line is not necessarily a sentence or paragraph, so all lines are
  # flattened into one index sequence.
  corpus = []
  for line in char_lines:
    for ch in line:
      corpus.append(vocab[ch])
  if max_tokens > 0:
    return corpus[:max_tokens], vocab
  return corpus, vocab

# Load the full character-level corpus. Note this rebinds the global `vocab`,
# replacing the word-level vocabulary built in the earlier cell.
corpus, vocab = load_corpus_time_machine()
# Uncomment to inspect the corpus length and vocabulary size:
# len(corpus), len(vocab)

<__main__.Vocab at 0x7eff0e18e370>