In [None]:
# 安装依赖
!pip install -U pip
!pip install -U dill
!pip install -U nltk==3.4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pip
  Downloading pip-23.0.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 22.0.4
    Uninstalling pip-22.0.4:
      Successfully uninstalled pip-22.0.4
Successfully installed pip-23.0.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting dill
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dill
Successfully installed dill-0.3.6
[0mLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nltk==3.4
  Downloadi

- 本 notebook 中使用 nltk 及其 lm 模块进行 N-Gram 模型的训练和生成
- 导入依赖
- 我们会从简单的句子开始了解如何使用nltk进行分词和分词预处理的操作
- 之后我们会引入真实的数据集进行操作和训练

In [None]:

from nltk.util import pad_sequence
from nltk.util import bigrams
from nltk.util import ngrams
from nltk.util import everygrams
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten

- 如果想要训练一个 2-gram 模型，首先需要将一段文本转化为 2-gram 的格式
- 看一个例子

In [None]:

# Toy corpus: two already-tokenized "sentences".
text = [['a', 'b', 'c'], ['a', 'c', 'd', 'c', 'e', 'f']]
# Bigrams (2-grams) of the first sentence.
list(bigrams(text[0]))

[('a', 'b'), ('b', 'c')]

In [None]:
list(ngrams(text[1], n = 3)) # Trigrams (3-grams) of the second sentence

[('a', 'c', 'd'), ('c', 'd', 'c'), ('d', 'c', 'e'), ('c', 'e', 'f')]

- 在 text[0] 的 2-gram 分词结果中，b 既作为分词组的 tail 又作为 head，共出现了两次，但是 a 和 c 则各只出现了一次
- 是否有办法将 a 和 c 作为原句中 head 和 tail 的关系表达出来呢？
- 可以通过使用 padding 符号的方式补全原句中的开头和结尾，补全后再进行分词操作
- padding 符号中使用`<s>` 作为head标签，`</s>`作为tail标签

In [None]:
# `n` is the order of the n-grams that will be built later; it determines how many
# padding symbols are added. With n=2 one symbol is added before and one after
# (2 in total), with n=3 two before and two after (4 in total), and so on.
from nltk.util import pad_sequence
list(pad_sequence(text[0],
        pad_left = True, left_pad_symbol='<s>',
        pad_right = True, right_pad_symbol= '</s>',
        n=2))

['<s>', 'a', 'b', 'c', '</s>']

In [None]:
# Pad the first sentence for n=2.
padded_sent = list(pad_sequence(text[0], 
                pad_left=True, left_pad_symbol="<s>", 
                pad_right=True, right_pad_symbol="</s>", 
                n=2))
list(ngrams(padded_sent, n=2)) # Build the bigrams of the padded token sequence

[('<s>', 'a'), ('a', 'b'), ('b', 'c'), ('c', '</s>')]

In [None]:
list(pad_sequence(text[0],
        pad_left=True, left_pad_symbol="<s>",
        pad_right=True, right_pad_symbol="</s>",
        n=3))
# For 3-grams, two padding symbols are added before and two after.

['<s>', '<s>', 'a', 'b', 'c', '</s>', '</s>']

In [None]:
# Pad the first sentence for n=3, then build the trigrams of the padded sequence.
padded_sent = list(pad_sequence(text[0], 
        pad_left=True, left_pad_symbol="<s>", 
        pad_right=True, right_pad_symbol="</s>", n=3))
list(ngrams(padded_sent, n=3))

[('<s>', '<s>', 'a'),
 ('<s>', 'a', 'b'),
 ('a', 'b', 'c'),
 ('b', 'c', '</s>'),
 ('c', '</s>', '</s>')]

- nltk.lm 提供了上述代码的简化版本

In [None]:
from nltk.lm.preprocessing import pad_both_ends
list(pad_both_ends(text[0],n = 2)) # Padding for 2-grams

['<s>', 'a', 'b', 'c', '</s>']

In [None]:
list(bigrams(pad_both_ends(text[0],n=2))) # Bigrams of the padded sentence

[('<s>', 'a'), ('a', 'b'), ('b', 'c'), ('c', '</s>')]

- 通过 NLTK 的 everygrams，我们可以一次性生成从 1-gram 到 max_len-gram 的所有分词结果，而不需要针对每个具体的 N 分别进行分词

In [None]:
from nltk.util import everygrams
# NOTE: despite its name, `padded_bigrams` holds the padded token list, not bigrams.
padded_bigrams = list(pad_both_ends(text[0], n = 2))
list(everygrams(padded_bigrams,max_len=2)) # Produces the 1-grams as well as the 2-grams

[('<s>',),
 ('a',),
 ('b',),
 ('c',),
 ('</s>',),
 ('<s>', 'a'),
 ('a', 'b'),
 ('b', 'c'),
 ('c', '</s>')]

In [None]:
from nltk.util import everygrams
# NOTE: despite its name, `padded_bigrams` holds the token list padded for n=3.
padded_bigrams = list(pad_both_ends(text[0], n = 3))
list(everygrams(padded_bigrams,max_len=3)) # Produces the 1-grams, 2-grams and 3-grams

[('<s>',),
 ('<s>',),
 ('a',),
 ('b',),
 ('c',),
 ('</s>',),
 ('</s>',),
 ('<s>', '<s>'),
 ('<s>', 'a'),
 ('a', 'b'),
 ('b', 'c'),
 ('c', '</s>'),
 ('</s>', '</s>'),
 ('<s>', '<s>', 'a'),
 ('<s>', 'a', 'b'),
 ('a', 'b', 'c'),
 ('b', 'c', '</s>'),
 ('c', '</s>', '</s>')]

- 在模型的训练和evaluation 期间，模型高度依赖于基于数据集的词汇表，因此，我们还需要定义模型的词汇表

In [None]:
from nltk.lm.preprocessing import flatten
# Pad every sentence and flatten the result into one token stream. Tokens are not
# de-duplicated: this builds the corpus text, not a lookup table of unique words.
list(flatten(pad_both_ends(sent, n=2) for sent in text))

['<s>', 'a', 'b', 'c', '</s>', '<s>', 'a', 'c', 'd', 'c', 'e', 'f', '</s>']

- 上述过程我们分别进行了三个步骤的操作
1. 将文本句子基于 N-gram 进行padding补齐
2. 将句子进行分词
3. 将句子的构成转化为词汇表

- 有没有什么办法可以让这三步过程更加简单？

In [None]:
from nltk.lm.preprocessing import padded_everygram_pipeline 
# One call produces both the everygrams of each padded sentence and the
# flattened, padded token stream of the whole corpus.
# Both results are iterators, so they are materialized with list() before printing.
training_ngrams, padded_sentences = padded_everygram_pipeline(2, text)
for ngramlize_sent in training_ngrams:
    print('分词结果:',list(ngramlize_sent))
    print()
print('--------------split line---------------')
print('词汇表',list(padded_sentences))

分词结果: [('<s>',), ('a',), ('b',), ('c',), ('</s>',), ('<s>', 'a'), ('a', 'b'), ('b', 'c'), ('c', '</s>')]

分词结果: [('<s>',), ('a',), ('c',), ('d',), ('c',), ('e',), ('f',), ('</s>',), ('<s>', 'a'), ('a', 'c'), ('c', 'd'), ('d', 'c'), ('c', 'e'), ('e', 'f'), ('f', '</s>')]

--------------split line---------------
词汇表 ['<s>', 'a', 'b', 'c', '</s>', '<s>', 'a', 'c', 'd', 'c', 'e', 'f', '</s>']


- 进入真实数据环节

In [None]:
try:  # Prefer NLTK's default tokenizers.
    from nltk import word_tokenize, sent_tokenize
    # Smoke-test them: on some machines they import fine but are unusable
    # because of setup problems, which only shows up when they are called.
    word_tokenize(sent_tokenize("This is a foobar sentence. Yes it is.")[0])
except Exception:  # Fall back to simpler tokenizers; Ctrl-C / SystemExit still propagate.
    import re
    from nltk.tokenize import ToktokTokenizer

    # Regex-based sentence splitter; see https://stackoverflow.com/a/25736515/610569
    def sent_tokenize(x):
        """Split the text `x` into sentences using a regular expression."""
        return re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', x)

    # ToktokTokenizer is one of NLTK's built-in tokenizers; it serves as the
    # fallback word tokenizer.
    toktok = ToktokTokenizer()
    word_tokenize = toktok.tokenize

In [None]:
import os
import requests
import io #codecs
# Download the text corpus.
url = "https://gist.githubusercontent.com/alvations/53b01e4076573fea47c6057120bb017a/raw/b01ff96a5f76848450e648f35da6497ca9454e4a/language-never-random.txt"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently saving an error page as the corpus.
response.raise_for_status()
text = response.content.decode('utf8')
# Keep a local copy of the downloaded text.
with io.open('language-never-random.txt', 'w', encoding='utf8') as fout:
    fout.write(text)

In [None]:
# Peek at the beginning of the downloaded text.

print(text[:500])

                       Language is never, ever, ever, random

                                                               ADAM KILGARRIFF




Abstract
Language users never choose words randomly, and language is essentially
non-random. Statistical hypothesis testing uses a null hypothesis, which
posits randomness. Hence, when we look at linguistic phenomena in cor-
pora, the null hypothesis will never be true. Moreover, where there is enough
data, we shall (almost) always be able to establish 


In [None]:
# Tokenize the text: split it into sentences, then split every sentence into
# lower-cased word tokens.

tokenized_text = [
    [token.lower() for token in word_tokenize(sentence)]
    for sentence in sent_tokenize(text)
]
len(tokenized_text)

155

In [None]:
tokenized_text[0]

['language',
 'is',
 'never',
 ',',
 'ever',
 ',',
 'ever',
 ',',
 'random',
 'adam',
 'kilgarriff',
 'abstract',
 'language',
 'users',
 'never',
 'choose',
 'words',
 'randomly',
 ',',
 'and',
 'language',
 'is',
 'essentially',
 'non-random',
 '.']

In [None]:
# Build the training data as everygrams with a maximum order of 3.
n = 3
train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)

- 这里我们使用最大似然估计 MLE 的方式进行模型的训练
- MLE的初始化使用N-Gram的N值进行

In [None]:
from nltk.lm import MLE
model = MLE(n) # Let's train a 3-gram model; n was set to 3 when the training data was built

In [None]:
# A freshly initialized MLE model starts with an empty vocabulary
# (a lookup table of known words, as opposed to the corpus token stream).

len(model.vocab)

0

In [None]:
# Train the model on the everygrams; the padded token stream is passed as the second argument.
model.fit(train_data, padded_sents)

In [None]:
print(model.vocab) # Unknown words are represented by the <UNK> label

<Vocabulary with cutoff=1 unk_label='<UNK>' and 1429 items>


In [None]:
# Look up a sentence taken from the training data in the vocabulary.
print(model.vocab.lookup(tokenized_text[0]))

('language', 'is', 'never', ',', 'ever', ',', 'ever', ',', 'random', 'adam', 'kilgarriff', 'abstract', 'language', 'users', 'never', 'choose', 'words', 'randomly', ',', 'and', 'language', 'is', 'essentially', 'non-random', '.')


In [None]:
# Looking up a word that is not in the training data returns the <UNK> label in its place.

print(model.vocab.lookup('language is never random lah .'.split()))

('language', 'is', 'never', 'random', '<UNK>', '.')


In [None]:
print(model.counts) # N-gram counts collected during training, up to 3-grams

<NgramCounter with 3 ngram orders and 18687 ngrams>


In [None]:
model.counts['language'] # Number of times the word 'language' occurs in the corpus

25

In [None]:
# Number of times the 2-gram 'language is' occurs
model.counts[['language']]['is'] #  Count('is'|'language')

11

In [None]:
# Number of times the 3-gram 'language is never' occurs
model.counts[['language', 'is']]['never'] # Count('never'|'language is')

7

- 训练语言模型的真正目的是让它对特定上下文中的单词概率进行评分
- 从而帮助我们选择更合适的候选词
- MLE 会返回相对频率作为得分

In [None]:
model.score('language') # P('language')

0.003916040100250626

In [None]:
model.score('is', 'language'.split())  # P('is'|'language')

0.44

In [None]:
model.score('never', 'language is'.split())  # P('never'|'language is')

0.6363636363636364

- 需要注意的是，没有出现在文本数据集中的单词会被识别为 `<UNK>`

In [None]:
# 'lah' is not in the training data, so it is scored the same as '<UNK>'.
model.score("<UNK>") == model.score("lah")

True

In [None]:
# Check that 'leh' gets the same score as '<UNK>'.
model.score("<UNK>") == model.score("leh")

True

In [None]:
# Check that 'lor' gets the same score as '<UNK>'.
model.score("<UNK>") == model.score("lor")

True

## 使用 N-Gram 生成文本

In [None]:
print(model.generate(20, random_seed=7)) # Have the model generate a sequence of 20 tokens

['ate', 'inferences', 'are', 'drawn.', '2', '.', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>']


- 生成的文本存在大量无意义的head tail symbol
- 对生成的过程可以进行一下处理

In [None]:
from nltk.tokenize.treebank import TreebankWordDetokenizer

detokenize = TreebankWordDetokenizer().detokenize # Decodes a list of tokens back into ordinary output text

def generate_sent(model, num_words, random_seed = 42):
  """Generate one detokenized sentence from an n-gram language model.

  Args:
    model: Trained n-gram model exposing ``generate(num_words, random_seed=...)``.
    num_words: Maximum number of tokens to sample from the model.
    random_seed: Seed forwarded to ``model.generate``.

  Returns:
    The generated tokens joined into a string by ``detokenize``. Start-of-sentence
    symbols are dropped and the text is cut at the first end-of-sentence symbol.
  """
  tokens = model.generate(num_words, random_seed=random_seed)
  # NLTK's ``generate`` returns a bare string instead of a list when asked for a
  # single word; wrap it so the loop below walks tokens rather than characters.
  if isinstance(tokens, str):
    tokens = [tokens]

  content = []
  for token in tokens:
    if token == '<s>':
      continue  # skip start-of-sentence padding
    if token == '</s>':
      break  # stop at the first end-of-sentence symbol

    content.append(token)
  return detokenize(content)

generate_sent(model,20,random_seed=7)

'ate inferences are drawn. 2.'

In [None]:
print(model.generate(28, random_seed=0))

['the', 'trouble', 'with', 'quantitative', 'studies', '.', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>']


In [None]:
generate_sent(model, 20, random_seed=1)

'29⫺50. manning, christopher and hinrich schütze 1999 foundations of statistical independence.'

In [None]:
generate_sent(model, 20, random_seed=30)

'information glut, is inappropriate, particularly where counts are low.'

- 虽然文本生成了，但是文本生成的效果只能说一言难尽

## 使用特朗普的推特进行文本生成

- [数据集下载地址](https://www.kaggle.com/datasets/kingburrito666/better-donald-trump-tweets)

In [None]:
import pandas as pd
# Load the tweets dataset from a CSV file in the working directory.
df = pd.read_csv('Donald-Tweets!.csv')
df.head()

Unnamed: 0,Date,Time,Tweet_Text,Type,Media_Type,Hashtags,Tweet_Id,Tweet_Url,twt_favourites_IS_THIS_LIKE_QUESTION_MARK,Retweets,Unnamed: 10,Unnamed: 11
0,16-11-11,15:26:37,Today we express our deepest gratitude to all ...,text,photo,ThankAVet,7.97e+17,https://twitter.com/realDonaldTrump/status/797...,127213,41112,,
1,16-11-11,13:33:35,Busy day planned in New York. Will soon be mak...,text,,,7.97e+17,https://twitter.com/realDonaldTrump/status/797...,141527,28654,,
2,16-11-11,11:14:20,Love the fact that the small groups of protest...,text,,,7.97e+17,https://twitter.com/realDonaldTrump/status/797...,183729,50039,,
3,16-11-11,2:19:44,Just had a very open and successful presidenti...,text,,,7.97e+17,https://twitter.com/realDonaldTrump/status/796...,214001,67010,,
4,16-11-11,2:10:46,A fantastic day in D.C. Met with President Oba...,text,,,7.97e+17,https://twitter.com/realDonaldTrump/status/796...,178499,36688,,


In [None]:
# 我们需要使用 twitter 正文
trump_corpus = list(df['Tweet_Text'].apply(word_tokenize))
n = 3
train_data, padded_sents = padded_everygram_pipeline(n, trump_corpus)

In [None]:
from nltk.lm import MLE
trump_model = MLE(n) # Let's train a 3-gram model on the tweets; n was set to 3 when the training data was built
trump_model.fit(train_data, padded_sents)

In [None]:
generate_sent(trump_model, num_words=20, random_seed=42)

'do so many people on television. Just another desperate move by the media pile on against me in Rome ,'

In [None]:
generate_sent(trump_model, num_words=50, random_seed=10)

'and many other subjects! Bad times for divided USA! +Israel2 "'

- N-Gram 作为一个基于统计的语言模型，对它的文本生成能力不要抱有太大的期望