In [3]:
!nvidia-smi

Thu Jan 28 10:00:04 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P8    10W /  70W |      0MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Tensorflow实现自动写诗

### 模型采用的数据格式是：寒随穷律变，春逐鸟声开。初风飘带柳，晚雪间花梅。。。，采用的训练策略是，x是取六个字符，比如：寒随穷律变，对应的y就是下一个字符：春，下一对x和y就是随穷律变，春和逐，这样以此类推，y的可能性就是词表的大小，这里大概是5500+。实质就是采用softmax做多分类

In [4]:
# 引入需要的工具库
import numpy as np
import random
import os
from tensorflow.keras.layers import LSTM, Dropout, Dense
from tensorflow.keras import Input, Model
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import LambdaCallback,ModelCheckpoint

In [5]:
!mkdir -p out

In [6]:
# 查看一下数据前面几行的内容
!head -5 poetry.txt

首春:寒随穷律变，春逐鸟声开。初风飘带柳，晚雪间花梅。碧林青旧竹，绿沼翠新苔。芝田初雁去，绮树巧莺来。
初晴落景:晚霞聊自怡，初晴弥可喜。日晃百花色，风动千林翠。池鱼跃不同，园鸟声还异。寄言博通者，知予物外志。
初夏:一朝春夏改，隔夜鸟花迁。阴阳深浅叶，晓夕重轻烟。哢莺犹响殿，横丝正网天。珮高兰影接，绶细草纹连。碧鳞惊棹侧，玄燕舞檐前。何必汾阳处，始复有山泉。
度秋:夏律昨留灰，秋箭今移晷。峨嵋岫初出，洞庭波渐起。桂白发幽岩，菊黄开灞涘。运流方可叹，含毫属微理。
仪鸾殿早秋:寒惊蓟门叶，秋发小山枝。松阴背日转，竹影避风移。提壶菊花岸，高兴芙蓉池。欲知凉气早，巢空燕不窥。


In [7]:
# 定义配置类
class ModelConfig(object):
    poetry_file = 'poetry.txt'
    weight_file = 'model/poetry_model.h5'
    max_len = 6
    batch_size = 32
    learning_rate = 0.003

# 定义文件读取函数
def preprocess_data(ModelConfig):
    # 首先将所有的诗取出来组成一个字符串
    files_content = ''
    with open(ModelConfig.poetry_file, 'r',encoding='UTF-8') as f:
        for line in f:
            # #每一行是一首诗，取出之后最后加上]号，表示结尾。
            x = line.strip() + "]"    
            # 取出具体诗的内容
            x = x.split(":")[1]
            # 根据长度过滤脏数据，如果诗的长度小于5，那么不采用，直接跳过这行
            if len(x) <= 5 :
                continue
            # 过滤出五言绝句，通过判断第6个字符是不是逗号
            if x[5] == '，':
                #所有的字符串连接在一起
                files_content += x
            
    # 字频统计,字符串外用list函数，会把它转化成一个列表，列表按字分割，每个字的类别还是字符串
    words = sorted(list(files_content))
    counted_words = {}
    for word in words:
        if word in counted_words:
            counted_words[word] += 1
        else:
            counted_words[word] = 1

    # 低频字过滤，如果出现次数小于等于2，就把这个字删除
    delete_words = []
    for key in counted_words:
        if counted_words[key] <= 2:
            delete_words.append(key)
    for key in delete_words:
        del counted_words[key]
    # 返回可遍历的(键, 值) 元组数组。并且按照出现次数从大到小排列
    wordPairs = sorted(counted_words.items(), key=lambda x: -x[1])
    # 取出所有的字
    words, _ = zip(*wordPairs)
    words += (" ",)
    
    # 构建 字到id的映射字典 与 id到字的映射字典
    word2idx = dict((c, i) for i, c in enumerate(words))
    idx2word = dict((i, c) for i, c in enumerate(words))
    word2idx_dic = lambda x: word2idx.get(x, len(words) - 1)  #这是一个函数，可以传一个字符串参数，如果指定的值不存在，返回最后一个也就是空字符串对应的数字
    return word2idx_dic, idx2word, words, files_content

In [48]:
class LSTMPoetryModel(object):
    def __init__(self, config):
        self.model = None
        self.do_train = True
        self.loaded_model = True
        self.config = config

        # 诗歌训练文件预处理
        self.word2idx_dic, self.idx2word, self.words, self.files_content = preprocess_data(self.config)
        
        # 诗列表
        self.poems = self.files_content.split(']')
        # 诗的总数量
        self.poems_num = len(self.poems)
        
        # 如果有预训练好的模型文件，则直接加载模型，否则开始训练
        if os.path.exists(self.config.weight_file) and self.loaded_model:
            self.model = load_model(self.config.weight_file)
        else:
            self.train()

    def build_model(self):
        '''LSTM模型构建'''
        print('模型构建中...')

        # 输入的维度
        input_tensor = Input(shape=(self.config.max_len, len(self.words))) #输入是one-hot形式，这里是max_len*len(self.words)的矩阵，当作是嵌入好的，后面直接接LSTM
        lstm = LSTM(512, return_sequences=True)(input_tensor)
        dropout = Dropout(0.6)(lstm)
        lstm = LSTM(256)(dropout)
        dropout = Dropout(0.6)(lstm)
        dense = Dense(len(self.words), activation='softmax')(dropout)
        self.model = Model(inputs=input_tensor, outputs=dense)
        optimizer = Adam(lr=self.config.learning_rate)
        self.model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    def sample(self, preds, temperature=1.0):
        '''
        temperature可以控制生成诗的创作自由约束度
        当temperature<1.0时，模型会做一些随机探索，输出相对比较新的内容
        当temperature>1.0时，模型预估方式偏保守
        在训练的过程中可以看到temperature不同，结果也不同
        就是一个概率分布变换的问题，保守的时候概率大的值变得更大，选择的可能性也更大
        '''
        preds = np.asarray(preds).astype('float64')
        exp_preds = np.power(preds,1./temperature)
        preds = exp_preds / np.sum(exp_preds)
        prob = np.random.choice(range(len(preds)),1,p=preds)
        return int(prob.squeeze())
    
    def generate_sample_result(self, epoch, logs):
        '''训练过程中，每5个epoch打印出当前的学习情况'''
        if epoch % 5 != 0:
            return
        
        # 追加模式添加内容
        with open('out/out.txt', 'a',encoding='utf-8') as f:
            f.write('==================第{}轮=====================\n'.format(epoch))
                
        print("\n==================第{}轮=====================".format(epoch))
        for diversity in [0.7, 1.0, 1.3]:
            print("------------设定诗词创作自由度约束参数为{}--------------".format(diversity))
            generate = self.predict_random(temperature=diversity)
            print(generate)
            
            # 训练时的预测结果写入txt
            with open('out/out.txt', 'a',encoding='utf-8') as f:
                f.write(generate+'\n')
    
    def predict_random(self,temperature = 1):
        '''预估模式1：随机从库中选取一句开头的诗句，生成五言绝句'''
        if not self.model:
            print('没有预训练模型可用于加载！')
            return
        
        index = random.randint(0, self.poems_num)
        sentence = self.poems[index][: self.config.max_len]
        generate = self.predict_sen(sentence,temperature=temperature)
        return generate
    
    def predict_first(self, char,temperature =1):
        '''预估模式2：根据给出的首个字，生成五言绝句'''
        if not self.model:
            print('没有预训练模型可用于加载！')
            return
        
        index = random.randint(0, self.poems_num)
        # 选取随机一首诗的最后max_len个字+给出的首个文字作为初始输入
        sentence = self.poems[index][1-self.config.max_len:] + char
        generate = str(char)
        # 预测后面23个字
        generate += self._preds(sentence,length=23,temperature=temperature)
        return generate
    
    def predict_sen(self, text,temperature =1):
        '''预估模式3：根据给出的前max_len个字，生成诗句'''
        '''此例中，即根据给出的第一句诗句（含逗号），来生成古诗'''
        if not self.model:
            return
        max_len = self.config.max_len
        if len(text)<max_len:
            print('给出的初始字数不低于 ',max_len)
            return

        sentence = text[-max_len:]
        print('第一行为:',sentence)
        generate = str(sentence)
        generate += self._preds(sentence,length = 24-max_len,temperature=temperature)
        return generate
    
    def predict_hide(self, text,temperature = 1):
        '''预估模式4：根据给4个字，生成藏头诗五言绝句'''
        if not self.model:
            print('没有预训练模型可用于加载！')
            return
        if len(text)!=4:
            print('藏头诗的输入必须是4个字！')
            return
        
        index = random.randint(0, self.poems_num)
        # 选取随机一首诗的最后max_len个字+给出的首个文字作为初始输入
        sentence = self.poems[index][1-self.config.max_len:] + text[0]
        generate = str(text[0])
        print('第一行为 ',sentence)
        
        for i in range(5):
            next_char = self._pred(sentence,temperature)           
            sentence = sentence[1:] + next_char
            generate+= next_char
        
        for i in range(3):
            generate += text[i+1]
            sentence = sentence[1:] + text[i+1]
            for i in range(5):
                next_char = self._pred(sentence,temperature)           
                sentence = sentence[1:] + next_char
                generate+= next_char

        return generate
    
    
    def _preds(self,sentence,length = 23,temperature =1):
        '''
        供类内部调用的预估函数，输入max_len长度字符串，返回length长度的预测值字符串
        sentence:预测输入值
        lenth:预测出的字符串长度
        '''
        sentence = sentence[:self.config.max_len]
        generate = ''
        for i in range(length):
            pred = self._pred(sentence,temperature)
            generate += pred
            sentence = sentence[1:]+pred
        return generate
        
        
    def _pred(self,sentence,temperature =1):
        '''供类内部调用的预估函数，根据一串输入，返回单个预测字符'''
        if len(sentence) < self.config.max_len:
            print('in def _pred,length error ')
            return
        
        sentence = sentence[-self.config.max_len:]
        x_pred = np.zeros((1, self.config.max_len, len(self.words)))
        for t, char in enumerate(sentence):
            x_pred[0, t, self.word2idx_dic(char)] = 1.
        preds = self.model.predict(x_pred, verbose=0)[0]
        next_index = self.sample(preds,temperature=temperature)
        next_char = self.idx2word[next_index]
        
        return next_char

    def data_generator(self):
        '''生成器生成数据'''
        # 注意这个生成器一次只生成一对x和y，而不是batch_size对，所以后面训练设置的参数也有改变。
        i = 0
        while 1:
            x = self.files_content[i: i + self.config.max_len]
            y = self.files_content[i + self.config.max_len]

            if ']' in x or ']' in y:
                i += 1
                continue

            y_vec = np.zeros(
                shape=(1, len(self.words)),
                dtype=np.bool
            )
            y_vec[0, self.word2idx_dic(y)] = 1.0

            x_vec = np.zeros(
                shape=(1, self.config.max_len, len(self.words)),
                dtype=np.bool
            )

            for t, char in enumerate(x):
                x_vec[0, t, self.word2idx_dic(char)] = 1.0

            yield x_vec, y_vec
            i += 1

    def train(self):
        '''训练模型'''
        print('开始训练...')
        number_of_epoch = len(self.files_content)-(self.config.max_len + 1)*self.poems_num
        number_of_epoch /= self.config.batch_size 
        number_of_epoch = int(number_of_epoch / 1.5)
        print('总迭代轮次为 ',number_of_epoch)
        print('总诗词数量为 ',self.poems_num)
        print('文件内容的长度为 ',len(self.files_content))

        if not self.model:
            self.build_model()

        self.model.fit_generator(  
            generator=self.data_generator(),
            verbose=True,
            steps_per_epoch=self.config.batch_size, #当生成器返回steps_per_epoch次数据时计一个epoch结束，执行下一个epoch，这里传入batch_size,也就是每训练32个数据，就认为训练完了一步。
            epochs=10000,  #number_of_epoch
            callbacks=[
                ModelCheckpoint(self.config.weight_file, save_weights_only=False),
                LambdaCallback(on_epoch_end=self.generate_sample_result)
            ]
        )

In [None]:
model = LSTMPoetryModel(ModelConfig)

print('预训练模型加载成功！')


------------设定诗词创作自由度约束参数为0.7--------------
第一行为: 却上南山路，
却上南山路，千新对，谁。新恨然。归，妾知柳，锦。
------------设定诗词创作自由度约束参数为1.0--------------
第一行为: 河水昏复晨，
河水昏复晨，须飞金君入。怀见惟年太，玉傥关渡清。
------------设定诗词创作自由度约束参数为1.3--------------
第一行为: 北虏胶堪折，
北虏胶堪折，玉处飞丈应。国拂垂太白，相万不军委。
Epoch 1892/10000
Epoch 1893/10000
Epoch 1894/10000
Epoch 1895/10000

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2882, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-49-ebb8a5d4ac1a>", line 1, in <module>
    model = LSTMPoetryModel(ModelConfig)
  File "<ipython-input-48-e11c8b9aaec8>", line 20, in __init__
    self.train()
  File "<ipython-input-48-e11c8b9aaec8>", line 222, in train
    LambdaCallback(on_epoch_end=self.generate_sample_result)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py", line 1861, in fit_generator
    initial_epoch=initial_epoch)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py", line 1100, in fit
    tmp_logs = self.train_function(iterator)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py", line 828, in __call__
    result = self._call(*args, **kwds)
  File "/usr/local/lib/python3.6/dist-packa

KeyboardInterrupt: ignored

In [2]:
for i in range(3):
    #藏头诗
    sen = model.predict_hide('风起云涌')
    print(sen)

NameError: ignored