##### Copyright 2018 The TensorFlow Hub Authors.

Licensed under the Apache License, Version 2.0 (the "License");

In [0]:
# Copyright 2018 The TensorFlow Hub Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

## Predict vtuber danmaku with Cloud TPUs and Keras
#### Modified from "Predict Shakespeare with Cloud TPUs and Keras"
Author github ID: pren1, coco401, simon3000, Afanyiyu

## Overview

This example uses [tf.keras](https://www.tensorflow.org/guide/keras) to build a *language model* and train it on a Cloud TPU. This language model predicts the next character of text given the text so far. The trained model can generate new snippets of text that read in a similar style to the text training data.

The model trains for 10 epochs and completes in approximately 1 hour.

## Instructions

<h3>  &nbsp;&nbsp;Train on TPU&nbsp;&nbsp; <a href="https://cloud.google.com/tpu/"><img valign="middle" src="https://raw.githubusercontent.com/GoogleCloudPlatform/tensorflow-without-a-phd/master/tensorflow-rl-pong/images/tpu-hexagon.png" width="50"></a></h3>

   1. On the main menu, click Runtime and select **Change runtime type**. Set "TPU" as the hardware accelerator.
   1. Click Runtime again and select **Runtime > Run All**. You can also run the cells manually with Shift-ENTER. 

TPUs are located in Google Cloud, for optimal performance, they read data directly from Google Cloud Storage (GCS)

## Data, model, and training

In this file, you train the model utilizing the danmaku data shown below:

<blockquote>
["完事","了","这","是","？"],

["来","了"],

["哇","我","刚","忙","完","o","r","z"]
</blockquote>


### Download data
You use snippets from this file as the *training data* for the model. The *target* snippet is offset by one character.

In [0]:
# !gdown https://drive.google.com/uc?id=1QWBjb9vk8TZhc9tZqxV1-BbVTj0GlMBy
# !gdown https://drive.google.com/uc?id=1i6JH7x7SsAFYYX_EU1z5DHr1YQKeVyzN

# !gdown https://drive.google.com/uc?id=1bRf5YnXh8dkLwqgz4IdqaxRMKmzq0pxI
# !gdown https://drive.google.com/uc?id=1jEo1ObjoHqI0JuPsCQuRndxw6xIPxjoR
# !gdown https://drive.google.com/uc?id=1tBO5Bxfu3FRLLuudIQ_xHvi0t6LfO34m

# !gdown https://drive.google.com/uc?id=1DEVIMMeCLqtsiOKA3TgX2KRSDdvjhuYr
# !gdown https://drive.google.com/uc?id=1T5OpFmiT00MFZYNHyGFXqpEcLvdIBVOr

# !gdown https://drive.google.com/uc?id=169jYxkPev2lkfMy8eu497EuukLxXcMF-


# !gdown https://drive.google.com/uc?id=1V5juWnxQXwOOxJarxJ0V8-nVPF87QshX
# !gdown https://drive.google.com/uc?id=1B0UaIeixggEg30SUw3uvWxDGbabJo9NV
# !gdown https://drive.google.com/uc?id=1QLl2kqsPDoWhbmM22N9bdt1gOsuLRAdv

'128'
# !gdown https://drive.google.com/uc?id=12taxnPvsqgaQuJDW9d1Pqzsz7ykR5rub
# !gdown https://drive.google.com/uc?id=1HuJT-GXvgZUs8turPoKP2tEvZJEc_uNH
# !gdown https://drive.google.com/uc?id=1GHG5S-LiEI-hCp47hxt6whZVbgqv3V8k

# !gdown https://drive.google.com/uc?id=1CPRLIVGlCwG2OshG1Ix5PGFO6z2G8pzc

# !gdown https://drive.google.com/uc?id=11oycRZUgPN3eFgQy_ZvADUwv9IzUEt0g
# !gdown https://drive.google.com/uc?id=1wRVWnrJJPXz4E6I8x2sIoxlOFdWu1uMJ


'512'
# !gdown https://drive.google.com/uc?id=1yzco4NHi7pCb9RLCT_rPWzv-SxjRvvrN
# !gdown https://drive.google.com/uc?id=11hhsqbAvSC-qxo1eoiV2jUqDK2Ta-fuB
# !gdown https://drive.google.com/uc?id=1AOtc7nFGPr34YoS7uUkmSovVOfSenKMF

'512 new'
# !gdown https://drive.google.com/uc?id=1zEy1FI8IJJNPqbF_u4fw1LYb6LRJFcR8
# !gdown https://drive.google.com/uc?id=11KWv0drUpoEcJto6lkrw3QJRxWFXsxdK
# !gdown https://drive.google.com/uc?id=1BNho7u9E3bpRnIUrkJzyqFxHg1xgtFCW

'512 large'
!gdown https://drive.google.com/uc?id=1VNrzw-NOCVHp2J2dQIgs9Esd2hNi-UP9
!gdown https://drive.google.com/uc?id=1UHAUWW-et7dm3yxPL-D6zjEc4g-MTU97
!gdown https://drive.google.com/uc?id=1_R_FiqrlVdZlN2Fy8xHx2bwq9ZYyu31o

# https://drive.google.com/file/d/1VNrzw-NOCVHp2J2dQIgs9Esd2hNi-UP9/view?usp=sharing
# https://drive.google.com/file/d/1UHAUWW-et7dm3yxPL-D6zjEc4g-MTU97/view?usp=sharing
# https://drive.google.com/file/d/1_R_FiqrlVdZlN2Fy8xHx2bwq9ZYyu31o/view?usp=sharing

Downloading...
From: https://drive.google.com/uc?id=1VNrzw-NOCVHp2J2dQIgs9Esd2hNi-UP9
To: /content/fin_fresh_512.json
1.65GB [00:15, 109MB/s]
Downloading...
From: https://drive.google.com/uc?id=1UHAUWW-et7dm3yxPL-D6zjEc4g-MTU97
To: /content/glove-512-words.pkl
100% 166k/166k [00:00<00:00, 58.0MB/s]
Downloading...
From: https://drive.google.com/uc?id=1_R_FiqrlVdZlN2Fy8xHx2bwq9ZYyu31o
To: /content/glove-512.npy
22.2MB [00:00, 104MB/s] 


In [0]:
# !gdown https://drive.google.com/uc?id=1FFSNsBeFlevU8v3xfQCUvDCBNSuP52PL

## Process the data with index

In [0]:
import numpy as np
import tensorflow as tf
import os
import pdb
import collections
import distutils
from tqdm import tqdm
import pickle
from keras.utils import plot_model

# 'the character should occur this much time if they wanna to be taken into account'
# minimum_occur_time = 100
context_vector_length = 100
context_seq_length = 130
batch_size = 2048
# SHAKESPEARE_TXT = '/content/bert-master_danmaku_text_pure.txt'
# 'Use the following path to just save you some time'
# preprocessed_TXT = '/content/rectified_input.txt'
# characters, rare_characters, input_text = process_data_with_index(SHAKESPEARE_TXT, minimum_occur_time)

'load in characters, and embedding matrix'
with open('/content/glove-512-words.pkl', 'rb') as f:
    characters = pickle.load(f)
    'also add end part, and beginning part'
    '成雨'
    characters[-1] = 'eos'
    '効'
    characters[-2] = '\n'
# preprocessed_TXT = '/content/new_filtered_data.json'
preprocessed_TXT = '/content/fin_fresh_512.json'
# preprocessed_TXT = '/content/fbk_fine_tune.json'
embedding_matrix = np.load('/content/glove-512.npy')
'show something about embedding'

char_to_n = {char:n for n, char in enumerate(characters)}
n_to_char = {n:char for n, char in enumerate(characters)}

def transform(txt):
    return np.asarray([char_to_n[c] for c in txt], dtype=np.int32)

# def remove_unkown_character_from_text(txt, rare_characters):
#     'Remove char in rare_characters from txt' 
#     for x in tqdm(rare_characters):
#         try:
#             txt = txt.replace(x, "")
#         except ValueError:
#             pass
#     return txt

Using TensorFlow backend.


In [0]:
# This address identifies the TPU we'll use when configuring TensorFlow.
TPU_WORKER = 'grpc://' + os.environ['COLAB_TPU_ADDR']
import json
def input_fn(seq_len=context_seq_length, batch_size=batch_size):
  """Return a dataset of source and target sequences for training."""
  with open(preprocessed_TXT, encoding='UTF-8') as json_file:
    data = json.load(json_file, encoding='UTF-8')
    # data = data[:100000]
    'process the data'
    txt = []
    label_part = []
    for single_meg in tqdm(data):
      single_meg[0].insert(0, 'eos')
      single_meg[0].append('\n')
      label = [single_meg[1]] * len(single_meg[0])
      txt.extend(single_meg[0])
      label_part.extend(label)
    # occur_index =[i for i in range(len(txt)) if txt[i] in ['口呆口', 'magnet']]
    
    new_txt = []
    new_label_part = []
    for (single, label) in tqdm(zip(txt, label_part)):
      if single not in  ['成雨', '効']:
        new_txt.append(single)
        new_label_part.append(label)
    txt = tf.constant(transform(new_txt), dtype=tf.int32)
    label_part = tf.constant(new_label_part, dtype=tf.int32)
    print("Processing the txt: {}, with label: {}".format(txt[1000:1020], label_part[1000:1020]))
    'If the input is preprocessed_TXT, then you do not need this one'
    # txt = remove_unkown_character_from_text(txt, rare_characters)
  # txt = np.asarray(txt)
  # label_part = np.asarray(label_part)
  # source = tf.constant(transform(res), dtype=tf.int32)
  
  # def generator():
  #   for txt_sig, label_sig in zip(txt, label_part):
  #     yield txt_sig, label_sig
  
  # ds = tf.data.Dataset.from_generator(generator, output_types=(tf.string, tf.int32)).batch(seq_len+1, drop_remainder=True)
  import time
  start_time = time.time()
  ds = tf.data.Dataset.from_tensor_slices((txt, label_part)).batch(seq_len+1, drop_remainder=True)
  print("--- slice tensor spends: %s seconds ---" % (time.time() - start_time))
  # ds = tf.data.Dataset.from_tensor_slices(source).batch(seq_len+1, drop_remainder=True)

  def split_input_target(chunk, label_chunk):
    context_vector = chunk[:context_vector_length]
    input_text = chunk[context_vector_length:-1]
    target_text = chunk[context_vector_length+1:]
    'Simply use the first element as the chunk label'
    label_value = label_chunk[:1]
    return (context_vector, input_text, label_value), target_text

  BUFFER_SIZE = 10000
  ds = ds.map(split_input_target).shuffle(BUFFER_SIZE).batch(batch_size, drop_remainder=True)
  return ds.repeat()

### Build the model

The model is defined as a two-layer, forward-LSTM, the same model should work both on CPU and TPU.

The input dimension to the Embedding layer is the same as our vocabulary size.

When specifying the arguments to the LSTM, it is important to note how the stateful argument is used. When training we will make sure that `stateful=False` because we do want to reset the state of our model between batches, but when sampling (computing predictions) from a trained model, we want `stateful=True` so that the model can retain information across the current batch and generate more interesting text.

In [0]:
EMBEDDING_DIM = 512
HALF_EMBEDDING_DIM = int(EMBEDDING_DIM/2)
regularizer_coefficient = 0.000001
dropout_rate = 0.4
def lstm_model(seq_len=30, context_length = context_vector_length, batch_size=None, stateful=True):
    """Language model: Encoder decoder favor for context term"""
    room_id_bit =  tf.keras.Input(name='room_id_bit', shape=(1,), batch_size=batch_size, dtype=tf.int32)
    one_hot_embedding_id = tf.keras.backend.one_hot(room_id_bit, num_classes = EMBEDDING_DIM)

    encoder_input = tf.keras.Input(name='Encoder_input', shape=(context_length,), batch_size=batch_size, dtype=tf.int32)
    embedding_layer = tf.keras.layers.Embedding(input_dim=len(characters), output_dim=EMBEDDING_DIM, embeddings_initializer = tf.keras.initializers.Constant(embedding_matrix), trainable=False)
    encode_embedding = embedding_layer(encoder_input)
    'Then, we concatenate the embedding to provide more info'
    rich_info_embedding = tf.keras.layers.concatenate([encode_embedding, one_hot_embedding_id],axis=1)

    enc_lstm1, forward_h, forward_c, backward_h, backward_c = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(HALF_EMBEDDING_DIM, name='encoder_lstm_1', return_state=True, return_sequences=True, kernel_regularizer=tf.keras.regularizers.l2(regularizer_coefficient)))(rich_info_embedding)
    state_h_1 = tf.keras.layers.concatenate([forward_h, backward_h])
    state_c_1 = tf.keras.layers.concatenate([forward_c, backward_c])
    enc_lstm1 = tf.keras.layers.Dropout(dropout_rate)(enc_lstm1)
    encoder_states_1 = [state_h_1, state_c_1]

    enc_lstm2, forward_h, forward_c, backward_h, backward_c = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(HALF_EMBEDDING_DIM, name='encoder_lstm_2', return_state=True, return_sequences=True, kernel_regularizer=tf.keras.regularizers.l2(regularizer_coefficient)))(enc_lstm1)
    
    'Double concatenate'
    enc_lstm2 = tf.keras.layers.concatenate([enc_lstm2, one_hot_embedding_id],axis=1)

    state_h_2 = tf.keras.layers.concatenate([forward_h, backward_h])
    state_c_2 = tf.keras.layers.concatenate([forward_c, backward_c])
    encoder_states_2 = [state_h_2, state_c_2]
    # Set up the decoder, using `encoder_states` as initial state.
    decoder_inputs = tf.keras.Input(name='Decoder_input', shape=(seq_len,), batch_size=batch_size, dtype=tf.int32)
    decode_embedding = embedding_layer(decoder_inputs)
    lstm_1_layer = tf.keras.layers.LSTM(EMBEDDING_DIM, name='decoder_lstm_1', stateful=stateful, return_state=True, return_sequences=True, kernel_regularizer=tf.keras.regularizers.l2(regularizer_coefficient))
    lstm_1, _, _ = lstm_1_layer(decode_embedding, initial_state=encoder_states_1)
    dropout_lstm_1 = tf.keras.layers.Dropout(dropout_rate)(lstm_1)
    lstm_2_layer = tf.keras.layers.LSTM(EMBEDDING_DIM, name='decoder_lstm_2', stateful=stateful, return_state=True, return_sequences=True, kernel_regularizer=tf.keras.regularizers.l2(regularizer_coefficient))
    lstm_2, _, _  = lstm_2_layer(dropout_lstm_1, initial_state=encoder_states_2)
    dropout_lstm_2 = tf.keras.layers.Dropout(dropout_rate)(lstm_2)

    'try to add attention here~'
    attention = tf.keras.layers.Dot(axes=[2, 2])([dropout_lstm_2, enc_lstm2])
    attention = tf.keras.layers.Activation('softmax', name='attention')(attention)
    context = tf.keras.layers.Dot(axes=[2, 1])([attention, enc_lstm2])
    decoder_combined_context = tf.keras.layers.concatenate([context, dropout_lstm_2])
    dense_layer_1 = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(EMBEDDING_DIM*4, activation='tanh' , kernel_regularizer=tf.keras.regularizers.l2(regularizer_coefficient)))
    predicted_char_layer = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(len(characters), activation='softmax' , kernel_regularizer=tf.keras.regularizers.l2(regularizer_coefficient)))

    dense_layer_output_1 = dense_layer_1(decoder_combined_context)
    predicted_char = predicted_char_layer(dense_layer_output_1)
    
    Model = tf.keras.Model(inputs=[encoder_input, decoder_inputs, room_id_bit], outputs=[predicted_char])
    # Model.summary()
    tf.keras.utils.plot_model(Model, show_shapes=True, to_file='model.png')

    'For reference, also prepared some tricks'
    encoder_model = tf.keras.Model([encoder_input, room_id_bit], [encoder_states_1[0], encoder_states_1[1], encoder_states_2[0], encoder_states_2[1], enc_lstm2])
    tf.keras.utils.plot_model(encoder_model, show_shapes=True, to_file='encoder_model.png')
    
    decoder_state_input_h = tf.keras.Input(shape=(EMBEDDING_DIM,))
    decoder_state_input_c = tf.keras.Input(shape=(EMBEDDING_DIM,))
    decoder_state_input_h1 = tf.keras.Input(shape=(EMBEDDING_DIM,))
    decoder_state_input_c1 = tf.keras.Input(shape=(EMBEDDING_DIM,))
    
    'Add 2 for double room_id_inputs'
    encoder_output_in = tf.keras.Input(shape=(context_vector_length + 2, EMBEDDING_DIM,))
    decode_embedding = embedding_layer(decoder_inputs)
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c, decoder_state_input_h1, decoder_state_input_c1]
    d_o, state_h, state_c = lstm_1_layer(decode_embedding, initial_state=decoder_states_inputs[:2])
    d_o, state_h1, state_c1 = lstm_2_layer(d_o, initial_state=decoder_states_inputs[-2:])
    decoder_states = [state_h, state_c, state_h1, state_c1]

    'try to add attention here~'
    attention = tf.keras.layers.Dot(axes=[2, 2])([d_o, encoder_output_in])
    attention = tf.keras.layers.Activation('softmax', name='attention')(attention)
    context = tf.keras.layers.Dot(axes=[2, 1])([attention, encoder_output_in])
    decoder_combined_context = tf.keras.layers.concatenate([context, d_o])

    dense_layer_output_1 = dense_layer_1(decoder_combined_context)
    decoder_outputs = predicted_char_layer(dense_layer_output_1)
    decoder_model = tf.keras.Model([decoder_inputs] + decoder_states_inputs + [encoder_output_in], [decoder_outputs] + decoder_states)
    tf.keras.utils.plot_model(decoder_model, show_shapes=True, to_file='decoder_model.png')

    return Model, encoder_model, decoder_model

def get_stand_alone_decoder(seq_len=30, context_length = context_vector_length, batch_size=None, stateful=True):
    decoder_state_input_h = tf.keras.Input(shape=(EMBEDDING_DIM,))
    decoder_state_input_c = tf.keras.Input(shape=(EMBEDDING_DIM,))
    decoder_state_input_h1 = tf.keras.Input(shape=(EMBEDDING_DIM,))
    decoder_state_input_c1 = tf.keras.Input(shape=(EMBEDDING_DIM,))
    
    decoder_inputs = tf.keras.Input(name='Decoder_input', shape=(seq_len,), batch_size=batch_size, dtype=tf.int32)
    
    'Add 2 for room_id_inputs'
    encoder_output_in = tf.keras.Input(shape=(context_vector_length + 2, EMBEDDING_DIM,))

    embedding_layer = tf.keras.layers.Embedding(input_dim=len(characters), output_dim=EMBEDDING_DIM, embeddings_initializer = tf.keras.initializers.Constant(embedding_matrix), trainable=False)
    decode_embedding = embedding_layer(decoder_inputs)
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c, decoder_state_input_h1, decoder_state_input_c1]
    lstm_1_layer = tf.keras.layers.LSTM(EMBEDDING_DIM, name='decoder_lstm_1', stateful=stateful, return_state=True, return_sequences=True)
    d_o, state_h, state_c = lstm_1_layer(decode_embedding, initial_state=decoder_states_inputs[:2])
    lstm_2_layer = tf.keras.layers.LSTM(EMBEDDING_DIM, name='decoder_lstm_2', stateful=stateful, return_state=True, return_sequences=True)
    d_o, state_h1, state_c1 = lstm_2_layer(d_o, initial_state=decoder_states_inputs[-2:])
    decoder_states = [state_h, state_c, state_h1, state_c1]
    
    'try to add attention here~'
    attention = tf.keras.layers.Dot(axes=[2, 2])([d_o, encoder_output_in])
    attention = tf.keras.layers.Activation('softmax', name='attention')(attention)
    context = tf.keras.layers.Dot(axes=[2, 1])([attention, encoder_output_in])
    decoder_combined_context = tf.keras.layers.concatenate([context, d_o])
    
    dense_layer_1 = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(EMBEDDING_DIM*4, activation='tanh'))
    predicted_char_layer = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(len(characters), activation='softmax'))
    
    dense_layer_output_1 = dense_layer_1(decoder_combined_context)
    decoder_outputs = predicted_char_layer(dense_layer_output_1)
    decoder_model = tf.keras.Model([decoder_inputs] + decoder_states_inputs + [encoder_output_in], [decoder_outputs] + decoder_states)
    tf.keras.utils.plot_model(decoder_model, show_shapes=True, to_file='decoder_model.png')
    return decoder_model

def step_decay(epoch):
    import math
    initial_lrate = 0.001
    drop = 0.6
    epochs_drop = 1.0
    lrate = initial_lrate * math.pow(drop, math.floor((1 + epoch) / epochs_drop))
    print("lrate: {}, epoch: {}".format(lrate, epoch))
    return lrate

# training_model,encoder_model, decoder_model = lstm_model(seq_len=30, context_length = context_vector_length, stateful=False)

### Train the model

First, we need to create a distribution strategy that can use the TPU. In this case it is TPUStrategy. You can create and compile the model inside its scope. Once that is done, future calls to the standard Keras methods `fit`, `evaluate` and `predict` use the TPU.

Again note that we train with `stateful=False` because while training, we only care about one batch at a time.

In [0]:
tf.keras.backend.clear_session()

resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_WORKER)
tf.contrib.distribute.initialize_tpu_system(resolver)
strategy = tf.contrib.distribute.TPUStrategy(resolver)

with strategy.scope():
  training_model,encoder_model, decoder_model = lstm_model(seq_len=30, context_length = context_vector_length, stateful=False)
  lrate = tf.keras.callbacks.LearningRateScheduler(step_decay)
  adam = tf.keras.optimizers.RMSprop(lr=0.0, decay=0.0)
  # 'layer frozen'
  # for layer in training_model.layers[:-1]:
	#   layer.trainable = False
  training_model.compile(
      # optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.01),
      optimizer = 'adam',
      loss='sparse_categorical_crossentropy',
      metrics=['sparse_categorical_accuracy'])

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

INFO:tensorflow:Initializing the TPU system: 10.98.176.234:8470
INFO:tensorflow:Finished initializing TPU system.
INFO:tensorflow:Querying Tensorflow master (grpc://10.98.176.234:8470) for TPU system metadata.
INFO:tensorflow:Found TPU system:
INFO:tensorflow:*** Num TPU Cores: 8
INFO:tensorflow:*** Num TPU Workers: 1
INFO:tensorflow:*** Num TPU Cores Per Worker: 8
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, -1, 13823376781481859044)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 10440750302577450772)

In [0]:
# training_model.load_weights('/tmp/bard_{}.h5'.format(0))
  # 'load pretrained weights'
# encoder_model.load_weights('encoder.h5')
# decoder_model.load_weights('decoder.h5')

In [0]:
!mkdir weights
!mkdir models

In [10]:
training_model.fit(
    input_fn(),
    steps_per_epoch=50000,
    epochs=1,
    callbacks=[lrate]
)
saver_index = 0
# training_model.save_weights('weights/bard_{}.h5'.format(saver_index), overwrite=True)

# training_model.save('models/bard_{}.h5'.format(saver_index), overwrite=True)

training_model.save('/tmp/bard_{}.js.h5'.format(saver_index), overwrite=True)
saver_index += 100

100%|██████████| 29902401/29902401 [00:30<00:00, 979470.40it/s]
180232680it [01:15, 2375810.17it/s]


Processing the txt: Tensor("strided_slice:0", shape=(20,), dtype=int32), with label: Tensor("strided_slice_1:0", shape=(20,), dtype=int32)
--- slice tensor spends: 0.004944801330566406 seconds ---
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
lrate: 0.0006, epoch: 0


dropout 均为0.5时：
loss: 2.2302， sparse_categorical_accuracy: 0.6051

dropout 均为0.4时
loss: 2.1744 - sparse_categorical_accuracy: 0.6124

initial_lrate 为0.0002时

loss: 2.3910 - sparse_categorical_accuracy: 0.5846

# 删除EOS 和\n  &  分割 语句

In [0]:
def delete_EOS(input: list) -> list:
    while 'eos' in input:
        input.remove('eos')
    str1 = "".join(input)
    res = str1.split('\n')
    del res[-1]
    for single in res:
      print(single)
    return res  
# print(delete_EOS(['eos', '8', '8', '8', '8', '8', '8', '8', '8', '8', '8', '8', '8', '8', '\n', 'eos', '感谢', '观看', '\n', 'eos', '8', '8', '8', '8', '8', '8', '8', '8', '\n', 'eos', '8', '8', '8', '8', '8', '8', '8', '8', '8', '8', '\n', 'eos', '8', '8', '8', '8', '8', '8', '8', '8', '\n', 'eos', '8', '8', '8', '8', '8', '\n', 'eos', '感谢', '转播man', '\n', 'eos', '8', '8', '8', '8', '8', '8', '8', '\n', 'eos', '感谢', '转播man', '\n', 'eos', 'a', 'n', 't', 'i', '路过', '，', '不用', '管', '\n', 'eos', 'o', 'k', 'k', '\n', 'eos', 'a', 'n', 't', 'i', '需要', '理由', '吗', '？', '\n']
# ))

In [0]:
'get some real text inputs'
import random
import copy
# preprocessed_TXT = '/content/new_filtered_data.json'
def load_in_texts():
  with open(preprocessed_TXT, encoding='UTF-8') as json_file:
      data = json.load(json_file, encoding='UTF-8')
      'process the data'
      txt = []
      label_part = []
      for single_meg in tqdm(data):
        single_meg[0].insert(0, 'eos')
        single_meg[0].append('\n')
        label = [single_meg[1]] * len(single_meg[0])
        txt.extend(single_meg[0])
        label_part.extend(label)

      'remove that does not belongs to characters...'
      new_txt = []
      new_label_part = []
      for (single, label) in tqdm(zip(txt, label_part)):
        if single not in  ['成雨', '効']:
          new_txt.append(single)
          new_label_part.append(label)
      print("updated txt, remove from {} to {}, examples: {}".format(len(txt), len(new_txt), new_txt[:20]))
      return new_txt, new_label_part

In [0]:
# encoder_model.save_weights('/content/drive/My Drive/encoder.h5')
# decoder_model.save_weights('/content/drive/My Drive/decoder.h5')

encoder_model.save_weights('/tmp/encoder.h5')
decoder_model.save_weights('/tmp/decoder.h5')

### Make predictions with the model

Use the trained model to make predictions and generate your own fake danmaku messages.
Start the model off with a *seed* sentence, then generate 250 characters from it. The model makes five predictions from the initial seed.

The predictions are done on the CPU so the batch size (5) in this case does not have to be divisible by 8.

Note that when we are doing predictions or, to be more precise, text generation, we set `stateful=True` so that the model's state is kept between batches. If stateful is false, the model state is reset between each batch, and the model will only be able to use the information from the current batch (a single character) to make a prediction.

The output of the model is a set of probabilities for the next character (given the input so far). To build a paragraph, we predict one character at a time and sample a character (based on the probabilities provided by the model). For example, if the input character is "草" and the output probabilities are "草" (0.65), "哈" (0.30), others characters (0.05), then we allow our model to generate text other than just "草" and "哈"

In [14]:
!pip install pprint

Collecting pprint
  Downloading https://files.pythonhosted.org/packages/99/12/b6383259ef85c2b942ab9135f322c0dce83fdca8600d87122d2b0181451f/pprint-0.1.tar.gz
Building wheels for collected packages: pprint
  Building wheel for pprint (setup.py) ... [?25l[?25hdone
  Created wheel for pprint: filename=pprint-0.1-cp36-none-any.whl size=1250 sha256=68d89039ed3a24f09f68d97a7bbb1bcb1e84b474d03f3925e5b367aeaa9a7cbd
  Stored in directory: /root/.cache/pip/wheels/42/d4/c6/16a6495aecc1bda5d5857bd036efd50617789ba9bea4a05124
Successfully built pprint
Installing collected packages: pprint
Successfully installed pprint-0.1


In [0]:
# BATCH_SIZE = 2
# PREDICT_LEN = 30
# BEAM_SIZE = 10
# import pprint
# # Keras requires the batch size be specified ahead of time for stateful models.
# # We use a sequence length of 1, as we will be feeding in one character at a 
# # time and predicting the next character.
# # tf.keras.backend.clear_session()

# _, encoder_model, _ = lstm_model(seq_len=1, batch_size=BATCH_SIZE, stateful=True)
# # encoder_model.load_weights('/content/drive/My Drive/encoder.h5')
# encoder_model.load_weights('/tmp/encoder.h5')

# decoder_model = get_stand_alone_decoder(seq_len=1, batch_size=BATCH_SIZE, stateful=True)
# # decoder_model.load_weights('/content/drive/My Drive/decoder.h5')
# decoder_model.load_weights('/tmp/decoder.h5')

# # We seed the model with our initial string, copied BATCH_SIZE times
# # seed_txt = ['了', '\n', 'eos', '再次', '坑', '乌拉', '\n', 'eos', 'p', 'o', 'i', '和', '乌拉', '拉', '在', '一起', '打', '吗', '？', '\n', 'eos', '马赛克', '！', '\n', 'eos', '噗', '\n', 'eos', 'e', 'm', 'm', 'm', '火车', '晚', '了', 'p', 'o', 'i', '\n', 'eos', '？', '？', '？', '\n', 'eos', '？', '？', '？', '\n', 'eos', '你', '要', '知道', '我', '写', '过', '文', '\n', 'eos', '咬', '滑稽', '\n', 'eos', '这', '是', '一段', '有', '画面', '的', '文字', '\n', 'eos', '污', '拉拉', '\n', 'eos', '乌拉', '拉', '晚上', '好', '\n', 'eos', '去', '拿', '对面', '油', '\n', 'eos', '晚', '好', '吖', '\n', 'eos', '哇', '，', '乌拉', '粉丝', '还有', '看', '高', '达', '的', '\n', 'eos', '（', '6', '9', '）', '\n', 'eos', '我', '怀疑', '你', '在', '开车', '\n', 'eos', '尝试', '理解', 'p', 'o', 'i', '\n', 'eos', '好多', '.', '。', '。', '。', '\n', 'eos', '不要', '火车', 'p', 'o', 'i', '？', '\n', 'eos', '上次', '。', '。', '。', '\n', 'eos', '我', '看看', '我', '下', '个', 'c', 'o', 'h', '\n', 'eos', '穿', '模', '\n', 'eos', '？', '？', '？', '？', '？', '？', '？', '\n', 'eos', '？', '？', '？', '？', '？', '？', '？', '？', '？', '？', '？', '？', '？', '？', '？', '？', '？', '？', '？', '？', '？', '？', '？', '？', '？', '？', '？', '？', '\n', 'eos', '天狗', '\n']
# # seed_txt = ['eos','天', '狗','\n']*200

# print("Load in texts...")
# seed = transform(load_in_texts(100))
# # print(seed_txt)

# # seed = transform(load_in_texts(100))
# seed = np.repeat(np.expand_dims(seed, 0), BATCH_SIZE, axis=0)
# # Encode the input as state vectors.
# state_and_output = encoder_model.predict(seed)
# states_value = [state_and_output[:4]] * BEAM_SIZE
# encoder_output = state_and_output[-1]

# # Solve decoder things
# last_predictions = [np.array([[7010]]*BATCH_SIZE, dtype=np.int32)]
# # Beam serach impl!
# 'at first, all prob is 0'
# path_saver = [[0, list()]] * BEAM_SIZE
# print("Preforming beam search...")
# for i in tqdm(range(PREDICT_LEN)):
#   total_slot = []
#   for beam_words_id in range(len(last_predictions)):
#     'for this words'
#     last_word = last_predictions[beam_words_id]
#     next_probits, h, c, h1, c1 = decoder_model.predict([last_word] + states_value[beam_words_id] + [encoder_output])
#     'assign right states value'
#     if len(last_predictions) == 1:
#       'at the beginning, just renew all the state values'
#       for i in range(BEAM_SIZE):
#         states_value[i] = [h, c, h1, c1]#######NOTICE THE ADDITIONAL HIDDEN STATES
#     else:
#       'if we have more choices, only update one'
#       states_value[beam_words_id] = [h, c, h1, c1]#######NOTICE THE ADDITIONAL HIDDEN STATES
#     batch_id = 0
#     next_probits = next_probits[:, 0, :][batch_id]
#     'for each batch'
#     'just a dirty work around, since all batch return the same results'
#     previous_prob = path_saver[beam_words_id]
#     top_k_words = next_probits.argsort()[-BEAM_SIZE:]
#     for words_id in top_k_words:
#       total_slot.append([previous_prob[0] + np.log(next_probits[words_id]), previous_prob[1] + [words_id]])
#   'sort by the first prob'
#   path_saver = sorted(total_slot, key=lambda tup:tup[0])[-BEAM_SIZE:]
#   last_predictions = []
  
#   'Do something to get last predictions work here'
#   for previous_path_tuple in path_saver:
#     last_path = previous_path_tuple[1][-1]
#     last_predictions.append(np.array([[last_path]]*BATCH_SIZE, dtype=np.int32))

# 'generate top k sentences'
# fin_res = []
# for single_path in path_saver:
#   prob = single_path[0]
#   sentence = []
#   for val in single_path[1]:
#     current_char = n_to_char[val]
#     sentence.append(current_char)
#     if current_char == '\n':
#       break
#   generated_sentence = ''.join(sentence)  # Convert back to text
#   fin_res.append([prob, generated_sentence])

# fin_res = sorted(fin_res, key=lambda tup:tup[0])
# pprint.pprint(fin_res)

In [0]:
BATCH_SIZE = 500
PREDICT_LEN = 15

# Keras requires the batch size be specified ahead of time for stateful models.
# We use a sequence length of 1, as we will be feeding in one character at a 
# time and predicting the next character.
# tf.keras.backend.clear_session()

_, encoder_model, _ = lstm_model(seq_len=1, batch_size=BATCH_SIZE, stateful=True)
# prediction_model.load_weights('/tmp/bard_{}.h5'.format(0))
encoder_model.load_weights('/tmp/encoder.h5')

decoder_model = get_stand_alone_decoder(seq_len=1, batch_size=BATCH_SIZE, stateful=True)
decoder_model.load_weights('/tmp/decoder.h5')

In [17]:
# We seed the model with our initial string, copied BATCH_SIZE times

# seed_txt = ['那', '是', '你', '心态', '不行', '\n', 'eos', '我', '爱', '酱', '真是', '越来越', '聪明', '啦', '？', '\n', 'eos', '本子', '预定', '\n', 'eos', '被', '吓', '到', '了', '\n', 'eos', '哈哈哈', '哈哈哈', '\n', 'eos', '吹', '\n', 'eos', '字幕', '没错', '好', '吧', '\n', 'eos', '代', '打', '当然', '是', '开玩笑', '，', '但是', '说', '多', '了', '是', '真的', '烦', '\n', 'eos', '代', '打', '是', '不过', '分', '，', '但', '你', '一直', '刷', '，', '你', '不', '烦', '别人', '烦', '\n', 'eos', '整个', '游戏', '就', '在', '这儿', '卡', '关', '了', '不', '知道', '可以', '跳', '2', '3', '3', '3', '\n', 'eos', '1', '7', '秒', '\n', 'eos', 'w', 'w', 'w', 'w']
# seed_txt = ['嘛', '（', '清楚', '多', '意', '）', '\n', 'eos', '完事', 'w', 'w', 'w', 'w', 'w', 'w', 'w', 'w', 'w', '\n', 'eos', '很', '懂', '\n', 'eos', '自己', '都', '笑', '了', '\n', 'eos', 'w', 'w', 'w', 'w', 'w', 'w', 'w', 'w', 'w', 'w', 'w', 'w', 'w', 'w', 'w', 'w', 'w', '\n', 'eos', 'w', 'w', 'w', 'w', 'w', 'w', 'w', 'w', '\n', 'eos', '老鸨', '\n', 'eos', '很', '懂', '。', '。', '。', '我', '第一次', '知道', '\n', 'eos', 'j', 'k', '一周', '目', '草', '生', '\n', 'eos', '过于', '清楚', '\n', 'eos', '艾', '琳', '太', '真实', '了', '\n', 'eos', '实在', '是', '过于', '清楚', '\n', 'eos', '怎么', '知道', '的']
# seed = transform(seed_txt)
new_txt, new_label = load_in_texts()

100%|██████████| 29902401/29902401 [00:36<00:00, 824817.69it/s]
180232680it [01:12, 2477162.52it/s]


updated txt, remove from 180232680 to 180232680, examples: ['eos', '大米', '也', '回来', '了', '\n', 'eos', '.', '\n', 'eos', '待机', '\n', 'eos', '待机', '\n', 'eos', '来晚', '了', ' ', '来晚']


In [18]:
len(n_to_char)
# print(f"10817: {n_to_char[-2]}")
# print(f"10818: {n_to_char[-1]}")
print(n_to_char[10816])
print(n_to_char[10817])



eos


In [0]:
def clip_text(txt_length, new_txt, new_label, start_pos_type = 0):
    proposed_start_index = new_label.index(start_pos_type)
    'Currently, we only support 355 vtubers'
    proposed_end_index = new_label.index(min(start_pos_type+1, 355))
    start_index = random.randint(proposed_start_index, proposed_end_index - txt_length)
    clipped_txt_for_test = new_txt[start_index:start_index + txt_length]
    clipped_labels_for_test = new_label[start_index:start_index + txt_length]
    delete_EOS(copy.deepcopy(clipped_txt_for_test))
    return clipped_txt_for_test, clipped_labels_for_test[0]

In [0]:
# room_id_mapping = '/content/room_id_mapping.json'
# with open(room_id_mapping, encoding='UTF-8') as json_file:
#       id_mapping_dict = json.load(json_file, encoding='UTF-8')
# print(f"mapping_id_res: {id_mapping_dict}")

In [23]:
print("Load in texts...")
'283：茯苓猫不黑'
input_text, input_label = clip_text(100, new_txt, new_label, start_pos_type=70)
# print("the previous text is from: {}".format(id_mapping_dict[str(input_label)]))
seed = transform(input_text)
print("generating text...")
seed = np.repeat(np.expand_dims(seed, 0), BATCH_SIZE, axis=0)
label_seed = np.repeat(np.expand_dims(input_label, 0), BATCH_SIZE, axis=0)
state_and_output = encoder_model.predict((seed, label_seed))
states_value = state_and_output[:4]
encoder_output = state_and_output[-1]
# Solve decoder things, we just happen to have 10817 tokens here 
predictions = [np.array([[10817]]*BATCH_SIZE, dtype=np.int32)]
predictions_prob = []
for i in range(PREDICT_LEN):
  last_word = predictions[-1]
  next_probits, h, c, h1, c1 = decoder_model.predict([last_word] + states_value + [encoder_output])
  next_probits = next_probits[:, 0, :]
  # sample from our output distribution
  next_idx = [
      np.random.choice(len(characters), p=next_probits[i])
      # np.argmax(next_probits[i])
      for i in range(BATCH_SIZE)
  ]
  'build the prob case'
  prob = []
  for batch_id in range(BATCH_SIZE):
    prob.append(next_probits[batch_id][next_idx[batch_id]])
  predictions_prob.append(np.asarray(prob))
  predictions.append(np.asarray(next_idx, dtype=np.int32))
  # Update states
  states_value = [h, c, h1, c1]#######NOTICE THE ADDITIONAL HIDDEN STATES

generated_whole_list = []
for i in range(BATCH_SIZE):
  # print('PREDICTION %d\n\n' % i)
  p = [predictions[j][i] for j in range(PREDICT_LEN)]
  p_prob = [predictions_prob[j][i] for j in range(PREDICT_LEN)]
  current_list = []
  'one sentence for one batch'
  this_batch_prob = 0.
  for index in range(len(p)):
    'just get the character generated'
    val = p[index]
    cur_prob = np.log(p_prob[index])
    if index == 0:
      val = val[0]
    current_char = n_to_char[val]
    current_list.append(current_char)
    this_batch_prob += cur_prob
    if current_char == '\n':
      break
  'we also wanna the average prob here'
  this_batch_prob/=len(current_list)
  current_list.remove('eos')
  generated = ''.join(current_list)  # Convert back to text
  if generated != '\n':
    generated_whole_list.append([this_batch_prob, generated])
fin_res = sorted(generated_whole_list, key=lambda tup:tup[0], reverse=True)

for this_batch_prob, generated in fin_res:
  print("with prob: {}, generated: {}".format(this_batch_prob, generated))
# assert len(generated) == PREDICT_LEN, 'Generated text too short'

Load in texts...
\花丸/\花丸/\花丸/\花丸/
\花丸/\花丸/\花丸/\花丸/\花丸/\花丸/
ｋｋｓｋ
花丸最高
\花丸/\花丸/\花丸/\花丸/\花丸/
\花丸/\花丸/\花丸/\花丸/\花丸/\花丸/\花丸/
\花丸/\花丸/\花丸/\花丸/\花丸/
generating text...
with prob: -0.05845086745248409, generated: \花丸/\花丸/\花丸/\花丸/\花丸
with prob: -0.05845086745248409, generated: \花丸/\花丸/\花丸/\花丸/\花丸
with prob: -0.05845086745248409, generated: \花丸/\花丸/\花丸/\花丸/\花丸
with prob: -0.05845086745248409, generated: \花丸/\花丸/\花丸/\花丸/\花丸
with prob: -0.05845086745248409, generated: \花丸/\花丸/\花丸/\花丸/\花丸
with prob: -0.05845086745248409, generated: \花丸/\花丸/\花丸/\花丸/\花丸
with prob: -0.05845086745248409, generated: \花丸/\花丸/\花丸/\花丸/\花丸
with prob: -0.05845086745248409, generated: \花丸/\花丸/\花丸/\花丸/\花丸
with prob: -0.05845086745248409, generated: \花丸/\花丸/\花丸/\花丸/\花丸
with prob: -0.05845086745248409, generated: \花丸/\花丸/\花丸/\花丸/\花丸
with prob: -0.05845086745248409, generated: \花丸/\花丸/\花丸/\花丸/\花丸
with prob: -0.05845086745248409, generated: \花丸/\花丸/\花丸/\花丸/\花丸
with prob: -0.05845086745248409, generated: \花丸/\花丸/\花丸/\花丸/\花丸
with prob

####Get the generated data & origin data for comparison

In [22]:
origin_whole_list = []

'get some origin data in a list'
with open(preprocessed_TXT, encoding='UTF-8') as json_file:
    data = json.load(json_file, encoding='UTF-8')

'randomly sample some data from the list'
index_range = np.arange(len(data))
np.random.shuffle(index_range)

'obtain the data'
for index in range(PREDICT_LEN):
  origin_data = ''.join(data[index_range[index]])
  origin_whole_list.append(origin_data)

TypeError: ignored

In [0]:
print("real_message samples: {}".format(origin_whole_list[:5]))
print("fake_messages samples: {}".format(generated_whole_list[:5]))

In [0]:
'with pandas data frame'
import pandas as pd
from sklearn.utils import shuffle
df = pd.DataFrame(columns=['message', 'label'])
for origin in origin_whole_list:
  df = df.append({'message': origin, 'label': 1}, ignore_index=True)
for fake in generated_whole_list:
  df = df.append({'message': fake, 'label': 0}, ignore_index=True)
df = shuffle(df)
df.to_csv("/content/fake_danmaku_evaluation.csv")



## What's next

* Danmaku-caption: generate danmaku based on context


### For TS-Javascript
Load and save

In [0]:
# prediction_model = lstm_model(seq_len=1,batch_size=8, stateful=True)
# prediction_model.load_weights('/tmp/bard_{}.h5'.format(0))
# prediction_model.save('/tmp/bard.js.h5')