In [None]:
# Print the TensorFlow version available in this runtime.
import tensorflow

print(tensorflow.__version__)

2.4.1


# References & Quick Links

- [Text Summarize](https://towardsdatascience.com/text-summarization-from-scratch-using-encoder-decoder-network-with-attention-in-keras-5fa80d12710e)
- [thaisum](https://huggingface.co/datasets/thaisum)
- [code Medium](https://gist.github.com/VarunSaravanan)
- [Attention](https://github.com/thushv89/attention_keras)
- [colab](https://colab.research.google.com/drive/1euy7-fIJwTjoP26adlkr5o_da9_GFqFp)
- [Google Drive](https://drive.google.com/open?id=15dfGrjIZ9W4jfHgku0cUl_mmqqdt_xde&authuser=6031020321%40student.chula.ac.th&usp=drive_fs)

In [None]:
# Notebook-wide switches read by the cells below.
load_pre = True  # True: download and use the pre-tokenized CSVs; False: download the raw CSVs and tokenize them here
load_weight = True  # True: download and load saved model weights and skip training; False: train the model
plot_image = False  # True: write model diagrams with tf.keras.utils.plot_model

In [None]:
# Show the GPU attached to this runtime.
!nvidia-smi

Sun May  2 03:37:05 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Preprocessing

In [None]:
# pythainlp supplies the Thai word tokenizer and the word vectors imported further down.
!pip install pythainlp



In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from tensorflow.keras.preprocessing.sequence import pad_sequences

import warnings
pd.set_option("display.max_colwidth", 200)
warnings.filterwarnings("ignore")

## Download data

In [None]:
# Download the three dataset splits from Google Drive.
if load_pre :
  # Pre-tokenized splits (saved as train_token.csv / test_token.csv / val_token.csv)
  # Train data
  !gdown --id 1GfPMhYq9kXGwOMqx_tJzIIxcg061-pMQ
  # Test data
  !gdown --id 15e8MiMNNhgm16v4qlm19dx9bubS7gAmw
  # Validation data
  !gdown --id 15f9CcEICmw3o56cOac14N_mZ1pWMCiwG
else :
  # Presumably the raw thaisum_*.csv files read by the next cell -- TODO confirm the downloaded file names
  # Train data
  !gdown --id 1jhXHwN6oYnGnWlzyKl12PTO3WSCFGJX6
  # Test data
  !gdown --id 1-08jI8lJZdQQa8XBQKjahx2jwygUFGto
  # Validation data
  !gdown --id 1-0KeeB8J770e5-DazCwRPbpNjgKlPmG3

Downloading...
From: https://drive.google.com/uc?id=1GfPMhYq9kXGwOMqx_tJzIIxcg061-pMQ
To: /content/train_token.csv
18.7MB [00:00, 87.3MB/s]
Downloading...
From: https://drive.google.com/uc?id=15e8MiMNNhgm16v4qlm19dx9bubS7gAmw
To: /content/test_token.csv
10.2MB [00:00, 89.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=15f9CcEICmw3o56cOac14N_mZ1pWMCiwG
To: /content/val_token.csv
10.2MB [00:00, 89.8MB/s]


In [None]:
SAVE_DIRECTORY = '/content'

# The file names depend on whether the pre-tokenized or the raw CSVs were downloaded.
_train_csv, _test_csv, _val_csv = (
  ('train_token.csv', 'test_token.csv', 'val_token.csv')
  if load_pre else
  ('thaisum_train.csv', 'thaisum_test.csv', 'thaisum_validation.csv')
)

# Only the first 20k training rows are read; test and validation are read in full.
df_train = pd.read_csv(f'{SAVE_DIRECTORY}/{_train_csv}', encoding='utf-8-sig', nrows=20000)
df_test = pd.read_csv(f'{SAVE_DIRECTORY}/{_test_csv}', encoding='utf-8-sig')
df_validation = pd.read_csv(f'{SAVE_DIRECTORY}/{_val_csv}', encoding='utf-8-sig')

## Preprocess

Next we clean the text. We perform the following steps for each text
and headline pair:

- Remove extra white spaces
- Expand contractions
- Remove special characters
- Lowercase all texts


In [None]:
# Select only summary and title
# Keep only the two text columns used by the model.
# .copy() gives every cleaned frame its own data, so the column assignments in
# the following cells do not write into a slice of the raw frame
# (the source of pandas' SettingWithCopyWarning).
if not load_pre:
  df_train_clean = df_train[["summary", "title"]].copy()
  df_test_clean = df_test[["summary", "title"]].copy()
  df_validation_clean = df_validation[["summary", "title"]].copy()

In [None]:
# Remove special characters
if not load_pre:
  from unicodedata import normalize

  def cleanInput(sentence):
    """Trim and lowercase `sentence`, apply NFKD normalization, and collapse whitespace runs to one space."""
    normalized = normalize("NFKD", sentence.strip().lower())
    return " ".join(normalized.split())

  # Clean both text columns of every split (train, test, validation; summary then title).
  for _frame in (df_train_clean, df_test_clean, df_validation_clean):
    for _column in ("summary", "title"):
      _frame[_column] = _frame[_column].apply(cleanInput)

- Add start and end tokens
- Inspect the distribution of text lengths; this helps us fix the maximum length of the sequences
- Train test split

In [None]:
# tokenize summary and title
if not load_pre:
  from pythainlp.tokenize import word_tokenize

  def isNum(word):
    """Return True when `word` is numeric once ',' and '.' have been removed."""
    return word.replace(",", "").replace(".", "").isnumeric()

  def clearAfterToken(sentence):
    """Post-process a list of tokens.

    Numbers are split into single characters, and brackets, dashes, question
    marks and typographic quotes become separate tokens (typographic quotes and
    the en dash are mapped to their ASCII forms).

    "~" is used as a temporary separator, so splitting on it leaves empty
    strings in the result wherever a separator sits at the start or end of a
    word. They are not filtered out here, so the token lists stay exactly as
    this step has always produced them.
    """
    newSentence = []
    for word in sentence:
      word = word.strip()
      if isNum(word):
        # "2021" -> "2~0~2~1": one token per character
        word = "~".join(list(word))

      word = word.replace("(", "~(~").replace(")", "~)~").replace("–", "-").replace("-", "~-~").replace("?", "~?~")
      word = word.replace("“",'~"~').replace("”",'~"~')
      word = word.replace("‘","~'~").replace("’","~'~")
      newSentence += word.strip().split("~")
    return newSentence

  def _tokenize(text):
    """Tokenize one cleaned string with the newmm engine and post-process the tokens."""
    return clearAfterToken(word_tokenize(text, engine="newmm"))

  # Assign whole columns instead of writing cell by cell through chained
  # indexing (df[col][i] = ...), which depends on default integer labels and on
  # pandas returning a view rather than a copy.
  for _frame in (df_train_clean, df_test_clean, df_validation_clean):
    for _column in ("summary", "title"):
      _tokens = [_tokenize(text) for text in tqdm(_frame[_column])]
      # Wrap in a Series so every cell holds one token list.
      _frame[_column] = pd.Series(_tokens, index=_frame.index)

In [None]:
if not load_pre:
  # Output file name paired with the frame it is written from.
  _token_exports = [
    ("train_token.csv", df_train_clean),
    ("test_token.csv", df_test_clean),
    ("val_token.csv", df_validation_clean),
  ]

  # Write all three CSVs first, then offer them for download from Colab.
  for _name, _frame in _token_exports:
    _frame.to_csv(_name, index=False)

  from google.colab import files
  for _name, _ in _token_exports:
    files.download(_name)

In [None]:
# handle pre-preprocess data 
if load_pre:
  def _parse_token_list(cell):
    # The CSV stores each token list as its text form "['a', 'b', ...]":
    # drop the leading "['" and trailing "']" and split on the "', '" separator.
    # NOTE(review): a token containing a quote character would presumably not
    # round-trip through this split; the parsing is kept as is because the
    # vocabulary built below depends on its exact output.
    return cell[2:-2].split("', '")

  df_train_clean = df_train.copy()
  df_test_clean = df_test.copy()
  df_validation_clean = df_validation.copy()

  for _raw, _clean in (
    (df_train, df_train_clean),
    (df_test, df_test_clean),
    (df_validation, df_validation_clean),
  ):
    for _column in ("summary", "title"):
      _clean[_column] = _raw[_column].apply(_parse_token_list)

In [None]:
# add start and end token
START_TOKEN = "<s>"
END_TOKEN = "</s>"
UNK_TOKEN = "UNK"

def _wrap_title(tokens):
  # Only titles (the decoder side) are wrapped with the start/end markers.
  return [START_TOKEN] + tokens + [END_TOKEN]

for _frame in (df_train_clean, df_test_clean, df_validation_clean):
  _frame["title"] = _frame["title"].apply(_wrap_title)

In [None]:
def wordMap(sentence_list):
  """Build the vocabulary lookups from an iterable of token lists.

  The UNK, start and end tokens take indices 1, 2 and 3; every other token gets
  the next free index in order of first appearance. Index 0 is never assigned.

  Returns:
    (word2idx, idx2word): token -> index and index -> token dictionaries.
  """
  word2idx = {UNK_TOKEN: 1, START_TOKEN: 2, END_TOKEN: 3}

  for token in (tok for sentence in sentence_list for tok in sentence):
    if token not in word2idx:
      word2idx[token] = len(word2idx) + 1

  idx2word = dict(zip(word2idx.values(), word2idx.keys()))
  return word2idx, idx2word

# The vocabulary is built from the training split only (summaries first, then titles).
word2idx, idx2word = wordMap(df_train_clean["summary"].tolist() + df_train_clean["title"].tolist())

def _to_indices(tokens):
  # Tokens missing from the vocabulary fall back to index 1 (the UNK index).
  return [word2idx.get(token, 1) for token in tokens]

for _frame in (df_train_clean, df_test_clean, df_validation_clean):
  for _column in ("summary", "title"):
    _frame[_column] = _frame[_column].apply(_to_indices)

In [None]:
# Longest summary / title (in tokens) in the training split; used as padding lengths.
maxlen_summary = max(map(len, df_train_clean["summary"]))
maxlen_title = max(map(len, df_train_clean["title"]))
print("maxlen_summary: ", maxlen_summary)
print("maxlen_title: ", maxlen_title)

maxlen_summary:  439
maxlen_title:  48


In [None]:
# padding
X_train = pad_sequences(df_train_clean["summary"], maxlen=maxlen_summary, dtype='int32', padding='post', truncating='post',value=0)
y_train = pad_sequences(df_train_clean["title"], maxlen=maxlen_title, dtype='int32', padding='post', truncating='post',value=0)
X_test = pad_sequences(df_test_clean["summary"], maxlen=maxlen_summary, dtype='int32', padding='post', truncating='post',value=0)
y_test = pad_sequences(df_test_clean["title"], maxlen=maxlen_title, dtype='int32', padding='post', truncating='post',value=0)
X_val = pad_sequences(df_validation_clean["summary"], maxlen=maxlen_summary, dtype='int32', padding='post', truncating='post',value=0)
y_val = pad_sequences(df_validation_clean["title"], maxlen=maxlen_title, dtype='int32', padding='post', truncating='post',value=0)

# Model

In [None]:
%tensorflow_version 1.15
import tensorflow as tf
import re           
import os
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords   
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

import nltk
import os
from tensorflow.python.keras.layers import Layer
from tensorflow.python.keras import backend as K
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

`%tensorflow_version` only switches the major version: 1.x or 2.x.
You set: `1.15`. This will be interpreted as: `1.x`.


TensorFlow is already loaded. Please restart the runtime to change versions.


## Attention

In [None]:
class AttentionLayer(Layer):
    """
    This class implements Bahdanau attention (https://arxiv.org/pdf/1409.0473.pdf).
    There are three sets of weights introduced W_a, U_a, and V_a

    The layer is called with a list [encoder_output_sequence, decoder_output_sequence]
    and returns (context_vectors, attention_weights), one entry per decoder timestep.
     """

    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        # input_shape is [encoder_output_shape, decoder_output_shape]
        assert isinstance(input_shape, list)
        # Create a trainable weight variable for this layer.

        # W_a: (encoder_hidden, encoder_hidden), applied to the encoder outputs
        self.W_a = self.add_weight(name='W_a',
                                   shape=tf.TensorShape((input_shape[0][2], input_shape[0][2])),
                                   initializer='uniform',
                                   trainable=True)
        # U_a: (decoder_hidden, encoder_hidden), applied to one decoder output
        self.U_a = self.add_weight(name='U_a',
                                   shape=tf.TensorShape((input_shape[1][2], input_shape[0][2])),
                                   initializer='uniform',
                                   trainable=True)
        # V_a: (encoder_hidden, 1), reduces the tanh activations to one score per encoder step
        self.V_a = self.add_weight(name='V_a',
                                   shape=tf.TensorShape((input_shape[0][2], 1)),
                                   initializer='uniform',
                                   trainable=True)

        super(AttentionLayer, self).build(input_shape)  # Be sure to call this at the end

    def call(self, inputs, verbose=False):
        """
        inputs: [encoder_output_sequence, decoder_output_sequence]

        Returns (c_outputs, e_outputs): the context vectors and the softmax
        attention weights produced for every decoder timestep.
        Set verbose=True to print intermediate tensor shapes.
        """
        assert type(inputs) == list
        encoder_out_seq, decoder_out_seq = inputs
        if verbose:
            print('encoder_out_seq>', encoder_out_seq.shape)
            print('decoder_out_seq>', decoder_out_seq.shape)

        def energy_step(inputs, states):
            """ Step function for computing energy for a single decoder state
            inputs: (batchsize * 1 * de_in_dim)
            states: (batchsize * 1 * de_latent_dim)
            """

            # `states` is only type-checked; its values are not used by this step.
            assert_msg = "States must be an iterable. Got {} of type {}".format(states, type(states))
            assert isinstance(states, list) or isinstance(states, tuple), assert_msg

            """ Some parameters required for shaping tensors"""
            # NOTE: these three locals are not used in the rest of this function.
            en_seq_len, en_hidden = encoder_out_seq.shape[1], encoder_out_seq.shape[2]
            de_hidden = inputs.shape[-1]

            """ Computing S.Wa where S=[s0, s1, ..., si]"""
            # <= batch size * en_seq_len * latent_dim
            # NOTE: this product does not depend on the step input, yet it is evaluated inside the step function.
            W_a_dot_s = K.dot(encoder_out_seq, self.W_a)

            """ Computing hj.Ua """
            U_a_dot_h = K.expand_dims(K.dot(inputs, self.U_a), 1)  # <= batch_size, 1, latent_dim
            if verbose:
                print('Ua.h>', U_a_dot_h.shape)

            """ tanh(S.Wa + hj.Ua) """
            # <= batch_size*en_seq_len, latent_dim
            Ws_plus_Uh = K.tanh(W_a_dot_s + U_a_dot_h)
            if verbose:
                print('Ws+Uh>', Ws_plus_Uh.shape)

            """ softmax(va.tanh(S.Wa + hj.Ua)) """
            # <= batch_size, en_seq_len
            e_i = K.squeeze(K.dot(Ws_plus_Uh, self.V_a), axis=-1)
            # <= batch_size, en_seq_len
            e_i = K.softmax(e_i)

            if verbose:
                print('ei>', e_i.shape)

            return e_i, [e_i]

        def context_step(inputs, states):
            """ Step function for computing ci using ei """

            # `states` is only type-checked; its values are not used by this step.
            assert_msg = "States must be an iterable. Got {} of type {}".format(states, type(states))
            assert isinstance(states, list) or isinstance(states, tuple), assert_msg

            # Weighted sum of the encoder outputs using the attention weights in `inputs`.
            # <= batch_size, hidden_size
            c_i = K.sum(encoder_out_seq * K.expand_dims(inputs, -1), axis=1)
            if verbose:
                print('ci>', c_i.shape)
            return c_i, [c_i]

        # Placeholder initial states for K.rnn; they have the same shapes as the
        # step outputs (c_i and e_i respectively).
        fake_state_c = K.sum(encoder_out_seq, axis=1)  # <= (batch_size, latent_dim)
        fake_state_e = K.sum(encoder_out_seq, axis=2)  # <= (batch_size, enc_seq_len)

        """ Computing energy outputs """
        # e_outputs => (batch_size, de_seq_len, en_seq_len)
        last_out, e_outputs, _ = K.rnn(
            energy_step, decoder_out_seq, [fake_state_e],
        )

        """ Computing context vectors """
        last_out, c_outputs, _ = K.rnn(
            context_step, e_outputs, [fake_state_c],
        )

        return c_outputs, e_outputs

    def compute_output_shape(self, input_shape):
        """ Outputs produced by the layer """
        # NOTE(review): the first shape uses the decoder feature size (input_shape[1][2]),
        # while the context vectors in call() are sums over the encoder outputs, whose
        # feature size is input_shape[0][2]. The two agree only when encoder and decoder
        # hidden sizes are equal -- confirm before using different sizes.
        return [
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[1][2])),
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[0][1]))
        ]

## Embedding

In [None]:
from pythainlp import word_vector

In [None]:
# Build the initial embedding matrix: row i holds the pre-trained vector of the
# word with index i. Row 0 (padding) and rows of words without a pre-trained
# vector stay all-zero.
word_vector_model = word_vector.get_model()
embedding_weights = np.zeros([len(word2idx) + 1, 300])
for word, i in word2idx.items():
  try:
    embedding_weights[i] = word_vector_model[word]
  except KeyError:
    # Word is not in the pre-trained vocabulary: keep its zero row.
    # Only the lookup miss is swallowed; any other error (for example a vector
    # size that does not match the 300 columns) now surfaces instead of being
    # hidden by a bare `except`.
    # Assumes the model raises KeyError for unknown words, as gensim KeyedVectors does -- TODO confirm.
    continue

## Model

In [None]:
# File name used both for the downloaded weights and for the training checkpoint.
MODEL_NAME = 'model_best_weights_news.h5'
if load_weight :
  # Fetch the saved weights from Google Drive.
  !gdown --id 15fv8gMmnSQUCifKKZ7RNgDeTRxwEGD_p

Downloading...
From: https://drive.google.com/uc?id=15fv8gMmnSQUCifKKZ7RNgDeTRxwEGD_p
To: /content/model_best_weights_news.h5
424MB [00:02, 205MB/s]


In [None]:
# Training model: 3-layer LSTM encoder, single LSTM decoder initialised with the
# final encoder states, attention over the encoder outputs, softmax over the vocabulary.
K.clear_session() 
latent_dim = 500
# One shared vocabulary for both sides; +1 because index 0 is the padding value.
x_voc_size = y_voc_size = len(word2idx) + 1

# Encoder 
encoder_inputs = Input(shape=(maxlen_summary,)) 
# Embedding initialised from the pre-trained vectors and fine-tuned during training.
enc_emb = Embedding(x_voc_size,300,weights=[embedding_weights],input_length=maxlen_summary,trainable=True)(encoder_inputs) 

#LSTM 1 
encoder_lstm1 = LSTM(latent_dim,return_sequences=True,return_state=True,dropout=0.4) 
encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb) 

#LSTM 2 
encoder_lstm2 = LSTM(latent_dim,return_sequences=True,return_state=True,dropout=0.4) 
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1) 

#LSTM 3 
encoder_lstm3=LSTM(latent_dim, return_state=True, return_sequences=True,dropout=0.4) 
encoder_outputs, state_h, state_c= encoder_lstm3(encoder_output2) 

# Set up the decoder. 
decoder_inputs = Input(shape=(None,)) 
# NOTE(review): input_length is set to maxlen_summary although the decoder input has a variable length (shape=(None,)) -- confirm this is intended.
dec_emb_layer = Embedding(x_voc_size,300,weights=[embedding_weights],input_length=maxlen_summary,trainable=True,) 

dec_emb = dec_emb_layer(decoder_inputs) 

#LSTM using encoder_states as initial state
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True,dropout=0.4) 
# The two extra return values are the states of this (unidirectional) LSTM, despite the fwd/back names.
decoder_outputs,decoder_fwd_state, decoder_back_state = decoder_lstm(dec_emb,initial_state=[state_h, state_c]) 

#Attention Layer
attn_layer = AttentionLayer(name='attention_layer') 
attn_out, attn_states = attn_layer([encoder_outputs, decoder_outputs]) 

# Concat attention output and decoder LSTM output 
decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attn_out])

#Dense layer
decoder_dense = TimeDistributed(Dense(y_voc_size, activation='softmax')) 
decoder_outputs = decoder_dense(decoder_concat_input) 

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs) 
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 439)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 439, 300)     8483100     input_1[0][0]                    
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 439, 500), ( 1602000     embedding[0][0]                  
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
______________________________________________________________________________________________

In [None]:
# The sparse loss takes integer word indices as targets, so the titles are not one-hot encoded.
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

In [None]:
# Restore the downloaded weights instead of training (see the Train section).
if load_weight:
  model.load_weights(MODEL_NAME)

In [None]:
if plot_image:
  # Diagram of the full training model, written to model.png.
  _plot_options = dict(show_shapes=False, show_layer_names=True,
                       rankdir='TB', expand_nested=False, dpi=96)
  tf.keras.utils.plot_model(model, to_file='model.png', **_plot_options)

# Train

In [None]:
if not load_weight:
  # Stop once the validation loss has not improved for 3 consecutive epochs.
  es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,patience=3)
  # Keep only the best weights (lowest validation loss) in MODEL_NAME.
  # The deprecated `period=1` argument is dropped: checking after every epoch
  # is already the default behaviour.
  checkpoint = ModelCheckpoint(MODEL_NAME, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

In [None]:
if not load_weight:
  # Teacher forcing: the decoder input is the title without its last position,
  # the target is the same title shifted by one (without its first position).
  val_data = ([X_val, y_val[:, :-1]], y_val[:, 1:])
  _train_inputs = [X_train, y_train[:, :-1]]
  _train_targets = y_train[:, 1:]

  history = model.fit(
    _train_inputs,
    _train_targets,
    epochs=30,
    batch_size=64,
    validation_data=val_data,
    callbacks=[es, checkpoint],
  )

In [None]:
if not load_weight:
  # Download the checkpoint written during training from the Colab runtime.
  from google.colab import files
  files.download(MODEL_NAME) 

In [None]:
if not load_weight:
  # Learning curves of the run above. `val_loss` is computed on the validation
  # split passed to fit(), so it is labelled as validation rather than test.
  from matplotlib import pyplot
  pyplot.plot(history.history['loss'], label='train')
  pyplot.plot(history.history['val_loss'], label='validation')
  pyplot.xlabel('epoch')
  pyplot.ylabel('loss')
  pyplot.legend()
  pyplot.show()

# Inference

In [None]:
# encoder inference
encoder_model = Model(inputs=encoder_inputs,outputs=[encoder_outputs, state_h, state_c])

# decoder inference
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_hidden_state_input = Input(shape=(maxlen_summary,latent_dim))

# Get the embeddings of the decoder sequence
dec_emb2= dec_emb_layer(decoder_inputs)

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])

#attention inference
attn_out_inf, attn_states_inf = attn_layer([decoder_hidden_state_input, decoder_outputs2])
decoder_inf_concat = Concatenate(axis=-1, name='concat')([decoder_outputs2, attn_out_inf])

# A dense softmax layer to generate prob dist. over the target vocabulary
decoder_outputs2 = decoder_dense(decoder_inf_concat)

# Final decoder model
decoder_model = Model(
[decoder_inputs] + [decoder_hidden_state_input,decoder_state_input_h, decoder_state_input_c],
[decoder_outputs2] + [state_h2, state_c2])

In [None]:
if plot_image:
  # Diagram of the inference encoder, written to encoder_model.png.
  _plot_options = dict(show_shapes=False, show_layer_names=True,
                       rankdir='TB', expand_nested=False, dpi=96)
  tf.keras.utils.plot_model(encoder_model, to_file='encoder_model.png', **_plot_options)

In [None]:
if plot_image:
  # Diagram of the inference decoder, written to decoder_model.png.
  _plot_options = dict(show_shapes=False, show_layer_names=True,
                       rankdir='TB', expand_nested=False, dpi=96)
  tf.keras.utils.plot_model(decoder_model, to_file='decoder_model.png', **_plot_options)

In [None]:
# Export both inference models; each call writes a directory ('encoder/' and 'decoder/').
encoder_model.save('encoder')
decoder_model.save('decoder')



INFO:tensorflow:Assets written to: encoder/assets


INFO:tensorflow:Assets written to: encoder/assets


INFO:tensorflow:Assets written to: decoder/assets


INFO:tensorflow:Assets written to: decoder/assets


In [None]:
from tensorflow.keras.models import load_model

# Test loading model
# Only the exported encoder is reloaded here; the decoder export is not checked.
test_model = load_model('encoder')





In [None]:
# Pack each exported model directory into a single archive.
!zip -r encoder.zip encoder/
!zip -r decoder.zip decoder/

  adding: encoder/ (stored 0%)
  adding: encoder/saved_model.pb (deflated 91%)
  adding: encoder/assets/ (stored 0%)
  adding: encoder/variables/ (stored 0%)
  adding: encoder/variables/variables.data-00000-of-00001 (deflated 8%)
  adding: encoder/variables/variables.index (deflated 52%)
  adding: decoder/ (stored 0%)
  adding: decoder/saved_model.pb (deflated 89%)
  adding: decoder/assets/ (stored 0%)
  adding: decoder/variables/ (stored 0%)
  adding: decoder/variables/variables.data-00000-of-00001 (deflated 12%)
  adding: decoder/variables/variables.index (deflated 45%)


In [None]:
def decode_sequence(input_seq):
    """Greedily decode a headline for one encoded input sequence.

    The input is encoded once. The decoder is then called one token at a time,
    starting from the start token, and the most probable word is taken at each
    step. Decoding stops when the end token is produced or the length limit is
    reached.

    Args:
        input_seq: padded index array passed to `encoder_model.predict`
            (callers in this notebook pass shape (1, maxlen_summary)).

    Returns:
        The generated tokens as one string, each token preceded by a space.
        The end token is not included.
    """
    # Encode the input as state vectors.
    e_out, e_h, e_c = encoder_model.predict(input_seq)

    # Target sequence of length 1 holding the start token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = word2idx[START_TOKEN]

    # Hard limit on decoder steps. The word-count check below cannot stop the
    # loop on its own: tokens that are empty strings add no word to the count,
    # so a model that keeps emitting them would never terminate.
    max_steps = max(1, maxlen_title - 1)

    decoded_sentence = ''
    for _ in range(max_steps):
        output_tokens, h, c = decoder_model.predict([target_seq] + [e_out, e_h, e_c])

        # Greedy choice: most probable word at the last position.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        # Index 0 is the padding value and has no entry in idx2word; treat it
        # as the end of the sequence instead of raising KeyError.
        sampled_token = idx2word.get(sampled_token_index, END_TOKEN)

        if sampled_token == END_TOKEN:
            break
        decoded_sentence += ' ' + sampled_token

        # Exit condition kept from the original loop: enough words generated.
        if len(decoded_sentence.split()) >= (maxlen_title - 1):
            break

        # Feed the sampled token and the new states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        e_h, e_c = h, c

    return decoded_sentence

In [None]:
def seqIdx2text(input_seq):
    """Turn a sequence of word indices back into text.

    Only indices greater than the end-token index are kept (with this
    notebook's vocabulary that skips padding, UNK, start and end). Every kept
    word is followed by a single space.
    """
    return ''.join(
        idx2word[index] + ' '
        for index in input_seq
        if index > word2idx[END_TOKEN]
    )

In [None]:
# Compare reference and generated headlines for ten test samples (rows 100-109).
for i in range(100, 110):
  test_x, test_y = X_test[i:i+1], y_test[i:i+1]
  # "Train" prediction: the full model is given the reference title as decoder input.
  result = model.predict([test_x, test_y])
  print("Actual headline:", seqIdx2text(test_y[0]))
  print("Actual content:", seqIdx2text(test_x[0]))
  # "Inference" prediction: step-by-step greedy decoding from the start token.
  print("Predicted headline (Inference):", decode_sequence(test_x.reshape(1, maxlen_summary)))
  print("Predicted headline (Train):", seqIdx2text(np.argmax(result[0], axis=1)))
  print("--------------------------------------------------------------------------")

Actual headline: ไทย คว้าแชมป์ ตะกร้อ ซูเปอร์ ซีรี่ส์ ที่ สิงคโปร์ 
Actual content: ทีม ชาติ ไทย คว้าแชมป์ การแข่งขัน ตะกร้อ  รายการ อิส  ซูเปอร์ ซีรี่ส์  ประเภท ชาย  และ หญิง  ที่ ประเทศ สิงคโปร์ 
Predicted headline (Inference):  ช้าง ศึก  -  1 9  ซ้อม  ลุย ศึก  ชิง แชมป์โลก  2  -  2  เซต  ชิง แชมป์โลก  2  ทีม
Predicted headline (Train): ช้าง       
--------------------------------------------------------------------------
Actual headline: บัว  นลิน ทิพย์   -    ธรรม ์ ธัช  ซุ่ม ปลูก ต้น รัก ใน กอง ละคร  สุด หวาน 
Actual content: เอี๊ยด  เอี๊ยด ด  เป็น การบ่ม ความรัก แบบ เงียบๆ  ระหว่าง  บัว  -  นลิน ทิพย์  กับ พระเอก น้องใหม่   -  ธรรม ์ ธัช  ที่ เจอะ เจอกัน ใน กอง ละคร  คุณแม่ สวมรอย  งาน นี้ เลย แอบ ปลูก ต้น รัก กัน ไปมา  ไม่ต้อง สวมรอย ใครๆ 
Predicted headline (Inference):  ปอ  -  เขิน  -  ท้อง  -  ท้อง  -  ท้อง  -  รัก  -  รัก  -  รัก  -  รัก  -  รัก  -  รัก  -  รัก  -  รัก  -  รัก  -  รัก  -  รัก  -  รัก  -  รัก  -  รัก  -  รัก  -  รัก  -  รัก  -  รัก  -  รัก
Predicted headline 

# Evaluation

In [None]:
# The rouge package provides the Rouge scorer imported in the next cell.
!pip install rouge

Collecting rouge
  Downloading https://files.pythonhosted.org/packages/43/cc/e18e33be20971ff73a056ebdb023476b5a545e744e3fc22acd8c758f1e0d/rouge-1.0.0-py3-none-any.whl
Installing collected packages: rouge
Successfully installed rouge-1.0.0


In [None]:
from rouge import Rouge
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

In [None]:
def evaluate_rouge_score(hypothesis, reference):
  """Return the ROUGE scores of `hypothesis` against `reference`, averaged over all pairs (avg=True)."""
  return Rouge().get_scores(hypothesis, reference, avg=True)

In [None]:
def evaluate_bleu_score(hypothesis, reference):
  """Compute corpus-level BLEU scores for whitespace-tokenized strings.

  Args:
    hypothesis: list of generated strings.
    reference: list of reference strings, one per hypothesis.

  Returns:
    dict with "bleu-1" (weights (1,0,0,0)), "bleu-2" (weights (0,1,0,0)) and
    "cumulative" (corpus_bleu's default weights), all using smoothing method 4.
  """
  hyp_tokens = [text.split() for text in hypothesis]
  # corpus_bleu expects a list of references per hypothesis; there is exactly one here.
  ref_tokens = [[text.split()] for text in reference]
  smoothing = SmoothingFunction().method4

  def _bleu(**options):
    return corpus_bleu(ref_tokens, hyp_tokens, smoothing_function=smoothing, **options)

  return {
    "bleu-1": _bleu(weights=(1,0,0,0)),
    "bleu-2": _bleu(weights=(0,1,0,0)),
    "cumulative": _bleu(),
  }

In [None]:
def evaluate(X, y):
  """Decode every row of `X` and score the results against the titles in `y`.

  Args:
    X: 2-D array of padded input indices, one row per sample.
    y: 2-D array of padded title indices, aligned with `X`.

  Returns:
    (rouge_scores, bleu_scores) as returned by evaluate_rouge_score and
    evaluate_bleu_score.
  """
  hypotheses, targets = [], []
  for row in tqdm(range(X.shape[0]), position=0):
    encoder_input = X[row].reshape(1, maxlen_summary)
    hypotheses.append(decode_sequence(encoder_input))
    targets.append(seqIdx2text(y[row]))

  rouge_result = evaluate_rouge_score(hypotheses, targets)
  bleu_result = evaluate_bleu_score(hypotheses, targets)
  return rouge_result, bleu_result

In [None]:
# Score only the first 100 test samples.
rouge_scores, bleu_scores = evaluate(X_test[:100], y_test[:100])

100%|██████████| 100/100 [02:44<00:00,  1.65s/it]


In [None]:
# ROUGE and BLEU results for the samples evaluated above.
print(rouge_scores)
print(bleu_scores)

{'rouge-1': {'f': 0.07293421168901013, 'p': 0.07144304382161684, 'r': 0.08653017398837831}, 'rouge-2': {'f': 0.01590428308985544, 'p': 0.01661266079765563, 'r': 0.01777729608936703}, 'rouge-l': {'f': 0.08717951230394273, 'p': 0.1077745613949174, 'r': 0.07939863722448862}}
{'bleu-1': 0.050191407911527, 'bleu-2': 0.010217681030653042, 'cumulative': 0.006351494422156746}


In [None]:
# Download words
# Fetch the vocabulary file (saved as word.txt) that loadDict() reads.
!gdown --id 1NNUDoxFKEM43MAyPx2Fz7coDVoJuo2RI

Downloading...
From: https://drive.google.com/uc?id=1NNUDoxFKEM43MAyPx2Fz7coDVoJuo2RI
To: /content/word.txt
  0% 0.00/548k [00:00<?, ?B/s]100% 548k/548k [00:00<00:00, 8.60MB/s]


In [None]:

def loadDict(path="word.txt"):
  """Load the vocabulary file and build both lookup directions.

  The file holds one word per line; the word on line k (1-based) gets index k.
  An empty line is kept and maps the empty string to its line number.

  Args:
    path: vocabulary file to read. Defaults to "word.txt" in the working directory.

  Returns:
    (word2idx, idx2word): word -> index and index -> word dictionaries.
  """
  word2idx = {}
  # The vocabulary contains Thai text, so the encoding is given explicitly
  # instead of relying on the platform default.
  with open(path, "r", encoding="utf-8") as f:
    for line in f:
      word2idx[line.replace("\n", "")] = len(word2idx) + 1

  idx2word = {v: k for k, v in word2idx.items()}
  return word2idx, idx2word

# Generate title

In [None]:
# Format input function
# Param sentence :string
# Return padded word-index array of shape (1, 439), dtype int32
def formatInput(sentence):
    """Convert raw Thai text into the padded index array expected by the encoder.

    The text goes through the same steps as the training summaries: cleaning
    (trim, lowercase, NFKD, whitespace collapse), newmm tokenization, token
    post-processing, index lookup with the vocabulary from loadDict(), and
    post-padding/truncation to MAXLEN_INPUT.

    No start/end markers are added. The encoder inputs used for training are
    indexed and padded without them (only titles are wrapped), so adding them
    here would feed the encoder tokens it never saw in that position.

    Args:
        sentence: raw input text.

    Returns:
        int32 array of shape (1, MAXLEN_INPUT) holding word indices, 0-padded at the end.
    """
    from unicodedata import normalize
    from pythainlp.tokenize import word_tokenize
    from tensorflow.keras.preprocessing.sequence import pad_sequences

    UNK_INDEX = 1        # index used for words missing from the vocabulary
    MAXLEN_INPUT = 439   # encoder input length (maxlen_summary printed earlier)

    def cleanInput(sen):
        """Trim, lowercase, NFKD-normalize and collapse whitespace."""
        sen = normalize("NFKD", sen.strip().lower())
        sen = " ".join(sen.split())
        return sen

    # split number
    def isNum(word):
        """Return True when `word` is numeric once ',' and '.' are removed."""
        return word.replace(",", "").replace(".", "").isnumeric()

    def clearAfterToken(sen):
        """Split numbers into characters and isolate brackets, dashes, '?' and quotes.

        Splitting on the temporary "~" separator leaves empty strings in the
        result; they are kept, as in the training preprocessing.
        """
        newSentence = []
        for word in sen:
            word = word.strip()
            if isNum(word):
              word = "~".join(list(word))

            word = word.replace("(", "~(~").replace(")", "~)~").replace("–", "-").replace("-", "~-~").replace("?", "~?~")
            word = word.replace("“",'~"~').replace("”",'~"~')
            word = word.replace("‘","~'~").replace("’","~'~")
            word = word.strip().split('~')
            newSentence += word
        return newSentence

    def preprocessForKeras(sen):
      """Map tokens to indices and pad/truncate to one row of MAXLEN_INPUT."""
      word2idx, _ = loadDict()
      sen = [word2idx.get(word, UNK_INDEX) for word in sen]
      sen = pad_sequences([sen], maxlen=MAXLEN_INPUT, dtype='int32', padding='post', truncating='post',value=0)
      sen = sen.reshape(1, MAXLEN_INPUT)
      return sen

    return preprocessForKeras(clearAfterToken(word_tokenize(cleanInput(sentence), engine="newmm")))

In [None]:
# Generate a headline for one raw news snippet.
# NOTE(review): the name `input` shadows Python's built-in input() for the rest of the session.
# NOTE(review): formatInput() indexes words with the vocabulary from word.txt, while decode_sequence()
# uses the in-memory word2idx/idx2word -- presumably both describe the same vocabulary; confirm.
input = formatInput("ปัตตานีตื่นทั้งตลาด เหตุ จนท.ติดเชื้อโควิด-19 จากการสัมผัสผู้ป่วยไปสถานบันเทิงแล้วมาช่วยแม่ขายของ พ่อค้าแม่ค้ารู้ข่าวต่างผวาระดมกำลังทำความสะอาดใหญ่ ด้าน ผวจ.สั่งตั้งด่านคัดกรองคนเข้า-ออก")
prediction = decode_sequence(input)
print(prediction)

 พบ โควิด  -  1 9  ราย  พบ ผู้ป่วย โควิด  -  1 9  ราย  พบ ติดเชื้อ  9  ราย  พบ ติดเชื้อ  6  ราย
