This notebook contains the code to build and train model 3: a stacked Bi-LSTM encoder and an LSTM decoder with Bahdanau attention, using pre-trained GloVe embeddings.

I have consulted and adapted code from the following sources:
- A. Pai, “Text Summarization: Text Summarization Using Deep Learning”, 2020 Analytics Vidhya. [Online]. Available: https://www.analyticsvidhya.com/blog/2019/06/comprehensive-guide-text-summarization-using-deep-learning-python/. [Accessed: 21-Apr-2021]. 
- F. Chollet, “Keras documentation: Using pre-trained word embeddings,” Keras, 2020, [Online]. Available: https://keras.io/examples/nlp/pretrained_word_embeddings/. [Accessed: 27-May-2021]. 
- “Neural machine translation with attention,” TensorFlow. [Online]. Available: https://www.tensorflow.org/tutorials/text/nmt_with_attention. [Accessed: 27-May-2021]. 

In [None]:
import numpy as np  
import pandas as pd 
import tensorflow as tf
import keras
import re           
from bs4 import BeautifulSoup 
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords   
from tensorflow.keras.layers import Input, LSTM, Bidirectional, Embedding, Dense, Concatenate, TimeDistributed, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
import warnings
pd.set_option("display.max_colwidth", 200)
warnings.filterwarnings("ignore")
import lxml
import os

In [None]:
# Mount Google Drive at /content/drive (Colab only) so the project folder can be read and written.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Project folder on the mounted Drive (relative path); used as the root for the saved
# training arrays, embeddings, training history and word-index dictionaries.
drive_folder_path = "drive/MyDrive/Vassar/Junior Year/Spring/CMPU-366/Final Project"

In [None]:
# Name under which every artifact of this run (arrays, embeddings, history, model, word indices) is stored.
model_folder_name = "BiLSTM_model_5_glove_128_latent_200_emb_128_batch_10_epochs"

In [None]:
# Copy the article dataset and the GloVe folder from Drive into the working directory.
# (The first line is a disabled alternative dataset.)
# !cp ./drive/MyDrive/Vassar/Junior\ Year/Spring/CMPU-366/Final\ Project/articles_signal_media_100k.json .
!cp ./drive/MyDrive/Vassar/Junior\ Year/Spring/CMPU-366/Final\ Project/articles_2.json .
!cp -r ./drive/MyDrive/Vassar/Junior\ Year/Spring/CMPU-366/Final\ Project/glove6B .

In [None]:
# Copy attention.py into the working directory so that `from attention import AttentionLayer` works.
!cp ./drive/MyDrive/Vassar/Junior\ Year/Spring/CMPU-366/Final\ Project/attention.py .


In [None]:
from attention import AttentionLayer

# Load data and Preliminaries

In [None]:
# Maps English contractions to their expanded form; looked up token-by-token in preprocess_text.
# NOTE: preprocess_text lowercases the text before the lookup, so the keys that contain an
# upper-case letter ("I'd", "I'll", ...) can never match; their lower-case twins are the ones used.
contraction_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",
                           "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",
                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",
                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",
                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",
                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",
                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",
                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",
                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",
                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",
                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",
                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",
                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",
                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",
                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",
                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
                           "you're": "you are", "you've": "you have"}

In [None]:
import nltk
# Download the NLTK corpora used in this cell: the stop-word lists and the English word list.
nltk.download('stopwords')
nltk.download('words')
# Set of English words from the NLTK 'words' corpus.
# NOTE(review): not referenced anywhere else in the visible part of the notebook.
words = set(nltk.corpus.words.words())

# English stop words; preprocess_text removes them from articles but keeps them in titles.
stop_words = set(stopwords.words('english')) 

In [None]:
# reference: https://www.tensorflow.org/tutorials/text/nmt_with_attention

# Converts the unicode file to ascii
import unicodedata

def unicode_to_ascii(s):
  """Strip combining accent marks from `s`.

  The string is decomposed with NFD normalization and every character whose
  Unicode category is 'Mn' (non-spacing mark) is dropped. Characters that have
  no such decomposition are returned unchanged, so the result is not
  guaranteed to be pure ASCII.
  """
  decomposed = unicodedata.normalize('NFD', s)
  kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
  return ''.join(kept_chars)
  
def preprocess_text(text, is_article):
  """Normalize raw text into a single-space-separated string of lowercase word tokens.

  Steps, in order: lowercase/strip and remove accent marks, remove URLs and
  tokens containing slashes, normalize curly apostrophes, drop ordinal
  suffixes after digits, expand contractions, isolate punctuation, remove
  possessive "'s" and remaining apostrophes, and replace everything except
  letters and the punctuation ? . ! , ¿ by spaces. Tokens of length 1 (which
  includes every isolated punctuation mark) are discarded at the end.

  Args:
    text: raw article body or title.
    is_article: when true, English stop words are removed as well; titles
      (false) keep their stop words.

  Returns:
    The cleaned text.
  """
  cleaned = unicode_to_ascii(text.lower().strip())

  # remove url
  cleaned = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', "", cleaned)

  # remove all words containing forward slashes and backslashes
  cleaned = re.sub(r'\s*(?:[\w_]*[/\\](?:[\w_]*[/\\])*[\w_]*)', "", cleaned)

  # turn curly single quotes into plain apostrophes so contractions can be matched
  cleaned = re.sub(u"(\u2018|\u2019)", "'", cleaned)

  # remove ordinals:
  cleaned = re.sub(r'(?<=[0-9])(?:st|nd|rd|th)', "", cleaned)

  # expand contractions; a token only matches when the whole space-delimited
  # piece (including any attached punctuation) is a key of contraction_dict
  cleaned = ' '.join(contraction_dict.get(piece, piece) for piece in cleaned.split(" "))

  # creating a space between a word and the punctuation following it
  # eg: "he is a boy." => "he is a boy ."
  # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
  cleaned = re.sub(r"([?.!,¿])", r" \1 ", cleaned)
  # collapse runs of spaces and double-quote characters into one space
  cleaned = re.sub(r'[" "]+', " ", cleaned)

  # drop possessive 's, then any apostrophe that is left
  cleaned = re.sub(r"'s\b","",cleaned)
  cleaned = cleaned.replace("'", "")

  # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
  cleaned = re.sub(r"[^a-zA-Z?.!,¿]+", " ", cleaned)

  tokens = cleaned.split()
  if is_article:
    # stop words are removed from articles only, not from summaries
    tokens = [token for token in tokens if token not in stop_words]

  # keep only tokens longer than one character
  return " ".join(token for token in tokens if len(token) > 1)

In [None]:
import json

# Load the article records (a JSON array of objects with 'content' and 'title' keys).
with open('articles_2.json') as articles_file:
  data = json.load(articles_file)

# Cleaned article bodies, with stop words removed.
article_list = [
    preprocess_text(record['content'], is_article = True)
    for record in data
]

# Cleaned titles, wrapped in the start/end markers the decoder is trained with.
title_list = [
    'sostoken ' + preprocess_text(record['title'], is_article = False) + ' eostoken'
    for record in data
]

In [None]:
import matplotlib.pyplot as plt

# Number of whitespace-separated tokens in every preprocessed article and title
# (title counts include the sostoken/eostoken markers).
# Fix: the original iterated over `full_article_list`, which is never defined;
# the preprocessed articles live in `article_list`.
article_word_count = [len(article.split()) for article in article_list]
title_word_count = [len(title.split()) for title in title_list]

# The column name 'full_article' is kept because the percentile cell reads it.
length_df = pd.DataFrame({'full_article': article_word_count, 'title':title_word_count})
length_df.hist(bins = 30)
plt.show()

In [None]:
print(length_df['full_article'].values)

# Sort the article lengths once; the original re-sorted the same array on every
# loop iteration although it never changes.
full_article_lens = np.sort(length_df['full_article'].values, axis = None)

# Print the article length found at every 10th percentile (0, 10, ..., 90).
for i in range(0,100,10):
  cur_percentile_len = full_article_lens[int(len(full_article_lens) * float(i)/100)]
  print("{} percentile full text length is {}".format(i, cur_percentile_len))

In [None]:
# Upper bounds, in whitespace-separated tokens, for the pairs kept by the length filter;
# the same values are passed to tokenize() as the padded sequence lengths.
max_len_article = 512
max_len_title = 25

Keep only the article–title pairs whose article has at most `max_len_article` words and whose title has at most `max_len_title` words.

In [None]:
# Lower thresholds, in whitespace-separated tokens, used by the length filter
# (the title count includes the sostoken/eostoken markers).
min_len_article = 50
min_len_title = 5

In [None]:
# Keep the pairs that fit within the maximum lengths, except those where the
# article AND the title are both at or below their minimum length.
# NOTE(review): a pair is rejected as "too short" only when both conditions hold;
# confirm that `and` (rather than `or`) is the intended rule.
short_article_list, short_title_list = [], []
for article, title in zip(article_list, title_list):
  n_article_words = len(article.split())
  n_title_words = len(title.split())

  both_too_short = (n_article_words <= min_len_article
                    and n_title_words <= min_len_title)
  within_max = (n_article_words <= max_len_article
                and n_title_words <= max_len_title)

  if within_max and not both_too_short:
    short_article_list.append(article)
    short_title_list.append(title)


In [None]:
# Collect the length-filtered pairs in a DataFrame and display the title column.
df = pd.DataFrame({'article': short_article_list, 'title': short_title_list}) 
df['title']

In [None]:
# Number of pairs before (first line) and after (second line) the length filter.
print(len(article_list), len(title_list))
print(len(short_article_list), len(short_title_list))

# Tokenize

In [None]:
def tokenize(text_train, text_validate, max_len_text):
    """Fit a Keras Tokenizer on the training texts and encode both splits.

    The tokenizer is fitted on `text_train` only. Both splits are converted to
    integer sequences with that tokenizer and post-padded to `max_len_text`.

    Args:
        text_train: iterable of training strings.
        text_validate: iterable of validation strings.
        max_len_text: length every encoded sequence is padded to.

    Returns:
        A tuple `(tokenizer, train_sequences, validate_sequences, vocab_size)`,
        where `vocab_size` is the number of words in the tokenizer's word index
        plus one.
    """
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(list(text_train))

    def encode(texts):
        # text -> integer ids -> zero-padded (at the end) array of width max_len_text
        sequences = fitted_tokenizer.texts_to_sequences(texts)
        return pad_sequences(sequences, maxlen=max_len_text, padding='post')

    train_sequences = encode(text_train)
    validate_sequences = encode(text_validate)

    vocab_size = len(fitted_tokenizer.word_index) + 1

    return fitted_tokenizer, train_sequences, validate_sequences, vocab_size


In [None]:
from sklearn.model_selection import train_test_split
# Shuffle the article/title pairs and hold out 10% of them for validation (random_state fixed to 0).
x_train, x_validate, y_train, y_validate=train_test_split(np.array(df['article']),np.array(df['title']),test_size=0.1,random_state=0,shuffle=True)

In [None]:
# Tokenize article
# Replaces x_train / x_validate (strings) with padded integer arrays of width max_len_article.
x_tokenizer, x_train, x_validate, x_vocab_size = tokenize(x_train, x_validate, max_len_article)

In [None]:
# Display the article vocabulary size (size of the tokenizer's word index plus one).
x_vocab_size

In [None]:
# Tokenize title
# Replaces y_train / y_validate (strings) with padded integer arrays of width max_len_title.
y_tokenizer, y_train, y_validate, y_vocab_size = tokenize(y_train, y_validate, max_len_title)

In [None]:
# Sanity check: the number of 'sostoken' occurrences counted by the title tokenizer,
# shown next to the number of training titles.
y_tokenizer.word_counts['sostoken'],len(y_train)

Delete the rows that contain only the start and end tokens (`sostoken` and `eostoken`).

In [None]:
def get_rows_with_only_start_end(data):
    """Return the indices of the rows of `data` that hold exactly two non-zero entries.

    For padded title sequences that always carry a start and an end token,
    such rows contain nothing but those two tokens.

    Args:
        data: sequence of rows (e.g. a 2-D integer array), zero meaning padding.

    Returns:
        List of row indices, in ascending order.
    """
    return [
        row_idx
        for row_idx in range(len(data))
        if sum(1 for token_id in data[row_idx] if token_id != 0) == 2
    ]
    

In [None]:
# Indices of training titles that consist of the start and end tokens only.
train_row_with_only_start_end = get_rows_with_only_start_end(y_train)

# Remove those rows from both arrays so that articles and titles stay aligned.
y_train = np.delete(y_train, train_row_with_only_start_end, axis=0)
x_train = np.delete(x_train, train_row_with_only_start_end, axis=0)

In [None]:
# Same clean-up for the validation split: find titles holding only the start and end tokens ...
validate_row_with_only_start_end = get_rows_with_only_start_end(y_validate)

# ... and remove the matching rows from both the titles and the articles.
y_validate = np.delete(y_validate, validate_row_with_only_start_end, axis=0)
x_validate = np.delete(x_validate, validate_row_with_only_start_end, axis=0)

In [None]:
# Persist the encoded splits on Drive. np.save does not create missing folders,
# so make sure the run-specific target folder exists first.
training_data_dir = "{}/training_data_nd_array/{}".format(drive_folder_path, model_folder_name)
os.makedirs(training_data_dir, exist_ok=True)

np.save("{}/x_train.npy".format(training_data_dir), x_train)
np.save("{}/y_train.npy".format(training_data_dir), y_train)
np.save("{}/x_validate.npy".format(training_data_dir), x_validate)
np.save("{}/y_validate.npy".format(training_data_dir), y_validate)

# Glove Embeddings

In [None]:
# word -> 200-dimensional GloVe vector, read from the plain-text GloVe file
# (one word per line followed by its space-separated coefficients).
embeddings_dict = {}

# The encoding is given explicitly so the read does not depend on the platform's
# default encoding (the GloVe vocabulary contains non-ASCII tokens).
with open('glove6B/glove.6B.200d.txt', 'r', encoding='utf-8') as glove_file:
  for line in glove_file:
    word, coefs = line.split(maxsplit=1)
    embeddings_dict[word] = np.fromstring(coefs, "f", sep=" ")

print("Found %s word vectors." % len(embeddings_dict))


In [None]:
# Row i of the matrix holds the GloVe vector of the article-vocabulary word with
# index i; words without a GloVe vector keep an all-zero row.
embedding_matrix = np.zeros((x_vocab_size, 200))

hits = 0
misses = 0

for token, row in x_tokenizer.word_index.items():
  glove_vector = embeddings_dict.get(token)
  if glove_vector is None:
    misses += 1
    continue
  embedding_matrix[row] = glove_vector
  hits += 1

print("Converted %d words (%d misses)" % (hits, misses))

In [None]:
# Persist the article embedding matrix on Drive. np.save does not create missing
# folders, so make sure the run-specific target folder exists first.
embeddings_dir = "{}/embeddings/{}".format(drive_folder_path, model_folder_name)
os.makedirs(embeddings_dir, exist_ok=True)
np.save("{}/glove_emb.npy".format(embeddings_dir), embedding_matrix)

# Build Model

In [None]:
# Use the backend of the same Keras implementation the layers are imported from
# (tensorflow.keras), so clear_session() resets the state these layers live in.
from tensorflow.keras import backend as K

# Drop any previously built graph/layers so re-running this cell starts clean.
K.clear_session()

latent_dim = 128     # units per LSTM direction in the encoder
embedding_dim = 200  # must equal the dimensionality of the GloVe vectors (glove.6B.200d)

# ---------------------------------------------------------------- encoder

# define encoder input
# Fix: the original referenced `max_len_full_text`, which is never defined;
# articles are padded to `max_len_article`.
encoder_inputs = Input(shape=(max_len_article, ))

# define encoder embedding layer (frozen GloVe vectors of the article vocabulary)
encoder_embedding_layer = Embedding(x_vocab_size, embedding_dim, trainable=False)
encoder_embedding = encoder_embedding_layer(encoder_inputs)
encoder_embedding_layer.set_weights([embedding_matrix])

# define 1st encoder lstm layer
encoder_lstm_1 = Bidirectional(LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout = 0.4))
encoder_output_1, state_forward_h1, state_forward_c1, state_backward_h1, state_backward_c1 = encoder_lstm_1(encoder_embedding)

# define 2nd encoder lstm layer
encoder_lstm_2 = Bidirectional(LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout = 0.4))
encoder_output_2, state_forward_h2, state_forward_c2, state_backward_h2, state_backward_c2 = encoder_lstm_2(encoder_output_1)

# define 3rd encoder lstm layer
encoder_lstm_3 = Bidirectional(LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout = 0.4))
encoder_outputs, state_forward_h, state_forward_c, state_backward_h, state_backward_c  = encoder_lstm_3(encoder_output_2)

# The forward and backward final states of the last encoder layer are concatenated,
# which is why the decoder LSTM below has latent_dim * 2 units.
state_h = Concatenate()([state_forward_h, state_backward_h])
state_c = Concatenate()([state_forward_c, state_backward_c])
encoder_states = [state_h, state_c]

# ---------------------------------------------------------------- decoder

# define decoder input
decoder_inputs = Input(shape=(None, ))

# GloVe matrix for the TITLE vocabulary.
# Fix: the decoder consumes ids produced by y_tokenizer, but the original sized its
# embedding with x_vocab_size and loaded the article matrix, so every title id was
# given the vector of an unrelated article word (and ids >= x_vocab_size would be
# out of range). Row i now holds the vector of title word i.
# NOTE(review): title words without a GloVe vector keep an all-zero row; this
# presumably includes the sostoken/eostoken markers - confirm that is acceptable
# for a frozen embedding.
decoder_embedding_matrix = np.zeros((y_vocab_size, embedding_dim))
for word, i in y_tokenizer.word_index.items():
  embedding_vector = embeddings_dict.get(word)
  if embedding_vector is not None:
    decoder_embedding_matrix[i] = embedding_vector

# define decoder embedding layer (frozen GloVe vectors of the title vocabulary)
decoder_embedding_layer = Embedding(y_vocab_size, embedding_dim, trainable=False)
decoder_embedding = decoder_embedding_layer(decoder_inputs)
decoder_embedding_layer.set_weights([decoder_embedding_matrix])

# define decoder lstm layer, initialised with the encoder's final states
decoder_lstm = LSTM(latent_dim * 2, return_sequences=True, return_state=True, dropout=0.2, recurrent_dropout = 0.2)
decoder_input_state = encoder_states
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state = decoder_input_state)

# implement the Attention layer (custom layer from attention.py)
attention_layer = AttentionLayer(name='attention_layer')
attention_input_states = [encoder_outputs, decoder_outputs]
attention_output, attention_states = attention_layer(attention_input_states)

# define concatenate layer
concatenate_layer = Concatenate(axis=-1, name='concat_layer')

# define layer of concatenated attention output and decoder lstm outputs
decoder_concatenated_layer = concatenate_layer([decoder_outputs, attention_output])

# define the dense layer: a softmax over the title vocabulary at every time step
decoder_dense = TimeDistributed(Dense(y_vocab_size, activation='softmax'))
decoder_outputs = decoder_dense(decoder_concatenated_layer)

# define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.summary()

In [None]:
# Render a diagram of the model's layer graph.
tf.keras.utils.plot_model(model)

In [None]:
# The sparse variant of categorical cross-entropy is used because the targets passed
# to fit() are integer word ids rather than one-hot vectors.
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
# tensorflow is already imported at the top of the notebook; repeating the import is harmless.
import tensorflow as tf
# Open a TF1-compat interactive session whose GPU options set allow_growth=True
# (intended to let GPU memory be claimed incrementally rather than all at once).
# NOTE(review): this session is created after the model has been built and compiled;
# confirm that it still has the intended effect in this TensorFlow version.
gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
session = tf.compat.v1.InteractiveSession(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))

In [None]:
# Early-stopping callback watching the validation loss (mode 'min') with a patience of 2 epochs.
early_stopping = EarlyStopping(monitor='val_loss', mode = 'min', patience=2, verbose=1)

In [None]:
# Teacher forcing: the decoder input is every title without its last position, and the
# target is the same title without its first position (i.e. shifted one step ahead),
# reshaped to (samples, timesteps, 1). The validation data is built the same way.
history = model.fit(
    x = [x_train, y_train[:, :-1]],
    y = y_train.reshape(y_train.shape[0], y_train.shape[1], 1)[:, 1:],
    epochs = 10,
    callbacks=[early_stopping],
    batch_size = 128,
    validation_data=([x_validate,y_validate[:,:-1]], y_validate.reshape(y_validate.shape[0],y_validate.shape[1], 1)[:,1:])
  )

In [None]:
# Per-epoch training/validation metrics as a DataFrame (one row per epoch).
df_history = pd.DataFrame(history.history)

# Save them as CSV on Drive. The history folder is created when missing so the
# results of a finished training run are not lost to a FileNotFoundError.
history_csv_file = "{}/history/{}.csv".format(drive_folder_path, model_folder_name)
os.makedirs(os.path.dirname(history_csv_file), exist_ok=True)

with open(history_csv_file, mode='w') as csv_file:
    df_history.to_csv(csv_file)

In [None]:
# Save the trained model under the project's model folder on Drive
# (same location as before, now derived from drive_folder_path).
model_save_path = "{}/model/{}".format(drive_folder_path, model_folder_name)
model.save(model_save_path)

In [None]:
from matplotlib import pyplot

# Training vs. validation loss per epoch. The second curve is `val_loss`, computed on
# the validation split, so it is labelled 'validation' (the original said 'test').
fig, ax = pyplot.subplots()
ax.plot(history.history['loss'], label='train')
ax.plot(history.history['val_loss'], label='validation')
ax.set(title='Loss per epoch', xlabel='epoch', ylabel='loss')
ax.legend()
pyplot.show()

In [None]:
# Lookup tables taken from the fitted tokenizers:
# index -> word for titles and for articles, and word -> index for titles.
reverse_target_word_index=y_tokenizer.index_word
reverse_source_word_index=x_tokenizer.index_word
target_word_index=y_tokenizer.word_index

In [None]:
import json

# save word index dictionaries
# The original passed `open(...)` straight to json.dump and never closed the
# handles; each file is now written inside a `with` block, and the target folder
# is created when it does not exist yet.
# Note: JSON object keys are always strings, so the integer indices of the two
# reverse dictionaries are stored as strings.
word_idx_dir = "{}/word_idx_dict/{}".format(drive_folder_path, model_folder_name)
os.makedirs(word_idx_dir, exist_ok=True)

for file_stem, mapping in (
    ("reverse_target_word_index", reverse_target_word_index),
    ("reverse_source_word_index", reverse_source_word_index),
    ("target_word_index", target_word_index),
):
  with open("{}/{}.json".format(word_idx_dir, file_stem), 'w') as json_file:
    json.dump(mapping, json_file)
