# New York Times News Summarization
Sirut Buasai, sbausai2@wpi.edu <br>
Jason Dykstra, jpdykstra@wpi.edu <br>
Adam Yang, ayang@wpi.edu
## Libraries

In [None]:
# Data Processing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Text Processing
import nltk
import re
import spacy
from nltk.corpus import stopwords
from nltk import pos_tag_sents

# Model Building
import tensorflow as tf
from keras.utils import pad_sequences
from keras.preprocessing.text import Tokenizer

# Google Colab
# from google.colab import drive
# drive.mount('/content/gdrive')

# Downloads
# NLTK resources: the stopword list and the perceptron part-of-speech tagger
for nltk_resource in ('stopwords', 'averaged_perceptron_tagger'):
  nltk.download(nltk_resource)
# spaCy pipeline object used below for named-entity recognition
NER = spacy.load("en_core_web_sm")

## Global Variables

In [None]:
# length every tokenized abstract is padded to; also the encoder input width
MAX_ABSTRACT_LEN = 30
# length every tokenized title is padded to; also bounds the decoded title length
MAX_TITLE_LEN = 12
# number of units in every encoder/decoder LSTM layer
LATENT_DIMENSION = 300 
# output size of the encoder and decoder embedding layers
EMBEDDING_DIMENSION = 100 
# number of training epochs passed to model.fit
EPOCH = 10
# mini-batch size passed to model.fit
BATCH_SIZE = 128
# fraction of the CSV rows (taken from the top) that is actually used
DATA_SET_RATIO = 0.5

## Data Pre-processing
### Initial Data Inspection

In [None]:
# load the raw NYT dataset and keep only the leading DATA_SET_RATIO fraction of rows
raw_data_path = 'NYT_Dataset.csv'
raw_data = pd.read_csv(raw_data_path)
# take an explicit copy of the slice: preprocess_data modifies this frame in place,
# and doing that on a slice of another frame triggers pandas' SettingWithCopyWarning
raw_data = raw_data.iloc[:int(DATA_SET_RATIO * len(raw_data))].copy()
print(f'dataframe shape: {raw_data.shape}')
# bare expression so the notebook renders the frame
raw_data

### Clean Data
#### Part of Speech Processing Function

In [None]:
def pos_processing(abstract_sentences):
  """Build noun, verb and adjective attention masks for each abstract.

  Every sentence is split on whitespace and tagged with `pos_tag_sents`.
  Each tagged sentence is then turned into three lists of 0/1 flags, one
  flag per token, marking whether the token's tag belongs to the noun,
  verb or adjective tag group.

  Args:
    abstract_sentences: pandas Series of sentence strings.

  Returns:
    Tuple `(noun_attn_mask, verb_attn_mask, adj_attn_mask)`; each is a
    pandas Series (fresh 0..n-1 index) whose elements are lists of 0/1.
  """
  # Penn Treebank tag groups
  noun_tags = frozenset(('NN', 'NNS', 'NNP', 'NNPS'))
  # 'VBZ' (3rd person singular present) used to be misspelled 'VPZ',
  # so those verbs were never flagged in the verb mask
  verb_tags = frozenset(('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'))
  adj_tags = frozenset(('JJ', 'JJR', 'JJS'))

  # identify parts-of-speech; pos_tag_sents requires a list of token lists
  abstract_tokens = abstract_sentences.apply(lambda sentence: sentence.split())
  abstract_pos = pd.Series(pos_tag_sents(abstract_tokens))

  def build_mask(tag_group):
    # one 0/1 flag per (word, tag) pair of every tagged sentence
    return abstract_pos.apply(lambda tagged: [1 if tag in tag_group else 0 for _, tag in tagged])

  return build_mask(noun_tags), build_mask(verb_tags), build_mask(adj_tags)

#### NER Processing Function

In [None]:
def create_ne_attn_mask(entities):
  """Return a 0/1 list marking which token positions belong to a named entity.

  Args:
    entities: object with a length (number of tokens) and an `ents`
      attribute whose items expose `start` and `end` token offsets
      (end exclusive).

  Returns:
    List of `len(entities)` ints; 1 inside any entity span, 0 elsewhere.
  """
  # start with every position unmarked
  mask = [0] * len(entities)

  # overwrite each entity's [start, end) slice with ones
  for span in entities.ents:
    mask[span.start:span.end] = [1] * (span.end - span.start)

  return mask

def ner_processing(abstract_sentences):
  """Run NER over every abstract and derive a named-entity attention mask.

  Args:
    abstract_sentences: pandas Series of sentence strings.

  Returns:
    Tuple `(original_sentence, ne_attention_mask)`:
      original_sentence: Series of strings rebuilt by joining the tokens
        produced by `NER` with single spaces (so contractions come out split).
      ne_attention_mask: Series of 0/1 lists from `create_ne_attn_mask`,
        one flag per token.
  """
  # process each abstract with the NER pipeline
  # NOTE: this is supposedly faster with nlp.pipe; not used here
  parsed = abstract_sentences.apply(lambda text: NER(text))

  # rebuild the sentence from the pipeline's own tokens so that splitting it
  # on spaces yields the same number of items as the mask
  retokenized = parsed.apply(lambda doc: ' '.join(token.orth_ for token in doc))

  # one mask per parsed abstract
  entity_masks = parsed.apply(create_ne_attn_mask)

  return retokenized, entity_masks

#### Clean Text Function

In [None]:
def preprocess_data(df):
  """Clean titles and abstracts and attach attention-mask columns.

  NOTE: `df` is modified in place and the same object is returned.

  Steps: drop unused columns, drop duplicate abstracts and rows with NaN,
  lowercase `title`/`abstract`, strip non-alphanumeric characters, remove
  English stopwords, then add the named-entity and part-of-speech masks
  for the cleaned abstract and wrap the cleaned title in `sostok`/`eostok`.

  Args:
    df: DataFrame with at least `title` and `abstract` columns.

  Returns:
    The same DataFrame with `cleaned_title`, `cleaned_abstract`,
    `ner_attn_mask`, `noun_attn_mask`, `verb_attn_mask` and
    `adj_attn_mask` columns added.
  """
  # drop unnecessary columns
  unused_columns = ['Unnamed: 0', 'Date', 'ID', 'topic', 'keywords']
  df.drop(unused_columns, axis=1, inplace=True, errors='ignore')

  # drop duplicates and nan rows, then renumber the rows 0..n-1
  # (pos_processing returns Series with a fresh 0..n-1 index, so the
  # frame's index must match for the column assignments below to align)
  df.drop_duplicates(subset=['abstract'], inplace=True)
  df.dropna(axis=0, inplace=True)
  df.reset_index(drop=True, inplace=True)

  # convert texts to lowercase
  df['abstract'] = df['abstract'].str.lower()
  df['title'] = df['title'].str.lower()

  # a set makes the per-word stopword test O(1) instead of scanning a list
  stop_words = set(stopwords.words('english'))
  special_characters = re.compile(r"[^a-zA-Z0-9 ]+")

  def clean_text(text):
    # strip special characters, split into words, drop stopwords, re-join
    words = special_characters.sub('', text).split()
    return " ".join(word for word in words if word not in stop_words)

  df['cleaned_title'] = df['title'].apply(clean_text)
  df['cleaned_abstract'] = df['abstract'].apply(clean_text)

  # ATTENTION MASK GENERATION NEEDS TO BE LAST PART OF PROCESSING FUNCTION
  # identify named-entities; NER requires string sentences
  df['cleaned_abstract'], df['ner_attn_mask'] = ner_processing(df['cleaned_abstract'])

  # generate noun, verb, adj attention mask from parts-of-speech; requires string sentences
  df['noun_attn_mask'], df['verb_attn_mask'], df['adj_attn_mask'] = pos_processing(df['cleaned_abstract'])

  # add start and end tokens for title
  df['cleaned_title'] = df['cleaned_title'].apply(lambda text: 'sostok ' + text + ' eostok')

  return df

# run the cleaning pipeline; preprocess_data edits raw_data in place and returns that same frame
cleaned_data = preprocess_data(raw_data)
print(f'dataframe shape: {cleaned_data.shape}')
# bare expression so the notebook renders the frame
cleaned_data

In [None]:
# print the first few cleaned abstract/title pairs as a sanity check;
# head(5) copes with frames shorter than five rows and does not depend on index labels
preview = cleaned_data.head(5)
for abstract_text, title_text in zip(preview['cleaned_abstract'], preview['cleaned_title']):
  print("Abstract:", abstract_text)
  print("Title:", title_text)
  print("\n")

### Inspect String Length Distribution

In [None]:
# number of whitespace-separated words in every cleaned abstract and title
abstract_word_count = [len(text.split()) for text in cleaned_data['cleaned_abstract']]
title_word_count = [len(text.split()) for text in cleaned_data['cleaned_title']]

length_df = pd.DataFrame({'abstract': abstract_word_count, 'title': title_word_count})

# one histogram per column
length_df.hist(bins=30)
plt.show()

### Function for Attention Mask Error Checking

In [None]:
def attn_mask_error_checking(abstract_text_series, attn_mask_series):
  """Collect every row whose word count differs from its mask length.

  Args:
    abstract_text_series: iterable of sentence strings.
    attn_mask_series: iterable of masks (anything with a length), paired
      positionally with the sentences.

  Returns:
    Tuple `(failed_text_split_list, failed_text_list, failed_mask_list,
    failed_text_len_list, failed_mask_len_list, error_count)` describing
    the mismatching rows; a message is printed for each of them.
  """
  bad_tokens = []
  bad_texts = []
  bad_masks = []
  bad_text_lens = []
  bad_mask_lens = []

  # walk sentences and masks in lockstep, comparing word count with mask length
  for text, mask in zip(abstract_text_series, attn_mask_series):
    tokens = text.split()
    n_tokens = len(tokens)
    n_mask = len(mask)
    if n_tokens == n_mask:
      continue
    print("length of text is not the same as length of mask")
    bad_tokens.append(tokens)
    bad_texts.append(text)
    bad_masks.append(mask)
    bad_text_lens.append(n_tokens)
    bad_mask_lens.append(n_mask)

  return (bad_tokens, bad_texts, bad_masks, bad_text_lens, bad_mask_lens, len(bad_texts))

# verify each attention-mask column against the cleaned abstract: every mask
# must have exactly one entry per whitespace-separated word
for mask in ['ner_attn_mask', 'noun_attn_mask', 'verb_attn_mask', 'adj_attn_mask']:
  tsl, tl, ml, tll, mll, error = attn_mask_error_checking(cleaned_data['cleaned_abstract'], cleaned_data[mask])
  print(f"checking {mask}\n\terror: {error}")
  if error > 0:
    print(f"\t{mask} text and mask lists")
    # uncomment to dump the mismatching rows (split text, text, mask, text length, mask length)
    # print(f"\t{tsl}")
    # print(f"\t{tl}")
    # print(f"\t{ml}")
    # print(f"\t{tll}")
    # print(f"\t{mll}")

### Split Training and Testing Set

In [None]:
# hold out 10% of the (abstract, title) pairs as the test split; rows are
# shuffled with a fixed random_state so the split is reproducible.
# NOTE: only the cleaned text columns are split here, not the attention-mask columns.
abs_tr, abs_te, ttl_tr, ttl_te = train_test_split(
  np.array(cleaned_data['cleaned_abstract']),
  np.array(cleaned_data['cleaned_title']),
  test_size=0.1,
  random_state=0,
  shuffle=True)

### Tokenize Title and Abstracts

In [None]:
def tokenize_text(train_set, test_set, string_length, rare_word_threshold):
  """Fit a tokenizer on the training texts and encode both splits.

  Args:
    train_set: iterable of training strings (the tokenizer is fitted on these only).
    test_set: iterable of test strings.
    string_length: `maxlen` passed to `pad_sequences` (zero padding is added at the end).
    rare_word_threshold: words whose training frequency is below this
      value are counted as rare.

  Returns:
    Tuple `(tokenizer, train_set, test_set, vocab_size)` where the two
    sets are the padded integer sequences and `vocab_size` is the
    tokenizer's `num_words` plus one.
  """
  training_texts = list(train_set)

  # first pass: a throwaway tokenizer just to obtain the word frequencies
  counting_tokenizer = Tokenizer()
  counting_tokenizer.fit_on_texts(training_texts)
  frequencies = counting_tokenizer.word_counts.values()
  total_word_count = len(frequencies)
  rare_word_count = sum(1 for freq in frequencies if freq < rare_word_threshold)

  # second pass: the real tokenizer, with num_words set to the count of non-rare words
  tokenizer = Tokenizer(num_words=total_word_count - rare_word_count)
  tokenizer.fit_on_texts(training_texts)

  def encode(texts):
    # text -> integer sequence -> fixed-length sequence padded with zeros at the end
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=string_length, padding='post')

  train_set = encode(train_set)
  test_set = encode(test_set)

  # get vector size
  vocab_size = tokenizer.num_words + 1

  return tokenizer, train_set, test_set, vocab_size

# abstracts: words seen fewer than 3 times in the training split count as rare
abs_tokenizer, abs_tr, abs_te, abs_size = tokenize_text(abs_tr, abs_te, MAX_ABSTRACT_LEN, 3)
# titles: words seen fewer than 6 times in the training split count as rare
ttl_tokenizer, ttl_tr, ttl_te, ttl_size = tokenize_text(ttl_tr, ttl_te, MAX_TITLE_LEN, 6)

### Remove Empty Texts

In [None]:
def remove_empty_text(abstract, title):
  """Drop the examples whose title contains exactly two non-zero tokens.

  A title row with only two non-zero entries holds nothing but its start
  and end markers, i.e. no actual words survived tokenization.

  The previous implementation ignored its arguments and always filtered
  the global training arrays, so calling it on the test split returned
  training data. This version works on the arrays it is given.

  Args:
    abstract: 2-D array of padded abstract sequences.
    title: 2-D array of padded title sequences, row-aligned with `abstract`.

  Returns:
    Tuple `(abstract, title)` with the empty-title rows removed from both.
  """
  # rows of `title` that hold exactly two non-padding tokens
  indices = [
    row for row, tokens in enumerate(title)
    if sum(1 for token in tokens if token != 0) == 2
  ]

  abstract = np.delete(abstract, indices, axis=0)
  title = np.delete(title, indices, axis=0)
  print(f"removed {len(indices)} empty texts.")

  return abstract, title

# apply remove_empty_text to the training arrays, then to the test arrays
abs_tr, ttl_tr = remove_empty_text(abs_tr, ttl_tr)
abs_te, ttl_te = remove_empty_text(abs_te, ttl_te)

## Build LSTM Model

In [None]:
# Training graph: a 3-layer LSTM encoder over the abstract, a single LSTM
# decoder over the title, attention of decoder outputs over encoder outputs,
# and a per-timestep softmax over the title vocabulary.
tf.keras.backend.clear_session() 

# Encoder
# input: one padded abstract of MAX_ABSTRACT_LEN token ids per example
encoder_inputs = tf.keras.layers.Input(shape=(MAX_ABSTRACT_LEN,))

# embedding layer
enc_emb =  tf.keras.layers.Embedding(abs_size, 
                                     EMBEDDING_DIMENSION, 
                                     trainable=True)(encoder_inputs)

# encoder lstm 1
encoder_lstm1 = tf.keras.layers.LSTM(LATENT_DIMENSION, 
                                     return_sequences=True, 
                                     return_state=True, 
                                     dropout=0.4, 
                                     recurrent_dropout=0.4)
encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb)

# encoder lstm 2
encoder_lstm2 = tf.keras.layers.LSTM(LATENT_DIMENSION,
                                     return_sequences=True,
                                     return_state=True,
                                     dropout=0.4,
                                     recurrent_dropout=0.4)
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)

# encoder lstm 3
# only this last layer's outputs and states (state_h, state_c) feed the decoder
encoder_lstm3 = tf.keras.layers.LSTM(LATENT_DIMENSION, 
                                     return_state=True, 
                                     return_sequences=True,
                                     dropout=0.4,
                                     recurrent_dropout=0.4)
encoder_outputs, state_h, state_c= encoder_lstm3(encoder_output2)

# set up the decoder, using the final encoder states as initial state.
# shape=(None,) leaves the sequence length open, so the same input is reused
# for one-token-at-a-time decoding in the inference model
decoder_inputs = tf.keras.layers.Input(shape=(None,))

# embedding layer
dec_emb_layer = tf.keras.layers.Embedding(ttl_size, 
                                          EMBEDDING_DIMENSION, 
                                          trainable=True)
dec_emb = dec_emb_layer(decoder_inputs)

decoder_lstm = tf.keras.layers.LSTM(LATENT_DIMENSION, 
                                    return_sequences=True, 
                                    return_state=True,
                                    dropout=0.4,
                                    recurrent_dropout=0.2)
# NOTE(review): despite their names, decoder_fwd_state / decoder_back_state are
# just the two state tensors this (non-bidirectional) LSTM returns because of
# return_state=True; neither is used afterwards in this cell
decoder_outputs, decoder_fwd_state, decoder_back_state = decoder_lstm(dec_emb,
                                                                     initial_state=[state_h, state_c])

# NOTE: the attention layer IS active; it uses the built-in tf.keras.layers.Attention
# (an earlier note said it was taken out because the tutorial uses a custom attention layer)
# attention layer: decoder outputs are the first input, encoder outputs the second
attn_layer = tf.keras.layers.Attention()
attn_out = attn_layer([decoder_outputs, encoder_outputs])

# concat attention output and decoder LSTM output
decoder_concat_input = tf.keras.layers.Concatenate()([decoder_outputs, attn_out])

# dense layer: softmax over the ttl_size title vocabulary entries at every timestep
decoder_dense =  tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(ttl_size, activation='softmax'))
decoder_outputs = decoder_dense(decoder_concat_input)
# variant without attention:
# decoder_outputs = decoder_dense(decoder_outputs)

# define the model 
model = tf.keras.models.Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.summary() 

### Train the Model

In [None]:
# initialize model
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

# train model
history = model.fit([abs_tr, ttl_tr[:,:-1]], ttl_tr.reshape(ttl_tr.shape[0], ttl_tr.shape[1], 1)[:,1:],
                    epochs=EPOCH,
                    batch_size=BATCH_SIZE,
                    validation_data=([abs_te,ttl_te[:,:-1]], ttl_te.reshape(ttl_te.shape[0], ttl_te.shape[1], 1)[:,1:]))

### Inspect Training Validation Loss

In [None]:
# per-epoch training loss vs. validation loss; the curve labelled 'test' is
# val_loss, computed on the validation_data passed to model.fit
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show()

### Vec2Word Dictionary

In [None]:
# lookup tables used when decoding:
# token id -> word for titles (target) and abstracts (source)
reverse_target_word_index=ttl_tokenizer.index_word 
reverse_source_word_index=abs_tokenizer.index_word 
# word -> token id for titles; used to look up 'sostok' / 'eostok'
target_word_index=ttl_tokenizer.word_index

## Build Inference Model

In [None]:
# encode the input sequence to get the feature vector
encoder_model = tf.keras.models.Model(inputs=encoder_inputs,
                                      outputs=[encoder_outputs, state_h, state_c])

# decoder setup
# below tensors will hold the states of the previous time step
decoder_state_input_h = tf.keras.layers.Input(shape=(LATENT_DIMENSION,))
decoder_state_input_c = tf.keras.layers.Input(shape=(LATENT_DIMENSION,))
decoder_hidden_state_input = tf.keras.layers.Input(shape=(MAX_ABSTRACT_LEN, LATENT_DIMENSION))

# get the embeddings of the decoder sequence
dec_emb2= dec_emb_layer(decoder_inputs) 
# to predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])

# NOTE: taking attention layer out for now since tutorial uses custom attention layer
# attention inference
attn_out_inf = attn_layer([decoder_outputs2, decoder_hidden_state_input])
decoder_inf_concat = tf.keras.layers.Concatenate()([decoder_outputs2, attn_out_inf])

# a dense softmax layer to generate prob dist. over the target vocabulary
decoder_outputs2 = decoder_dense(decoder_inf_concat) 
# decoder_outputs2 = decoder_dense(decoder_outputs2) 

# final decoder model
decoder_model = tf.keras.models.Model([decoder_inputs] + [decoder_hidden_state_input,decoder_state_input_h, decoder_state_input_c],
                                      [decoder_outputs2] + [state_h2, state_c2])

### Decode Inference Output Function

In [None]:
def decode_sequence(input_seq):
  """Greedily decode a title for one encoded abstract.

  The abstract is run through `encoder_model` once; `decoder_model` is
  then called repeatedly, each time fed the previously chosen token and
  the previous decoder states, and the most probable token is taken.

  Decoding stops at 'eostok', after MAX_TITLE_LEN-1 words, or when the
  chosen index has no entry in `reverse_target_word_index`. The last case
  covers index 0, which is the padding value and not a word; it used to
  raise a KeyError.

  Args:
    input_seq: array of shape (1, MAX_ABSTRACT_LEN) with the abstract's token ids.

  Returns:
    The decoded words, each preceded by a single space (so the result is
    '' when nothing was decoded and otherwise starts with a space).
  """
  # encode the input as state vectors.
  e_out, e_h, e_c = encoder_model.predict(input_seq, verbose=0)

  # target sequence of length 1, starting with the start word.
  target_seq = np.zeros((1,1))
  target_seq[0, 0] = target_word_index['sostok']

  decoded_words = []
  while True:
    output_tokens, h, c = decoder_model.predict([target_seq] + [e_out, e_h, e_c], verbose=0)

    # sample a token (greedy: highest probability at the last timestep)
    sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
    # .get instead of [...]: the padding index 0 has no word attached
    sampled_token = reverse_target_word_index.get(sampled_token_index)

    # exit condition: stop word, or padding/unknown index
    if sampled_token is None or sampled_token == 'eostok':
      break
    decoded_words.append(sampled_token)

    # exit condition: hit max length
    if len(decoded_words) >= (MAX_TITLE_LEN-1):
      break

    # update the target sequence (of length 1).
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = sampled_token_index

    # update internal states
    e_h, e_c = h, c

  # keep the historical output format: every word is prefixed with a space
  return ''.join(' ' + word for word in decoded_words)

### Seq2Abstract and Seq2Title Function

In [None]:
def seq2abstract(input_seq):
    """Turn a sequence of abstract token ids back into text.

    Zero entries (padding) are skipped; every remaining id is looked up in
    `reverse_source_word_index`. Each word is followed by a single space,
    so a non-empty result ends with a trailing space.
    """
    return ''.join(
        reverse_source_word_index[token_id] + ' '
        for token_id in input_seq
        if token_id != 0
    )

def seq2title(input_seq):
    """Turn a sequence of title token ids back into text.

    Zero entries (padding) and the ids of 'sostok' / 'eostok' are skipped;
    every remaining id is looked up in `reverse_target_word_index`. Each
    word is followed by a single space.
    """
    pieces = []
    for token_id in input_seq:
        # padding
        if token_id == 0:
            continue
        # start / end markers
        if token_id == target_word_index['sostok'] or token_id == target_word_index['eostok']:
            continue
        pieces.append(reverse_target_word_index[token_id] + ' ')
    return ''.join(pieces)

## Testing Model

In [None]:
# for every test example print the abstract, its reference title and the
# title decoded by the inference models
for i, abstract_seq in enumerate(abs_te):
  print("Abstract:", seq2abstract(abstract_seq))
  print("Original Title:", seq2title(ttl_te[i]))
  # decode_sequence is given a batch of one: shape (1, MAX_ABSTRACT_LEN)
  print("Predicted Title:", decode_sequence(abstract_seq.reshape(1, MAX_ABSTRACT_LEN)))
  print("\n")