# New York Times News Summarization
Sirut Buasai, sbausai2@wpi.edu <br>
Jason Dykstra, jpdykstra@wpi.edu <br>
Adam Yang, ayang@wpi.edu
## Libraries

In [None]:
# Data Processing
import pandas as pd
from sklearn.model_selection import train_test_split

# Text Processing
import nltk
import re
from nltk.corpus import stopwords
from nltk import pos_tag_sents
import spacy
# Load spaCy's small English pipeline once, up front; ner_processing calls it
# on each abstract to find named entities.
NER = spacy.load("en_core_web_sm")

# Model Building
import tensorflow as tf
from keras.utils import pad_sequences
from keras.preprocessing.text import Tokenizer


# Fetch the NLTK resources needed further down: the stopword lists read by
# stopwords.words('english'), and the tagger model (presumably the one backing
# pos_tag_sents -- confirm against the installed NLTK version).
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

## Data Pre-processing
### Initial Data Inspection

In [None]:
# Load the raw dataset; the relative path is resolved against the current
# working directory.
raw_data_path = 'NYT_Dataset.csv'
raw_data = pd.read_csv(raw_data_path)
# Quick sanity check on the load: (rows, columns).
print(f'dataframe shape: {raw_data.shape}')
# Bare expression: shows the frame when it is the last line of a notebook cell.
raw_data

### Clean Data

In [31]:
# FUNCTION FOR PARTS-OF-SPEECH PROCESSING
def pos_processing(abstract_tokens):
    """Build a per-token noun attention mask for each tokenized abstract.

    Every abstract is tagged with ``pos_tag_sents``; a token whose tag is one
    of ``NN``, ``NNS``, ``NNP`` or ``NNPS`` gets a 1 and every other token a 0.

    Args:
        abstract_tokens: Sequence of token lists, one list per abstract
            (``pos_tag_sents`` requires already-tokenized input).

    Returns:
        pd.Series of 0/1 lists, one list per abstract. When
        ``abstract_tokens`` is itself a Series the result carries the same
        index, so assigning it to a DataFrame column lines up row-for-row
        even if that index is not 0..n-1.
    """
    noun_tags = {'NN', 'NNS', 'NNP', 'NNPS'}  # nouns to pay attention to

    # identify parts-of-speech: one list of (word, tag) pairs per abstract
    tagged_abstracts = pos_tag_sents(abstract_tokens)

    masks = [
        [1 if tag in noun_tags else 0 for _, tag in tagged]
        for tagged in tagged_abstracts
    ]

    # Re-use the caller's index instead of a fresh RangeIndex; pandas aligns
    # on index labels when a Series is assigned to a DataFrame column.
    index = abstract_tokens.index if isinstance(abstract_tokens, pd.Series) else None
    return pd.Series(masks, index=index, dtype=object)

In [32]:
# FUNCTIONS FOR NER PROCESSING
def create_ne_attn_mask(entities):
    """Return a 0/1 list marking which token positions belong to a named entity.

    ``entities`` is a parsed document: ``len(entities)`` gives the number of
    positions in the mask and ``entities.ents`` yields spans whose
    ``start``/``end`` attributes delimit a half-open range of positions.
    """
    spans = entities.ents

    # every position starts out unmarked
    mask = [0] * len(entities)

    # switch on the [start, end) positions covered by each named entity
    for span in spans:
        mask[span.start:span.end] = [1] * (span.end - span.start)

    return mask


def ner_processing(abstract_sentences):
    """Build a named-entity attention mask for every abstract.

    Args:
        abstract_sentences: pd.Series of abstract strings (the NER pipeline
            takes raw text, not token lists).

    Returns:
        pd.Series with the same index and name as ``abstract_sentences``;
        each value is the 0/1 list that ``create_ne_attn_mask`` produces for
        that abstract's parsed document.
    """
    # Stream the texts through the pipeline in batches with ``pipe`` instead
    # of invoking it once per row; documents come back in input order.
    parsed_docs = NER.pipe(abstract_sentences.tolist())

    # create attention mask(s) using the NE's
    masks = [create_ne_attn_mask(doc) for doc in parsed_docs]

    return pd.Series(
        masks,
        index=abstract_sentences.index,
        name=abstract_sentences.name,
        dtype=object,
    )


In [None]:
# GENERAL DATA-PREPROCESSING FUNCTION
def preprocess_data(df):
    """Clean the raw dataset and derive the text and mask columns.

    The frame is modified in place and the same object is returned.

    Steps, in order: drop unused columns; drop rows with a duplicated
    abstract or any missing value; lowercase title and abstract; strip every
    character that is not a letter, digit or space; remove English
    stopwords; build the noun mask from the abstract tokens; build the
    named-entity mask from the re-joined abstract string; wrap each title in
    start/end markers.

    Args:
        df: DataFrame with at least ``title`` and ``abstract`` columns.

    Returns:
        ``df`` with the added columns ``cleaned_title``, ``cleaned_abstract``,
        ``noun_attn_mask`` and ``ne_attention_mask``.
    """
    # drop unnecessary columns; errors='ignore' skips any that are absent
    unused_columns = ['Unnamed: 0', 'Date', 'ID', 'topic', 'keywords']
    df.drop(unused_columns, axis=1, inplace=True, errors='ignore')

    # drop duplicates and nan rows, then renumber once so the index is 0..n-1
    df.drop_duplicates(subset=['abstract'], inplace=True)
    df.dropna(axis=0, inplace=True)
    df.reset_index(drop=True, inplace=True)

    # convert texts to lowercase
    df['abstract'] = df['abstract'].str.lower()
    df['title'] = df['title'].str.lower()

    # Compile the pattern and build the stopword set once: both are used for
    # every row, and set membership is O(1) where the list was scanned per word.
    special_chars = re.compile(r"[^a-zA-Z0-9 ]+")
    stop_words = frozenset(stopwords.words('english'))

    def _clean_tokens(text):
        # strip special characters, split on whitespace, remove stopwords
        words = special_chars.sub('', text).split()
        return [word for word in words if word not in stop_words]

    title_tokens = df['title'].apply(_clean_tokens)
    abstract_tokens = df['abstract'].apply(_clean_tokens)

    # store the cleaned text as space-joined strings
    df['cleaned_title'] = title_tokens.apply(" ".join)
    df['cleaned_abstract'] = abstract_tokens.apply(" ".join)

    # generate noun attention mask from parts-of-speech; needs token lists
    df['noun_attn_mask'] = pos_processing(abstract_tokens)

    # identify named-entities; NER requires string sentences
    # NOTE(review): the NER pipeline tokenizes the joined string itself, so
    # this mask may not be the same length as noun_attn_mask -- confirm.
    df['ne_attention_mask'] = ner_processing(df['cleaned_abstract'])

    # add start and end tokens for title
    # NOTE(review): Keras' Tokenizer lowercases and filters '_' by default, so
    # these markers presumably end up as 'start' / 'end' -- confirm downstream.
    df['cleaned_title'] = df['cleaned_title'].apply(lambda text: '_START_ ' + text + ' _END_')

    return df

# Run the cleaning pipeline. preprocess_data edits its argument in place, so
# cleaned_data and raw_data refer to the same (now cleaned) frame afterwards.
cleaned_data = preprocess_data(raw_data)
print(f'dataframe shape: {cleaned_data.shape}')
# Bare expression: shows the frame when it is the last line of a notebook cell.
cleaned_data

### Declare Maximum Title and Abstract Length

In [None]:
# Sequence-length caps, in tokens. MAX_ABSTRACT_LEN is the padding length used
# by tokenize_text and the width of the encoder input layer.
# NOTE(review): MAX_TITLE_LEN is not referenced anywhere in the visible code;
# titles are padded to MAX_ABSTRACT_LEN as well -- confirm this is intended.
MAX_TITLE_LEN = 100
MAX_ABSTRACT_LEN = 20

### Split Training and Testing Set

In [None]:
# Hold out 10% of the rows. Abstracts are the model inputs and titles the
# targets; random_state=0 makes the shuffled split reproducible.
abs_tr, abs_te, ttl_tr, ttl_te = train_test_split(
  cleaned_data['cleaned_abstract'],
  cleaned_data['cleaned_title'],
  test_size=0.1,
  random_state=0,
  shuffle=True)

### Tokenize Titles and Abstracts

In [None]:
def tokenize_text(train_set, test_set, max_len=None):
    """Fit a tokenizer on the training texts and encode both splits.

    Args:
        train_set: Iterable of training strings; the vocabulary is learned
            from these only.
        test_set: Iterable of held-out strings, encoded with the vocabulary
            learned from ``train_set``.
        max_len: Length every sequence is padded or truncated to. ``None``
            (the default) means the module-level ``MAX_ABSTRACT_LEN``.

    Returns:
        Tuple ``(tokenizer, train_set, test_set, vocab_size)``: the fitted
        Tokenizer, the two padded integer arrays, and
        ``len(tokenizer.word_index) + 1``.
    """
    if max_len is None:
        max_len = MAX_ABSTRACT_LEN

    # prepare a tokenizer on the training data only
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(list(train_set))

    def _encode(texts):
        # convert text sequences into integer sequences
        sequences = tokenizer.texts_to_sequences(texts)
        # pad with zeros at the end up to max_len
        # NOTE(review): pad_sequences truncates from the front by default
        # (truncating='pre'), so over-long texts lose their first tokens --
        # confirm that is intended.
        return pad_sequences(sequences, maxlen=max_len, padding='post')

    train_set = _encode(train_set)
    test_set = _encode(test_set)

    # word_index ids start at 1; the +1 leaves room for index 0, which is the
    # value pad_sequences pads with by default
    vocab_size = len(tokenizer.word_index) + 1

    return tokenizer, train_set, test_set, vocab_size

# Abstracts and titles get separate tokenizers (and vocabularies). The split
# variables are rebound from text Series to padded integer arrays.
# NOTE(review): both calls pad to MAX_ABSTRACT_LEN, titles included -- confirm.
abs_tokenizer, abs_tr, abs_te, abs_size = tokenize_text(abs_tr, abs_te)
ttl_tokenizer, ttl_tr, ttl_te, ttl_size = tokenize_text(ttl_tr, ttl_te)

## Build LSTM Model

In [None]:
from keras import backend as K 
# Reset Keras' global state so re-running this cell starts from a clean slate.
K.clear_session() 
# Used both as the embedding dimension and as the unit count of every LSTM.
latent_dim = 500 

# encoder: fixed-length abstract ids -> embeddings -> three stacked LSTMs
encoder_inputs = tf.keras.layers.Input(shape=(MAX_ABSTRACT_LEN,)) 
enc_emb = tf.keras.layers.Embedding(abs_size, latent_dim,trainable=True)(encoder_inputs) 

# LSTM 1 (its final states state_h1/state_c1 are not used in this cell)
encoder_lstm1 = tf.keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True) 
encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb) 

# LSTM 2 (its final states state_h2/state_c2 are not used in this cell)
encoder_lstm2 = tf.keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True) 
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)

# LSTM 3: its final states (state_h, state_c) seed the decoder below;
# encoder_outputs itself is not consumed in this cell
encoder_lstm3 = tf.keras.layers.LSTM(latent_dim, return_state=True, return_sequences=True) 
encoder_outputs, state_h, state_c= encoder_lstm3(encoder_output2) 

# set up the decoder; shape=(None,) leaves the title length unspecified
decoder_inputs = tf.keras.layers.Input(shape=(None,)) 
dec_emb_layer = tf.keras.layers.Embedding(ttl_size, latent_dim, trainable=True) 
dec_emb = dec_emb_layer(decoder_inputs) 

# LSTM using the last encoder layer's states as initial state.
# NOTE: despite their names, decoder_fwd_state / decoder_back_state are the
# two states an LSTM returns with return_state=True (hidden and cell state);
# there is no bidirectional layer here.
decoder_lstm = tf.keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True) 
decoder_outputs, decoder_fwd_state, decoder_back_state = decoder_lstm(dec_emb,initial_state=[state_h, state_c]) 

# dense layer: softmax over the title vocabulary at every time step
decoder_dense = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(ttl_size, activation='softmax')) 
decoder_outputs = decoder_dense(decoder_outputs) 

# define the model: (abstract ids, title ids) -> per-step word probabilities
model = tf.keras.models.Model([encoder_inputs, decoder_inputs], decoder_outputs) 
model.summary()

### Train the Model

In [None]:
# compile the model; the sparse loss takes integer word ids as targets
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

# stop training early based on validation loss (no patience is given, so the
# Keras default applies)
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1)

# train model.
# The decoder input is each title without its last position (ttl[:, :-1]);
# the target is the same title shifted left by one (positions 1..end),
# reshaped to (samples, steps, 1). So at each step the model predicts the next
# token from the preceding ones. The held-out split is the validation data.
history = model.fit([abs_tr, ttl_tr[:,:-1]], ttl_tr.reshape(ttl_tr.shape[0], ttl_tr.shape[1], 1)[:,1:],
                    epochs=50,
                    callbacks=[es],
                    batch_size=512,
                    validation_data=([abs_te,ttl_te[:,:-1]], ttl_te.reshape(ttl_te.shape[0], ttl_te.shape[1], 1)[:,1:]))
