In [None]:
import os
import re
import unicodedata
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from tqdm import tqdm

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Input
from tensorflow.keras.layers import Embedding, LSTM, Dense, TimeDistributed, Layer
from tensorflow.keras import Model
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ReduceLROnPlateau

In [None]:
# Directory that holds the news-summary CSV files.
BASE_DIR = '../input/news-summary'
# Marker prepended to each headline to build the decoder input (note the trailing space).
START_TOKEN = '<start> '
# Marker appended to each headline to build the decoder target (note the leading space).
END_TOKEN = ' <end>'

In [None]:
# Shell escape: list the files available in the dataset directory.
!ls {BASE_DIR}

In [None]:
def read_data(base_dir=None):
    """Load both news-summary CSV files and stack them into one frame.

    Parameters
    ----------
    base_dir : str, optional
        Directory containing ``news_summary.csv`` and
        ``news_summary_more.csv``. When omitted, the notebook-level
        ``BASE_DIR`` is used.

    Returns
    -------
    pandas.DataFrame
        The rows of the first file followed by the rows of the second one,
        restricted to the ``headlines`` and ``text`` columns (in that order)
        and re-indexed from 0.
    """
    if base_dir is None:
        base_dir = BASE_DIR

    columns = ['headlines', 'text']
    # Read both files the same way so the result never depends on which
    # extra columns a particular CSV happens to carry.
    frames = [
        pd.read_csv(os.path.join(base_dir, filename), encoding='latin-1', usecols=columns)[columns]
        for filename in ('news_summary.csv', 'news_summary_more.csv')
    ]

    return pd.concat(frames, axis=0).reset_index(drop=True)

In [None]:
# Load the combined dataset and preview its first rows.
full_df = read_data()
full_df.head()

In [None]:
# (number of rows, number of columns) of the combined dataset.
full_df.shape

In [None]:
def print_head(limit=5):
    """Print the first ``limit`` headline/text pairs of the global ``full_df``.

    Rows are taken by position, so the function works for any index and
    simply prints fewer pairs when the frame has fewer than ``limit`` rows.
    """
    sample = full_df[['headlines', 'text']].head(limit)
    for headline, body in zip(sample['headlines'], sample['text']):
        print(f'TITLE: {headline}\nTEXT: {body}\n')


print_head()

## Preprocessing

In [None]:
# Ordered (pattern, replacement) rules applied after lower-casing and before
# URL shortening. The order matters: later rules see the output of earlier ones.
_CLEANUP_RULES = (
    (r"(\t)", ' '),                               # tab
    (r"(\r)", ' '),                               # carriage return
    (r"(\n)", ' '),                               # newline
    (r"(__+)", ' '),                              # runs of two or more '_'
    (r"(--+)", ' '),                              # runs of two or more '-'
    (r"(~~+)", ' '),                              # runs of two or more '~'
    (r"(\+\++)", ' '),                            # runs of two or more '+'
    (r"(\.\.+)", ' '),                            # runs of two or more '.'
    (r"[<>()|&©ø\[\]\'\",;?~*!]", ' '),           # assorted punctuation / symbols
    (r"(mailto:)", ' '),                          # mailto: prefix
    (r"(\\x9\d)", ' '),                           # literal "\x9<digit>" sequences
    (r"([iI][nN][cC]\d+)", 'INC_NUM'),            # inc<digits>  -> INC_NUM
    (r"([cC][mM]\d+)|([cC][hH][gG]\d+)", 'CM_NUM'),  # cm<digits> / chg<digits> -> CM_NUM
    (r"(\.\s+)", ' '),                            # '.' followed by whitespace
    (r"(\-\s+)", ' '),                            # '-' followed by whitespace
    (r"(\:\s+)", ' '),                            # ':' followed by whitespace
    (r"(\s+.\s+)", ' '),                          # a single character between whitespace
)

# scheme + slashes, host (group 3), then the rest of the URL.
_URL_PATTERN = r'((https*:\/*)([^\/\s]+))(.[^\s]+)'


def preprocess(text):
    """Normalise a raw string before tokenisation.

    Steps, in order:

    1. Remove combining marks (accents) after NFD decomposition.
    2. Lower-case the text.
    3. Apply the regex rules in ``_CLEANUP_RULES``.
    4. Replace every URL by its own host name,
       e.g. ``https://abc.xyz.net/browse/sdf-5327`` becomes ``abc.xyz.net``.
    5. Collapse whitespace runs to one space and drop single characters that
       stand alone between whitespace.

    The result is not stripped, so it may start or end with a single space.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The cleaned text.
    """
    # Strip combining marks (category 'Mn') left over from canonical decomposition.
    text = ''.join(
        ch for ch in unicodedata.normalize('NFD', text)
        if unicodedata.category(ch) != 'Mn'
    )
    text = text.lower()

    for pattern, replacement in _CLEANUP_RULES:
        text = re.sub(pattern, replacement, text)

    # A replacement function makes each URL collapse to *its own* host (the
    # previous code reused the first URL's host for every match) and avoids
    # interpreting the host as a regex replacement template. Texts without a
    # URL pass through unchanged, so no exception handling is needed.
    text = re.sub(_URL_PATTERN, lambda match: match.group(3), text)

    text = re.sub(r"(\s+)", ' ', text)      # collapse whitespace runs
    text = re.sub(r"(\s+.\s+)", ' ', text)  # drop single characters between spaces
    return text

In [None]:
%%time
# Clean both text columns; the cleaned strings overwrite the raw ones in full_df.
full_df['headlines'] = full_df['headlines'].apply(preprocess)
full_df['text'] = full_df['text'].apply(preprocess)

In [None]:
# Print the same sample rows again, now that they have been preprocessed.
print_head()

## Max length analysis

In [None]:
# Number of whitespace-separated words in every headline and every article.
headline_lens = full_df['headlines'].str.split().map(len)
text_lens = full_df['text'].str.split().map(len)

# One row, two panels: headline lengths on the left, article lengths on the right.
fig, (ax_headline, ax_text) = plt.subplots(1, 2, figsize=(12, 6))

sns.distplot(headline_lens, ax=ax_headline)
ax_headline.set_title('Headlines length distribution')

sns.distplot(text_lens, ax=ax_text)
ax_text.set_title('Text length distribution')

plt.show()

In [None]:
# Average number of whitespace-separated words per headline and per article.
print(f'Mean headline length: {headline_lens.mean()}')
print(f'Mean text length: {text_lens.mean()}')

In [None]:
# Share of headlines with at most 15 words (printed as a fraction in [0, 1], not a percentage).
headline_share = int((headline_lens <= 15).sum()) / len(headline_lens)
print(f"Headlines having length in range [0, 15]: {headline_share}")

# Share of articles with at most 62 words (also a fraction in [0, 1]).
text_share = int((text_lens <= 62).sum()) / len(text_lens)
print(f"Text having length in range [0, 62]: {text_share}")

In [None]:
# Fixed sequence lengths (in tokens) that articles and headlines are padded or truncated to.
MAX_TEXT_SEQ_LEN = 62
MAX_HEADLINE_SEQ_LEN = 15

In [None]:
# Decoder input: the headline prefixed with the start marker.
# Decoder target: the same headline suffixed with the end marker, i.e. the
# input sequence shifted left by one token.
full_df['headlines_input'] = START_TOKEN + full_df['headlines']
full_df['headlines_output'] = full_df['headlines'] + END_TOKEN

In [None]:
# The plain headline column is no longer needed once the input/target columns exist.
# NOTE(review): this rebinds full_df, so running the cell a second time raises
# because 'headlines' has already been dropped.
full_df = full_df.drop(['headlines'], axis=1)
full_df.head(2)

In [None]:
# Fixed seed so the 90/10 partition is identical on every run of the notebook.
SPLIT_SEED = 42
X_train, X_test = train_test_split(full_df, test_size=0.1, random_state=SPLIT_SEED)

# Re-index both parts from 0 so that positional lookups such as X_train[col][0] work.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

print(X_train.shape)
print(X_test.shape)

In [None]:
# Preview the first two training rows.
X_train.head(2)

In [None]:
# Decoder input and decoder target of the first training row, side by side.
X_train['headlines_input'][0], X_train['headlines_output'][0]

In [None]:
def data_preparation(X_train, X_test):
    """Fit tokenizers on the training split and encode both splits as padded id arrays.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Frames with the string columns ``text``, ``headlines_input`` and
        ``headlines_output``. Neither frame is modified.

    Returns
    -------
    dict
        ``text_tokenizer`` / ``headline_tokenizer``: the fitted tokenizers.
        ``text_train`` / ``text_test``: article ids, padded/truncated to
        ``MAX_TEXT_SEQ_LEN``.
        ``headline_{train,test}_{input,output}``: headline ids,
        padded/truncated to ``MAX_HEADLINE_SEQ_LEN``.
    """
    # '<' and '>' are not part of the filter list, so the '<start>' and
    # '<end>' markers are kept as ordinary tokens.
    token_filters = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'

    text_tokenizer = Tokenizer(filters=token_filters)
    text_tokenizer.fit_on_texts(X_train['text'])

    # The headline vocabulary is learned from the decoder inputs, which never
    # contain the end marker. Supply it as one extra fitting sample instead of
    # writing it into row 0 of the caller's frame: that chained assignment
    # altered the training data and is a silent no-op under pandas
    # copy-on-write, which would leave '<end>' out of the vocabulary.
    headline_tokenizer = Tokenizer(filters=token_filters)
    headline_tokenizer.fit_on_texts(list(X_train['headlines_input']) + [END_TOKEN])

    def encode(tokenizer, texts, maxlen):
        """Map texts to id sequences, then post-pad / post-truncate to ``maxlen``."""
        return pad_sequences(
            tokenizer.texts_to_sequences(texts),
            maxlen=maxlen,
            padding='post',
            truncating='post',
        )

    return {
        'text_tokenizer': text_tokenizer,
        'headline_tokenizer': headline_tokenizer,
        'text_train': encode(text_tokenizer, X_train['text'], MAX_TEXT_SEQ_LEN),
        'text_test': encode(text_tokenizer, X_test['text'], MAX_TEXT_SEQ_LEN),
        'headline_train_input': encode(headline_tokenizer, X_train['headlines_input'], MAX_HEADLINE_SEQ_LEN),
        'headline_train_output': encode(headline_tokenizer, X_train['headlines_output'], MAX_HEADLINE_SEQ_LEN),
        'headline_test_input': encode(headline_tokenizer, X_test['headlines_input'], MAX_HEADLINE_SEQ_LEN),
        'headline_test_output': encode(headline_tokenizer, X_test['headlines_output'], MAX_HEADLINE_SEQ_LEN),
    }

In [None]:
%%time
# Tokenize and pad both splits; `data` maps names to tokenizers and id arrays.
data = data_preparation(X_train, X_test)

In [None]:
# One more than the number of known words: index 0 is not in word_index and is
# the id this notebook treats as padding.
text_vocab_size = len(data['text_tokenizer'].word_index) + 1
headline_vocab_size = len(data['headline_tokenizer'].word_index) + 1

print(f'Text vocab size: {text_vocab_size}')
print(f'Headline vocab size: {headline_vocab_size}')

## Creating Pretrained Embedding Matrix

In [None]:
# Width of every word vector; also selects which glove.6B.<dim>d.txt file is read.
EMBEDDING_DIM = 100

In [None]:
%%time
embeddings_index = dict()
f = open(f'../input/glove6b/glove.6B.{EMBEDDING_DIM}d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1: ], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print(f'Found {len(embeddings_index)} word vectors.')

headline_embedding_matrix = np.zeros((headline_vocab_size, EMBEDDING_DIM))
for word, i in data['headline_tokenizer'].word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        headline_embedding_matrix[i] = embedding_vector

text_embedding_matrix = np.zeros((text_vocab_size, EMBEDDING_DIM))
for word, i in data['text_tokenizer'].word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        text_embedding_matrix[i] = embedding_vector

In [None]:
# Each matrix has one row per token id and EMBEDDING_DIM columns.
print(f'Shape of headline embedding matrix: {headline_embedding_matrix.shape}')
print(f'Shape of text embedding matrix: {text_embedding_matrix.shape}')

## Modeling

In [None]:
class Encoder(Layer):
    """Article encoder: an embedding lookup followed by one LSTM."""

    def __init__(self, name):
        super().__init__(name=name)

        # Embedding table initialised from the precomputed text matrix and left trainable.
        self.embedding = Embedding(
            input_dim=text_vocab_size,
            output_dim=EMBEDDING_DIM,
            weights=[text_embedding_matrix],
            input_length=MAX_TEXT_SEQ_LEN,
            trainable=True,
            name='encoder_embedding',
        )
        # return_state=True additionally yields the final hidden and cell
        # states, which are what the decoder is initialised with.
        self.lstm = LSTM(
            units=128,
            return_sequences=True,
            return_state=True,
            name='encoder_lstm',
        )

    def call(self, x):
        """Return (per-step outputs, final hidden state, final cell state) for token ids ``x``."""
        embedded = self.embedding(x)
        sequence, hidden, cell = self.lstm(embedded)
        # Keep the latest results on the instance so get_states() can return them.
        self.lstm_output, self.lstm_hidden, self.lstm_cell = sequence, hidden, cell
        return sequence, hidden, cell

    def get_states(self):
        """Return the hidden and cell state stored by the most recent call()."""
        return self.lstm_hidden, self.lstm_cell

class Decoder(Layer):
    """Headline decoder: an embedding lookup followed by one LSTM."""

    def __init__(self, name):
        super().__init__(name=name)

        # Embedding table initialised from the precomputed headline matrix and left trainable.
        self.embedding = Embedding(
            input_dim=headline_vocab_size,
            output_dim=EMBEDDING_DIM,
            weights=[headline_embedding_matrix],
            input_length=None,
            trainable=True,
            name='decoder_embedding',
        )
        self.lstm = LSTM(
            units=128,
            return_sequences=True,
            return_state=True,
            name='decoder_lstm',
        )

    def call(self, x, lstm_hidden, lstm_cell):
        """Run the LSTM over token ids ``x`` starting from the given states.

        Returns the per-step outputs together with the new hidden and cell state.
        """
        embedded = self.embedding(x)
        sequence, next_hidden, next_cell = self.lstm(
            embedded, initial_state=[lstm_hidden, lstm_cell]
        )
        return sequence, next_hidden, next_cell

class EncoderDecoder(Model):
    """Seq2seq model: encoder states initialise the decoder, whose outputs are projected onto the headline vocabulary."""

    def __init__(self):
        super().__init__()
        self.encoder = Encoder(name='encoder')
        self.decoder = Decoder(name='decoder')
        # Softmax over the headline vocabulary, applied independently at every time step.
        vocab_projection = Dense(units=headline_vocab_size, activation='softmax')
        self.decoder_dense = TimeDistributed(vocab_projection, name='decoder_dense')

    def call(self, x):
        """``x`` is a pair ``(text ids, headline input ids)``; returns per-step vocabulary probabilities."""
        text, summary = x
        # Only the encoder's final states are used; its per-step outputs are discarded.
        _, encoder_hidden, encoder_cell = self.encoder(text)
        decoded, _, _ = self.decoder(summary, encoder_hidden, encoder_cell)
        return self.decoder_dense(decoded)

In [None]:
# Instantiate the seq2seq model; its weights are created on first use.
model = EncoderDecoder()

In [None]:
optimizer = Adam(learning_rate=0.002)
# Targets are integer token ids, hence the sparse cross-entropy loss.
# 'accuracy' is tracked so that history.history contains the 'accuracy' /
# 'val_accuracy' series that plot() reads; without it that lookup raises KeyError.
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Multiply the learning rate by 0.2 whenever val_loss has not improved for 3 epochs.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, verbose=1)

# The model takes [article ids, decoder input ids] and predicts the decoder target ids.
train_inputs = [data['text_train'], data['headline_train_input']]
validation_set = (
    [data['text_test'], data['headline_test_input']],
    data['headline_test_output'],
)

# Train the model
history = model.fit(
    x=train_inputs,
    y=data['headline_train_output'],
    batch_size=512,
    epochs=100,
    validation_data=validation_set,
    callbacks=[reduce_lr],
)

In [None]:
def plot(history):
    """Plot the training curves stored in a Keras ``History`` object.

    A loss panel is always drawn. An accuracy panel is added only when
    ``history.history`` contains an ``'accuracy'`` series, so the function
    also works for models compiled without that metric. Validation curves
    are drawn when the matching ``'val_*'`` series exist.

    Parameters
    ----------
    history : object
        Anything exposing a ``history`` dict that maps metric names to
        per-epoch value lists (e.g. the return value of ``model.fit``).
    """
    records = history.history

    # (metric key, axis label, legend position) for each panel to draw.
    panels = [('loss', 'Loss', 'upper right')]
    if 'accuracy' in records:
        panels.append(('accuracy', 'Accuracy', 'lower right'))

    plt.style.use('ggplot')
    # squeeze=False keeps `axes` two-dimensional even when there is one panel.
    fig, axes = plt.subplots(1, len(panels), figsize=(12, 6), squeeze=False)

    for ax, (metric, label, legend_loc) in zip(axes[0], panels):
        for prefix, split_name in (('', 'train'), ('val_', 'validation')):
            values = records.get(prefix + metric)
            if values is None:
                continue
            ax.plot(np.arange(0, len(values)), values, label=f'{split_name}_{metric}')

        ax.set_title(f'Epochs vs. {label}')
        ax.set_xlabel('Epoch #')
        ax.set_ylabel(label)  # the accuracy panel used to be labelled 'Loss'
        ax.legend(loc=legend_loc)

    fig.tight_layout()
    plt.show()

In [None]:
# Draw the training curves recorded by model.fit.
plot(history)

## Inference using Seq2Seq model

In [None]:
# List the model's layers with their positions; the layer names printed here
# are the ones passed to model.get_layer() during inference.
for idx, layer in enumerate(model.layers):
    print(f'{idx} => {layer.name}')

We first encode the input sequence with the encoder, so that the encoder's final hidden and cell states can be used by the decoder to generate the text summary (a news headline in this case).

In [None]:
# Number of distinct tokens known to the headline tokenizer.
len(data['headline_tokenizer'].word_index.items())

In [None]:
# Invert each tokenizer's word -> id table so that predicted ids can be turned back into words.
index_to_word_text = {
    token_id: token for token, token_id in data['text_tokenizer'].word_index.items()
}
index_to_word_headline = {
    token_id: token for token, token_id in data['headline_tokenizer'].word_index.items()
}

# Id 0 is not in word_index; it is the value the sequences are padded with.
index_to_word_text[0] = '<pad>'
index_to_word_headline[0] = '<pad>'

In [None]:
# Example text to summarize: the padded id sequence of the first test article.
sample_text = data['text_test'][0]
sample_text

In [None]:
# Element-wise id -> word lookups that can be applied to whole id arrays.
fun_text = np.vectorize(lambda x: index_to_word_text[x])
fun_headline = np.vectorize(lambda x: index_to_word_headline[x])

In [None]:
# The sample article converted back to words ('<pad>' marks padding positions).
fun_text(sample_text)

In [None]:
# Decoder input ids of the first test headline.
sample_decoder_input = data['headline_test_input'][0]
sample_decoder_input

In [None]:
# The same decoder input converted back to words.
fun_headline(sample_decoder_input)

In [None]:
def summarize(long_text):
    """Generate a headline for one encoded article by greedy decoding.

    The article is encoded once; the decoder is then fed one token at a time,
    starting from the start marker, and at each step the most probable next
    token is chosen. Decoding stops at the end marker or after
    ``MAX_HEADLINE_SEQ_LEN`` words.

    Parameters
    ----------
    long_text : numpy.ndarray
        Token ids of a single article (any shape that reshapes to one row).

    Returns
    -------
    str
        The generated words joined by single spaces.
    """
    # Resolve the layers once instead of on every decoding step.
    encoder = model.get_layer('encoder')
    decoder = model.get_layer('decoder')
    projection = model.get_layer('decoder_dense')

    # Look the marker ids/words up from the tokenizer rather than assuming
    # that the start marker happens to have id 1.
    start_id = data['headline_tokenizer'].word_index[START_TOKEN.strip()]
    end_word = END_TOKEN.strip()

    _, hidden_state, cell_state = encoder(long_text.reshape(1, -1))

    summary = []
    token = np.array([[start_id]])
    while len(summary) < MAX_HEADLINE_SEQ_LEN:
        decoded, hidden_state, cell_state = decoder(token, hidden_state, cell_state)
        # Shape (1, 1): the id of the most probable next token.
        token = np.argmax(projection(decoded), axis=-1)
        # Index explicitly; int() on a 2-D array is deprecated in NumPy.
        pred = index_to_word_headline[int(token[0, 0])]

        if pred == end_word:
            break

        summary.append(pred)

    return ' '.join(summary)

In [None]:
# Compare generated and reference headlines for test rows 100..199.
for idx in range(100, 200):
    predicted = summarize(data['text_test'][idx])

    # Convert ids back to words, leaving out padding and the end marker.
    actual = ' '.join(
        word
        for word in fun_headline(data['headline_test_output'][idx])
        if word not in ['<pad>', '<end>']
    )
    source_text = " ".join(
        word
        for word in fun_text(data["text_test"][idx])
        if word not in ["<pad>", "<end>"]
    )

    print(f'GENERATED: {predicted}\n\nACTUAL: {actual}\n\nACTUAL TEXT: {source_text}\n')
    print('='*128)