In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from tensorflow.keras import backend
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, GRU, TimeDistributed, Attention, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import optimizers

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import time

# Notebook-wide configuration: reset matplotlib to its built-in "default"
# style, then install a catch-all filter that hides every Python warning.
plt.style.use("default")
warnings.filterwarnings(action="ignore")

In [2]:
# Reset the Keras backend session before any layers are created in this notebook.
backend.clear_session()

In [3]:
# Load the pre-cleaned dataset from the working directory. Later cells read its
# 'Content', 'Summary' and 'Summary_clean' columns.
# NOTE(review): the frame displayed further down also has an 'Unnamed: 0'
# column, which usually means the CSV was written together with its index --
# confirm, and consider reading it with `index_col=0`.
df1 = pd.read_csv("data_clean.csv")

In [4]:
# Drop repeated articles and renumber the index from zero.
# Duplicates are detected on the text columns only: the frame also carries an
# 'Unnamed: 0' column (presumably the index that was saved into the CSV --
# confirm). If that column is unique per row, a plain drop_duplicates()
# compares it as well and therefore never removes anything.
df1 = df1.drop_duplicates(subset=["Content", "Summary"]).reset_index(drop=True)

In [5]:
# Work on the first 10,000 rows of the de-duplicated frame only
# (presumably to keep the experiment's runtime manageable).
df = df1.head(10000)

In [6]:
def count_words(texts):
    """Return an array holding the number of whitespace-separated words in each text."""
    return np.array([len(text.split()) for text in texts])


# Per-row word counts of the articles and of their reference summaries.
len_c = count_words(df['Content'])
len_s = count_words(df['Summary'])

In [7]:
# Length caps counted in words: rows exceeding either cap are filtered out in
# the next cell, and the same values are reused as padding lengths and as the
# encoder input length.
max_len_content, max_len_summary = 400, 30

In [8]:
# Keep only the rows whose article AND summary both fit within the length caps,
# then renumber the index from zero.
within_limits = (len_c <= max_len_content) & (len_s <= max_len_summary)
df = df[within_limits].reset_index(drop=True)

In [9]:
# Preview the last rows of the length-filtered frame.
df.tail()

Unnamed: 0,Content,Summary,Summary_clean
4787,air frost hit last week caused catastrophic da...,english winemakers warned least half year grap...,_START_ english winemakers warned least half y...
4788,com lindsay lohan make life little harder max ...,lindsay lohan going guest star broke girls pla...,_START_ lindsay lohan going guest star broke g...
4789,researchers found less third experimental clin...,young people cancer scotland fewer clinical tr...,_START_ young people cancer scotland fewer cli...
4790,real life atlantis sunk coast egypt nearly yea...,city heracleion sunk mediterranean sea years a...,_START_ city heracleion sunk mediterranean sea...
4791,david duckenfield also accepted froze afternoo...,hillsborough police match commander agreed fai...,_START_ hillsborough police match commander ag...


In [10]:
# Hold out 10% of the rows for testing, then 10% of the remainder for
# validation (about 81% / 9% / 10% overall). Inputs are the raw article texts,
# targets are the 'Summary_clean' strings.
# A fixed seed makes both shuffles repeatable, so the split -- and with it the
# tokenizer vocabularies fitted on the training part -- is the same on every
# Restart & Run All.
SPLIT_SEED = 42

x_train, x_test, y_train, y_test = train_test_split(df['Content'].values,
                                                    df['Summary_clean'].values,
                                                    test_size=0.1,
                                                    random_state=SPLIT_SEED)

x_train, x_val, y_train, y_val = train_test_split(x_train, y_train,
                                                  test_size=0.1,
                                                  random_state=SPLIT_SEED)

In [11]:
st = time.time()

# Build the article vocabulary from the training texts only; no oov_token is
# configured on the tokenizer.
tokenizer_content = Tokenizer()
tokenizer_content.fit_on_texts(x_train)


def encode_content(texts):
    """Map raw article strings to an integer matrix, post-padded to max_len_content columns."""
    sequences = tokenizer_content.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=max_len_content, padding='post')


# NOTE: this rebinds x_train / x_val / x_test from text to integer matrices, so
# the cell cannot be re-run without re-running the split first.
x_train, x_val, x_test = (encode_content(texts) for texts in (x_train, x_val, x_test))

et = time.time()
elapsed = et - st
print("Time taken: {:d} h {:d} min {:.2f} s".format(int(elapsed/3600), int((elapsed%3600)/60), (elapsed%3600)%60))

Time taken: 0 h 0 min 0.71 s


In [12]:
st = time.time()

# Build the summary vocabulary from the training targets only.
# NOTE(review): Tokenizer's default filters include '_' and the text is
# lower-cased, so a marker such as '_START_' is presumably stored as 'start' --
# confirm before looking the marker up in word_index at inference time.
tokenizer_summary = Tokenizer()
tokenizer_summary.fit_on_texts(y_train)


def encode_summary(texts):
    """Map summary strings to an integer matrix, post-padded to max_len_summary columns."""
    sequences = tokenizer_summary.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=max_len_summary, padding='post')


# NOTE: this rebinds y_train / y_val / y_test from text to integer matrices, so
# the cell cannot be re-run without re-running the split first.
y_train, y_val, y_test = (encode_summary(texts) for texts in (y_train, y_val, y_test))

et = time.time()
elapsed = et - st
print("Time taken: {:d} h {:d} min {:.2f} s".format(int(elapsed/3600), int((elapsed%3600)/60), (elapsed%3600)%60))

Time taken: 0 h 0 min 0.12 s


In [13]:
# Vocabulary sizes used for the embedding tables and the output softmax:
# number of words each tokenizer knows, plus one extra slot (index 0 is what
# pad_sequences fills with, and is not assigned to any word).
x_voc, y_voc = (len(tok.word_index) + 1 for tok in (tokenizer_content, tokenizer_summary))

## Encoder

In [14]:
# Width of every LSTM layer and of both embedding tables.
lstm_units = 500
embedding_units = 500

# Encoder: padded article token ids -> embeddings -> two stacked LSTMs.
encoder_input = Input(shape=(max_len_content,))

encoder_embedding = Embedding(x_voc, embedding_units, trainable=True, name="encoder_emb")(encoder_input)

# Both LSTMs return the full output sequence plus their final states.
encoder_lstm1 = LSTM(lstm_units, return_sequences=True, return_state=True, name="encoder_lstm1")
encoder_layer1, state_a1, state_c1 = encoder_lstm1(encoder_embedding)

# The second layer has to be applied through its own LSTM object. Calling
# encoder_lstm1 a second time here would reuse the first layer's weights and
# leave encoder_lstm2 out of the graph entirely.
encoder_lstm2 = LSTM(lstm_units, return_sequences=True, return_state=True, name="encoder_lstm2")
encoder_layer2, state_a2, state_c2 = encoder_lstm2(encoder_layer1)

## Decoder

In [15]:
# Decoder input: a variable-length sequence of summary token ids.
decoder_input = Input(shape=(None,))

# The embedding layer object is kept separately from its output tensor so the
# inference graph can call the same layer again.
decoder_embedding = Embedding(y_voc, embedding_units, trainable=True, name="decoder_emb")
decoder_emb_layer = decoder_embedding(decoder_input)

# The decoder LSTM starts from the final states of the second encoder layer.
# With return_state=True the two extra return values are the LSTM's final
# states; despite the _f/_b suffixes they are not forward/backward outputs.
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True, name="decoder_lstm")
decoder_layer, decoder_state_f, decoder_state_b = decoder_lstm(decoder_emb_layer, initial_state=[state_a2, state_c2])

# Per-time-step softmax over the summary vocabulary.
decoder_dense = TimeDistributed(Dense(y_voc, activation="softmax"))
decoder_output = decoder_dense(decoder_layer)

In [16]:
# Training model: (article ids, decoder input ids) -> per-step word probabilities.
model = Model(inputs=[encoder_input, decoder_input], outputs=[decoder_output])
# The sparse loss takes integer word ids as targets, so no one-hot encoding is needed.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
# Stop once val_loss has not improved for 5 consecutive epochs.
# NOTE(review): the fit call below uses epochs=2, so a patience of 5 can never
# trigger -- confirm which of the two values is intended.
early_stopping = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)

In [17]:
def shifted_pair(summaries):
    """Split a padded summary matrix into (decoder inputs, next-token targets).

    Inputs drop the last column; targets drop the first column and gain a
    trailing axis of size 1.
    """
    return summaries[:, :-1], summaries.reshape(-1, max_len_summary, 1)[:, 1:]


# Teacher forcing: at every step the decoder reads the reference token and is
# trained to predict the token that follows it.
train_decoder_in, train_targets = shifted_pair(y_train)
val_decoder_in, val_targets = shifted_pair(y_val)

history = model.fit(x=[x_train, train_decoder_in],
                    y=train_targets,
                    validation_data=([x_val, val_decoder_in], val_targets),
                    epochs=2,
                    callbacks=[early_stopping])

Epoch 1/2
Epoch 2/2


# Inference

In [18]:
# Stand-alone encoder: article token ids -> per-step outputs of the second
# encoder LSTM plus its two final states. It must be built BEFORE the names
# state_a2 / state_c2 are rebound further down in this cell.
encoder_model = Model(inputs=[encoder_input], outputs=[encoder_layer2, state_a2, state_c2])

# Placeholders that are fed at each decoding step.
# NOTE(review): inference_decoder_input (the encoder outputs) is not consumed
# by any layer in this cell -- presumably reserved for an attention step;
# confirm.
inference_decoder_input = Input(shape=(max_len_content, lstm_units))
decoder_input_a2 = Input(shape=(lstm_units,))
decoder_input_c2 = Input(shape=(lstm_units,))

# Reuse the trained decoder layers so inference shares their weights.
inference_decoder_emb = decoder_embedding(decoder_input)

# initial_state takes two separate state tensors. Passing decoder_input_a2
# twice would seed the second state with the first one and leave
# decoder_input_c2 disconnected from the graph.
# state_a2 / state_c2 are rebound here to the decoder's updated states; the
# next cell uses them as outputs of the inference model.
inference_decoder_layer, state_a2, state_c2 = decoder_lstm(inference_decoder_emb,
                                                           initial_state=[decoder_input_a2, decoder_input_c2])

inference_decoder_output = decoder_dense(inference_decoder_layer)

In [19]:
# Decoder model for inference. Inputs: decoder token ids, the encoder-output
# placeholder and the two LSTM state placeholders. Outputs: the word
# probabilities and the two updated LSTM states to feed back on the next step.
inference_model = Model(
    inputs=[decoder_input, inference_decoder_input, decoder_input_a2, decoder_input_c2],
    outputs=[inference_decoder_output, state_a2, state_c2],
)