The goal of this project is to implement seq2seq text summarization on CNN daily mail news dataset. 

Let's first connect our google drive with this colab notebook. 

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Then import all the necessary libaries.

In [2]:
#Importing all the packages required for project.
import numpy as np
import pandas as pd
import re
import string
import csv
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
#from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Dense, Input, Embedding, Concatenate, TimeDistributed, Bidirectional, GRU
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from keras.utils.vis_utils import plot_model

Load the data. We have taken CNN daily mail news data and this can be downloaded from Kaggle or from this [git repo](https://github.com/JafferWilson/Process-Data-of-CNN-DailyMail). 

In [3]:
training_data = pd.read_csv('/content/drive/My Drive/NLP/Project/cnnnews/train.csv')
testing_data = pd.read_csv('/content/drive/My Drive/NLP/Project/cnnnews/test.csv')

The data consist of 3 fields - id, articles and highlights. 
Lets drop the ID column.

In [4]:
#dropping the ID column from the training data.
training_data = training_data.drop(['id'], axis=1)
training_data = training_data.reset_index(drop=True)
#training_data.head()

#dropping the ID column from the testing data
testing_data = testing_data.drop(['id'], axis=1)
testing_data = testing_data.reset_index(drop=True)
#testing_data.head()

As the dataset is based on news from both cnn and daily mail, it consits of contarction words that are used by the journalists. We have ddefined some popular contraction words. 
Also for reference : [popular contarctions](https://www.sjsu.edu/writingcenter/docs/handouts/Contractions.pdf)

In [5]:
#contractions that are used in here.
combining_words_shortened_forms = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}


Now we clean the data and then trim it.

In [6]:
#function to clean the data and trim the data.
def cleaningdata(text, remove_stopwords=True):
    text = text.lower()
    text = text.split()
    tmp = []
    for word in text:
        if word in combining_words_shortened_forms:
            tmp.append(combining_words_shortened_forms[word])
        else:
            tmp.append(word)
    text = ' '.join(tmp)
    
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text) 
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    
    if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words('english'))
        text = [w for w in text if w not in stops]
        text = ' '.join(text)
        
    return text


In [7]:
# importing the required modules such as Stopwords.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

clean_summary_list = []
for summary in training_data.highlights:
    clean_summary_list.append(cleaningdata(summary, remove_stopwords=False))
#print('Cleaning Summaries Complete')
    
cleaned_text_list = []
for text in training_data.article:
    cleaned_text_list.append(cleaningdata(text))
#print('Cleaning Texts Complete')

#deleting the trained data.
del training_data

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
clean_df = pd.DataFrame()
clean_df['text'] = cleaned_text_list[:110000]
clean_df['summary'] = clean_summary_list[:110000]
clean_df['summary'].replace('', np.nan, inplace=True)
clean_df.dropna(axis=0, inplace=True)

clean_df['summary'] = clean_df['summary'].apply(lambda x: '<sostok>' + ' ' + x + ' ' + '<eostok>')
del cleaned_text_list
del clean_summary_list

In [9]:
train_x, test_x, train_y, test_y = train_test_split(clean_df['text'], clean_df['summary'], test_size=0.1, random_state=0)
del clean_df

Tokenizing the data


In [10]:
s_tokenizer = Tokenizer()
s_tokenizer.fit_on_texts(list(train_y))

thresholdvalue = 6
count = 0
complete_count = 0
frequency = 0
complete_frequency = 0

for key, value in s_tokenizer.word_counts.items():
    complete_count += 1
    complete_frequency += value
    if value < thresholdvalue:
        count += 1
        frequency += value


In [11]:
s_max_features = complete_count-count

In [12]:
t_tokenizer = Tokenizer()
t_tokenizer.fit_on_texts(list(train_x))

thresholdvalue = 4
count = 0
complete_count = 0
frequency = 0
complete_frequency = 0

for key, value in t_tokenizer.word_counts.items():
    complete_count += 1
    complete_frequency += value
    if value < thresholdvalue:
        count += 1
        frequency += value



In [13]:
t_max_features = complete_count - count

In [14]:
maximumlength_text = 800
maximumlength_summary = 150

Validation data


In [15]:
val_x = test_x
t_tokenizer = Tokenizer(num_words=t_max_features)
t_tokenizer.fit_on_texts(list(train_x))
train_x = t_tokenizer.texts_to_sequences(train_x)
val_x = t_tokenizer.texts_to_sequences(val_x)

train_x = pad_sequences(train_x, maxlen=maximumlength_text, padding='post')
val_x = pad_sequences(val_x, maxlen=maximumlength_text, padding='post')

In [16]:
val_y = test_y
s_tokenizer = Tokenizer(num_words=s_max_features)
s_tokenizer.fit_on_texts(list(train_y))
train_y = s_tokenizer.texts_to_sequences(train_y)
val_y = s_tokenizer.texts_to_sequences(val_y)

train_y = pad_sequences(train_y, maxlen=maximumlength_summary, padding='post')
val_y = pad_sequences(val_y, maxlen=maximumlength_summary, padding='post')

In [17]:
print("Training Sequence", train_x.shape)
print('Target Values Shape', train_y.shape)
print('Test Sequence', val_x.shape)
print('Target Test Shape', val_y.shape)

Training Sequence (917, 800)
Target Values Shape (917, 150)
Test Sequence (102, 800)
Target Test Shape (102, 150)


Embedding

In [18]:
embeding_index = {}
embed_dim = 100
with open('/content/drive/My Drive/NLP/Project/cnnnews/glove.6B.100d.txt', 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeding_index[word] = coefs

In [19]:
t_embed = np.zeros((t_max_features, embed_dim))
for word, i in t_tokenizer.word_index.items():
    vec = embeding_index.get(word)
    if i < t_max_features and vec is not None:
        t_embed[i] = vec

In [20]:
s_embed = np.zeros((s_max_features, embed_dim))
for word, i in s_tokenizer.word_index.items():
    vec = embeding_index.get(word)
    if i < s_max_features and vec is not None:
        s_embed[i] = vec

In [21]:
del embeding_index

Defining the model

In [22]:
latent_dim = 128

# Encoder
encoder_input = Input(shape=(maximumlength_text, ))
encoder_embed = Embedding(t_max_features, embed_dim, input_length=maximumlength_text, weights=[t_embed], trainable=False)(encoder_input)

encoder_lstm = Bidirectional(LSTM(latent_dim, return_state=True))
encoder_output, enc_fh, enc_fc, enc_bh, enc_bc = encoder_lstm(encoder_embed)
enc_h = Concatenate(axis=-1, name='enc_h')([enc_fh, enc_bh])
enc_c = Concatenate(axis=-1, name='enc_c')([enc_fc, enc_bc])

#Decoder
decoder_input = Input(shape=(None, ))
decoder_embed = Embedding(s_max_features, embed_dim, weights=[s_embed], trainable=False)(decoder_input)
decoder_lstm = LSTM(latent_dim*2, return_sequences=True, return_state=True, dropout=0.3, recurrent_dropout=0.2)
decoder_outputs, _, _ = decoder_lstm(decoder_embed, initial_state=[enc_h, enc_c])

dec_dense = TimeDistributed(Dense(s_max_features, activation='softmax'))
dec_output = dec_dense(decoder_outputs)

model = Model([encoder_input, decoder_input], dec_output)
#model.summary()





Compile the model

In [23]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop')
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)
model.fit([train_x, train_y[:, :-1]], train_y.reshape(train_y.shape[0], train_y.shape[1], 1)[:, 1:], epochs=10, callbacks=[early_stop], batch_size=128, verbose=2, validation_data=([val_x, val_y[:, :-1]], val_y.reshape(val_y.shape[0], val_y.shape[1], 1)[:, 1:]))

Epoch 1/10
8/8 - 25s - loss: 4.0915 - val_loss: 1.7080 - 25s/epoch - 3s/step
Epoch 2/10
8/8 - 7s - loss: 1.8316 - val_loss: 1.6674 - 7s/epoch - 844ms/step
Epoch 3/10
8/8 - 6s - loss: 1.8080 - val_loss: 1.7856 - 6s/epoch - 695ms/step
Epoch 4/10
8/8 - 5s - loss: 1.8150 - val_loss: 1.6816 - 5s/epoch - 685ms/step
Epoch 4: early stopping


<keras.callbacks.History at 0x7f1a4ce46940>

Initializing and calling the model

In [24]:
encoder_model = Model(inputs=encoder_input, outputs=[enc_h, enc_c])

dec_init_state_h = Input(shape=(latent_dim*2, ))
dec_init_state_c = Input(shape=(latent_dim*2, ))

dec_out, dec_h, dec_c = decoder_lstm(decoder_embed, initial_state=[dec_init_state_h, dec_init_state_c])
dec_final = dec_dense(dec_out)

decoder_model = Model([decoder_input]+[dec_init_state_h, dec_init_state_c], [dec_final]+[dec_h, dec_c])

Defining a function for summary generation using the above defined model

In [25]:
def summary_generator(input_seq):
    h, c = encoder_model.predict(input_seq)
    
    upcomming_token = np.zeros((1, 1))
    upcomming_token[0, 0] = s_tokenizer.word_index['sostok']
    output_seq = ''
    
    stop = False
    count = 0
    
    while not stop:
        if count > 100:
            break
        decoder_out, state_h, state_c = decoder_model.predict([upcomming_token]+[h, c])
        token_idx = np.argmax(decoder_out[0, -1, :])
        
        if token_idx == s_tokenizer.word_index['eostok']:
            stop = True
        elif token_idx > 0 and token_idx != s_tokenizer.word_index['sostok']:
            token = s_tokenizer.index_word[token_idx]
            output_seq = output_seq + ' ' + token
        
        upcomming_token = np.zeros((1, 1))
        upcomming_token[0, 0] = token_idx
        h, c = state_h, state_c
        count += 1
        
    return output_seq

In [26]:
testing_inputs = [cleaningdata(sent) for sent in testing_data.article]
testing_inputs = t_tokenizer.texts_to_sequences(list(testing_inputs))
testing_inputs = pad_sequences(testing_inputs, maxlen=maximumlength_text, padding='post')


Now, saving the summaries to a file named results.

In [27]:
datas = []
with open('/content/drive/My Drive/NLP/Project/cnnnews/result.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['Article', 'Original Summary', 'Model Output'])
    for i in range(500):
        our_summ = summary_generator(testing_inputs[i].reshape(1, maximumlength_text))
        datas.append(our_summ)
        writer.writerow([testing_data.article[i], testing_data.highlights[i], our_summ])

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


The summaries for the articles have been stored in results.csv file. 
