In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, InputLayer, TimeDistributed
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

from sklearn.model_selection import train_test_split    

import string, re

In [105]:
summary = pd.read_csv("./dataset/news_summary.csv", encoding='iso-8859-1')
raw = pd.read_csv("./dataset/news_summary_more.csv", encoding='iso-8859-1')

In [106]:
config = {'min_text_len':40,
          'max_text_len':60,
          'max_summary_len':30,
          'latent_dim' : 300,
          'embedding_dim' : 200}

In [None]:
df = pd.concat([raw, summary]).reset_index(drop=True)

print(f'Before filtering: {df.shape}')
df = df.loc[((df['text'].str.split(" ").str.len()>config['min_text_len']) & (df['text'].str.split(" ").str.len()<config['max_text_len']))].reset_index(drop=True)
print(f'After filtering: {df.shape}')
df.columns

In [108]:
def text_strip(sentence):

  sentence = re.sub("(\\t)", " ", str(sentence)).lower()
  sentence = re.sub("(\\r)", " ", str(sentence)).lower()
  sentence = re.sub("(\\n)", " ", str(sentence)).lower()

  sentence = re.sub("(--+)", " ", str(sentence)).lower()

  sentence = re.sub("(\.\.+)", " ", str(sentence)).lower()

  sentence = re.sub(r"[<>()|&©ø\[\]\'\",;?~*!]", " ", str(sentence)).lower()
  sentence = re.sub(r"(\\x9\d)", " ", str(sentence)).lower()

  sentence = re.sub("([cC][mM]\d+)|([cC][hH][gG]\d+)", "CM_NUM", str(sentence)).lower()
  sentence = re.sub("(\.\s+)", " ", str(sentence)).lower()
  sentence = re.sub("(\-\s+)", " ", str(sentence)).lower()
  sentence = re.sub("(\:\s+)", " ", str(sentence)).lower()
  sentence = re.sub("(\s+)", " ", str(sentence)).lower()
  
  return sentence

In [109]:
df['cleaned_text'] = df.text.apply(lambda x: text_strip(x))
df['cleaned_headlines'] = df.headlines.apply(lambda x: '_START_ '+ text_strip(x) + ' _END_')
df['cleaned_headlines'] = df['cleaned_headlines'].apply(lambda x: 'sostok ' + x + ' eostok')

df = df[((df.cleaned_text.str.split().str.len()<=config['max_text_len']) & (df.cleaned_headlines.str.split().str.len()<=(config['max_summary_len']+4)))].copy()
df = df.reset_index(drop=True)

df = df.drop(['text', 'headlines'], axis=1)
df = df.rename(columns = {'cleaned_text':'text', 'cleaned_headlines':'summary'})
                                      

In [None]:

X = df['text']
Y = df['summary']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=42)

In [None]:
def get_rare_words(text_col):

  text_tokenizer = Tokenizer()    
  text_tokenizer.fit_on_texts(list(text_col))

  thresh = 5

  cnt = 0
  tot_cnt = 0

  for key, value in text_tokenizer.word_counts.items():
      tot_cnt = tot_cnt + 1
      if value < thresh:
          cnt = cnt + 1

  print("% of rare words in vocabulary:",(cnt / tot_cnt) * 100)
  
  return cnt, tot_cnt

x_train_cnt, x_train_tot_cnt = get_rare_words(text_col=x_train)
y_train_cnt, y_train_tot_cnt = get_rare_words(text_col=y_train)


In [112]:
x_tokenizer = Tokenizer(num_words=x_train_tot_cnt - x_train_cnt) 

x_tokenizer.fit_on_texts(list(x_train))

x_tr_seq = x_tokenizer.texts_to_sequences(x_train) 
x_val_seq = x_tokenizer.texts_to_sequences(x_test)

x_tr = pad_sequences(x_tr_seq,  maxlen=config['max_text_len'], padding='post')
x_val = pad_sequences(x_val_seq, maxlen=config['max_text_len'], padding='post')

x_voc = x_tokenizer.num_words + 1


In [113]:
y_tokenizer = Tokenizer(num_words=y_train_tot_cnt - y_train_cnt) 
y_tokenizer.fit_on_texts(list(y_train))

y_tr_seq = y_tokenizer.texts_to_sequences(y_train) 
y_val_seq = y_tokenizer.texts_to_sequences(y_test) 

y_tr = pad_sequences(y_tr_seq, maxlen=config['max_summary_len'], padding='post')
y_val = pad_sequences(y_val_seq, maxlen=config['max_summary_len'], padding='post')

y_voc = y_tokenizer.num_words + 1

In [114]:
config = {'min_text_len':40,
          'max_text_len':60,
          'max_summary_len':30,
          'latent_dim' : 300,
          'embedding_dim' : 200}


latent_dim = config['latent_dim']
embedding_dim = config['embedding_dim']
max_text_len = config['max_text_len']
max_summary_len = config['max_summary_len']

ENCODER

In [115]:

encoder_inputs = Input(shape=(max_text_len, ))

enc_emb = Embedding(input_dim = x_voc, output_dim = embedding_dim, trainable=True)(encoder_inputs)

encoder_lstm1 = LSTM(units = latent_dim, return_sequences=True,
                     return_state=True, dropout=0.4,
                     recurrent_dropout=0.4)

encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb)


encoder_lstm2 = LSTM(latent_dim, return_sequences=True,
                     return_state=True, dropout=0.4,
                     recurrent_dropout=0.4)

encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1) 

encoder_lstm3 = LSTM(latent_dim, return_state=True,
                     return_sequences=True, dropout=0.4,
                     recurrent_dropout=0.4)

encoder_outputs, state_h, state_c = encoder_lstm3(encoder_output2)

DECODER

In [116]:
decoder_inputs = Input(shape=(None, )) 

dec_emb_layer = Embedding(y_voc, embedding_dim, trainable=True)  
dec_emb = dec_emb_layer(decoder_inputs) 

decoder_lstm = LSTM(latent_dim, return_sequences=True,
                    return_state=True, dropout=0.4,
                    recurrent_dropout=0.2)

(decoder_outputs, decoder_fwd_state, decoder_back_state) = decoder_lstm(dec_emb, initial_state=[state_h, state_c]) 

decoder_dense = TimeDistributed(Dense(y_voc, activation='softmax'))
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [117]:
model.summary()

In [None]:
model.compile(optimizer='Adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model_name = "model.weights.h5"

save_model = tf.keras.callbacks.ModelCheckpoint(filepath=model_name,
                                                save_weights_only=True,
                                                save_best_only=True,
                                                verbose=1)

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)

history = model.fit(
    [x_tr, y_tr[:, :-1]],
    y_tr.reshape(y_tr.shape[0], y_tr.shape[1], 1)[:, 1:],
    epochs=70,
    callbacks=[es, save_model],
    batch_size=1024,
    validation_data=([x_val, y_val[:, :-1]],
                     y_val.reshape(y_val.shape[0], y_val.shape[1], 1)[:, 1:]),
    )

model.load_weights('./model.weights.h5')

In [119]:
encoder_model = Model(inputs=encoder_inputs, outputs=[encoder_outputs,
                      state_h, state_c])

decoder_state_input_h = Input(shape=(latent_dim, ))
decoder_state_input_c = Input(shape=(latent_dim, ))
decoder_hidden_state_input = Input(shape=(max_text_len, latent_dim))

dec_emb2 = dec_emb_layer(decoder_inputs)

(decoder_outputs2, state_h2, state_c2) = decoder_lstm(dec_emb2,
        initial_state=[decoder_state_input_h, decoder_state_input_c])

decoder_outputs2 = decoder_dense(decoder_outputs2)

decoder_model = Model([decoder_inputs] + [decoder_hidden_state_input,
                      decoder_state_input_h, decoder_state_input_c],
                      [decoder_outputs2] + [state_h2, state_c2])

In [120]:
def decode_sequence(input_seq):
 
    (e_out, e_h, e_c) = encoder_model.predict(input_seq, verbose=0)
    target_seq = np.zeros((1, 1))

    target_seq[0, 0] = target_word_index['sostok'] 

    stop_condition = False
    decoded_sentence = '' 

    while not stop_condition:
        (output_tokens, h, c) = decoder_model.predict([target_seq]
                + [e_out, e_h, e_c], verbose=0)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = reverse_target_word_index[sampled_token_index]

        if sampled_token != 'eostok':
            decoded_sentence += ' ' + sampled_token

        if sampled_token == 'eostok' or len(decoded_sentence.split()) \
            >= max_summary_len - 1:
            stop_condition = True

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        (e_h, e_c) = (h, c)

    return decoded_sentence

In [121]:
reverse_source_word_index = x_tokenizer.index_word
reverse_target_word_index = y_tokenizer.index_word
target_word_index = y_tokenizer.word_index

In [122]:
def seq2text(input_seq):

    newString = ''
    for i in input_seq:
        if i != 0: 
            newString = newString + reverse_source_word_index[i] + ' '

    return newString

def seq2summary(input_seq):

    newString = ''
    for i in input_seq:
        if (i != 0) and (i != target_word_index['sostok']) and (i != target_word_index['eostok']):
            newString = newString + reverse_target_word_index[i] + ' '

    return newString

In [None]:
actual = []
predicted = []
for i in range(0, 50):
    print ('Review:', seq2text(x_tr[i]))
    
    actual.append(seq2summary(y_tr[i]))
    print ('Original summary:', actual[-1])
    
    predicted.append(decode_sequence(x_tr[i].reshape(1, config['max_text_len'])))
    print ('Predicted summary:', predicted[-1])
    print()
prediction_df = pd.DataFrame({'Actual':actual, 'Predicted':predicted})

    


Review: a man has been caught taking videos of girls at the school arts festival organised in kerala s thrissur by cutting a hole into his slipper and fitting a phone camera in it the police who arrested the accused after noticing his suspicious movements said that he went through the crowds trying to take photos of women from below 
Original summary: start kerala man caught taking videos on camera end 
Predicted summary:  start man who stole up to remove hiv at delhi metro end

Review: the cbi on friday arrested the key accused in the 24 year old rss madras headquarters bomb blast from the outskirts of chennai the prime accused in the case mushtaq ahmed was absconding since the 1993 blast that claimed 11 lives ahmed had allegedly procured the explosive material for assembling the bomb and provided shelter to other accused persons 
Original summary: start cbi arrests prime accused in rss madras headquarters blast end 
Predicted summary:  start cbi arrests ex j k police officer for murd