## Load Libraries

In [1]:
#!pip install -q ktext
#!pip install -q annoy

In [2]:
import json
from urllib.request import urlopen

%matplotlib inline
from matplotlib import pyplot as plt

#from annoy import AnnoyIndex
from keras import optimizers
from keras.layers import Input, Dense, LSTM, GRU, Embedding, Lambda, BatchNormalization, Bidirectional
from keras.models import load_model, Model
from keras import optimizers
from keras.preprocessing.sequence import pad_sequences
from keras.utils import get_file, to_categorical
from keras.callbacks import TensorBoard, ModelCheckpoint, CSVLogger
from ktext.preprocess import processor
import numpy as np
import pandas as pd
import random
from tqdm import tqdm
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Load Data

In [3]:
# News Dataset
news = pd.read_csv("F:/HOYA/590/Project/Data_All/articles20k_input.csv", encoding='latin-1')
news.columns = ['title', 'body']

In [4]:
# News Dataset
news = pd.read_csv("F:/HOYA/590/Project/Data_All/articles20k_input.csv", encoding='utf-8')
news.columns = ['title', 'body']

In [5]:
traindf, testdf = train_test_split(news, test_size=.010)
print(traindf.shape)
print(testdf.shape)

(19800, 2)
(200, 2)


In [6]:
source_docs = list(traindf.body)
target_docs = list(traindf.title)

In [7]:
test_source_docs = list(testdf.body)
test_target_docs = list(testdf.title)

## Sequence to Sequence Model

In [8]:
print('source: ', source_docs[4])
print('target: ', target_docs[4])

source:  It has been a rough year. By now, our violence is down to a pattern, and there is a choreography to our reactions. A killer seeks out a nightclub, a church, an airport, a courthouse, a protest. Someone is shot on video, sometimes by the police, and marchers fill the streets. An attack is carried out in France, America, Turkey, Bangladesh, Lebanon, Tunisia, Nigeria, and then claimed and celebrated by a radical terror group. Our phones vibrate with news alerts. The talking heads fill air over cable news captions that shout âbreaking newsâ in red. Rumors and misinformation abound. The comments erupt on Twitter, Facebook and news sites. Journalists create multimedia stories that focus on videos, photos and graphic accounts from victims and witnesses. The experts give interviews, and the latest tools of immediacy are put to use. After thedeadly terror attack in Nice, France, The Times invited grief counselors to be interviewed on Facebook Live. Within days, attention had turned

### Pre-Processing

In [9]:
source_proc = processor(hueristic_pct_padding=.7, keep_n=8000)
source_vecs = source_proc.fit_transform(source_docs)

 See full histogram by insepecting the `document_length_stats` attribute.


In [10]:
print('\noriginal string:\n', source_docs[0], '\n')
print('after pre-processing:\n', source_vecs[0], '\n')


original string:
 PHILADELPHIA  â   Forty times, city or state governments had proposed taxes on sugary soft drinks, failing each time. Then, in 2014, liberal Berkeley, Calif. passed such a tax, but most people saw it as an aberration. Several measures, including one in New York, never won much support. But on Thursday, a measure to tax sweetened drinks passed in Philadelphia, one of the countryâs largest cities  â   and also one of its poorest. Indeed, raising revenue was the winning argument in Philadelphia. Jim Kenney, the mayor, took a different tack from that of politicians who have tried and failed to pass   taxes. He didnât talk about the tax as a   measure designed to discourage   soft drinks. And he didnât promise to earmark the proceeds for health programs. Instead, he cast the soft drink industry as a tantalizing revenue source that could be tapped to fund popular city programs, including universal prekindergarten. âThis is the beginning of a process of changing

    2 3197    4    2  747  240   35  970   11   15] 



In [11]:
%reload_ext autoreload
%autoreload 2
from ktext.preprocess import processor

In [12]:
target_proc = processor(append_indicators=True, hueristic_pct_padding=.7, keep_n=4500, padding ='post')
target_vecs = target_proc.fit_transform(target_docs)

 See full histogram by insepecting the `document_length_stats` attribute.


In [13]:
encoder_input_data = source_vecs
encoder_seq_len = encoder_input_data.shape[1]

decoder_input_data = target_vecs[:, :-1]
decoder_target_data = target_vecs[:, 1:]

num_encoder_tokens = max(source_proc.id2token.keys()) + 1
num_decoder_tokens = max(target_proc.id2token.keys()) + 1

### Encoder Model

In [14]:
word_emb_dim=512
hidden_state_dim=1024
encoder_seq_len=encoder_seq_len
num_encoder_tokens=num_encoder_tokens
num_decoder_tokens=num_decoder_tokens

#arbitrarly set latent dimension for embedding and hidden units
latent_dim = 512

In [15]:
encoder_inputs = Input(shape=(encoder_seq_len,), name='Encoder-Input')

# Word embeding for encoder (ex: Issue Body)
x = Embedding(num_encoder_tokens, latent_dim, name='Body-Word-Embedding', mask_zero=False)(encoder_inputs)
x = BatchNormalization(name='Encoder-Batchnorm-1')(x)

# Intermediate GRU layer (optional)
#x = GRU(latent_dim, name='Encoder-Intermediate-GRU', return_sequences=True)(x)
#x = BatchNormalization(name='Encoder-Batchnorm-2')(x)

# We do not need the `encoder_output` just the hidden state.
_, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU', return_sequences=True)(x)

# Encapsulate the encoder as a separate entity so we can just 
#  encode without decoding if we want to.
encoder_model = Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')

seq2seq_encoder_out = encoder_model(encoder_inputs)

In [16]:
encoder_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Encoder-Input (InputLayer)   (None, 920)               0         
_________________________________________________________________
Body-Word-Embedding (Embeddi (None, 920, 512)          4097024   
_________________________________________________________________
Encoder-Batchnorm-1 (BatchNo (None, 920, 512)          2048      
_________________________________________________________________
Encoder-Last-GRU (GRU)       [(None, 920, 512), (None, 1574400   
Total params: 5,673,472
Trainable params: 5,672,448
Non-trainable params: 1,024
_________________________________________________________________


### Decoder Model

In [17]:
decoder_inputs = Input(shape=(None,), name='Decoder-Input')  # for teacher forcing

# Word Embedding For Decoder (ex: Issue Titles)
dec_emb = Embedding(num_decoder_tokens, latent_dim, name='Decoder-Word-Embedding', mask_zero=False)(decoder_inputs)
dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

# Set up the decoder, using `decoder_state_input` as initial state.
decoder_gru = GRU(latent_dim, return_state=True, return_sequences=True, name='Decoder-GRU')
decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out)
x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

# Dense layer for prediction
decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='Final-Output-Dense')
decoder_outputs = decoder_dense(x)

### Seq2Seq Model

In [18]:
seq2seq_model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
seq2seq_model.compile(optimizer=optimizers.Nadam(lr=0.001), loss='sparse_categorical_crossentropy')

In [19]:
seq2seq_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Decoder-Input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Decoder-Word-Embedding (Embeddi (None, None, 512)    2305024     Decoder-Input[0][0]              
__________________________________________________________________________________________________
Encoder-Input (InputLayer)      (None, 920)          0                                            
__________________________________________________________________________________________________
Decoder-Batchnorm-1 (BatchNorma (None, None, 512)    2048        Decoder-Word-Embedding[0][0]     
__________________________________________________________________________________________________
Encoder-Mo

### Training

In [20]:
script_name_base = 'seq2seq_v2'
csv_logger = CSVLogger('{:}.log'.format(script_name_base))
model_checkpoint = ModelCheckpoint('{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(script_name_base),
                                   save_best_only=True)

batch_size = 1200
epochs = 7
history = seq2seq_model.fit([encoder_input_data, decoder_input_data], np.expand_dims(decoder_target_data, -1),
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.12, callbacks=[csv_logger, model_checkpoint])

Train on 17424 samples, validate on 2376 samples
Epoch 1/7


ResourceExhaustedError: OOM when allocating tensor with shape[1200,512] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
	 [[Node: Encoder-Model/Encoder-Last-GRU/while/mul_1 = Mul[T=DT_FLOAT, _class=["loc:@train...ad/Reshape"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](Encoder-Model/Encoder-Last-GRU/while/mul/x, Encoder-Model/Encoder-Last-GRU/while/add_3)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


In [None]:
#save model
seq2seq_model.save('seq2seq_model.h5')

### Extract Encoder and Decoder Models

In [None]:
def extract_decoder_model(model):
    latent_dim = model.get_layer('Encoder-Model').output_shape[-1]
    decoder_inputs = model.get_layer('Decoder-Input').input
    dec_emb = model.get_layer('Decoder-Word-Embedding')(decoder_inputs)
    dec_bn = model.get_layer('Decoder-Batchnorm-1')(dec_emb)
    gru_inference_state_input = Input(shape=(latent_dim,), name='hidden_state_input')
    gru_out, gru_state_out = model.get_layer('Decoder-GRU')([dec_bn, gru_inference_state_input])
    dec_bn2 = model.get_layer('Decoder-Batchnorm-2')(gru_out)
    dense_out = model.get_layer('Final-Output-Dense')(dec_bn2)
    decoder_model = Model([decoder_inputs, gru_inference_state_input], [dense_out, gru_state_out])
    return decoder_model

In [None]:
encoder_model = seq2seq_model.get_layer('Encoder-Model')
for layer in encoder_model.layers:
    layer.trainable = False

decoder_model = extract_decoder_model(seq2seq_model)
decoder_model.summary()

In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Training and Validation Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Training', 'Validation'], loc='upper right')
plt.show()

### Predict code descriptions using the trained sequence-to-sequence model

In [None]:
col_names =  ['body', 'actual_title', 'generated_title']
resultDF  = pd.DataFrame(columns = col_names)
test_idx = list(range(1, len(test_source_docs)))
max_len = target_proc.padding_maxlen

for i in test_idx:
  
  raw_input_text = test_source_docs[i]
  raw_tokenized = source_proc.transform([raw_input_text])
  encoding = encoder_model.predict(raw_tokenized)
  original_encoding = encoding
  state_value = np.array(target_proc.token2id['_start_']).reshape(1, 1)
  
  decoded_sentence = []
  stop_condition = False
  while not stop_condition:
    preds, st = decoder_model.predict([state_value, encoding])
    pred_idx = np.argmax(preds[:, :, 2:]) + 2
    pred_word_str = target_proc.id2token[pred_idx]

    if pred_word_str == '_end_' or len(decoded_sentence) >= max_len:
        stop_condition = True
        break
    decoded_sentence.append(pred_word_str)
    
    # update the decoder for the next word
    encoding = st
    state_value = np.array(pred_idx).reshape(1, 1)
    
  pred_sent = ' '.join(decoded_sentence)
    
  
  # print
  print('\n\n==============================================')
  print(f'============== News # {i} =================\n')
  print('News Body:\n------------------------\n', raw_input_text)
  print('\nNews Headline:\n------------------------\n', test_target_docs[i])
  print('\nMachine-generated Headline:\n------------------------\n', pred_sent)
    
    
  # append to dataframe
  tmpDF = pd.DataFrame({"body": [raw_input_text], 
                        "actual_title": [test_target_docs[i]], 
                        "generated_title": [pred_sent]})
  resultDF = pd.concat([resultDF, tmpDF], ignore_index=True)

  
resultDF.to_csv('News.csv', encoding='latin-1')