# Install packages

In [None]:
#!pip3 install tensorflow==1.10
#!pip3 install tensorflow-gpu==1.10.0
#!pip3 install nltk
#!pip3 install numpy
#!pip3 install pandas
#!pip3 install gensim

# Import dependencies

In [None]:
import nltk
from os import listdir
import time
from nltk.corpus import wordnet
import re
import pickle
from gensim.models.keyedvectors import KeyedVectors
import numpy as np
import pandas as pd
from pathlib import Path
import tensorflow as tf
from tensorflow.python.layers.core import Dense

# Config & Hyper Parameters

In [95]:
#base_path = 'G:\\AI\\data\\cnn\\'
base_path = 'data\\'
path = base_path + 'sample_5k\\'
#path = base_path + 'stories\\'
articles_pickle_filename = "articles.pickle"
headlines_pickle_filename = "headlines.pickle"

''' https://fasttext.cc/docs/en/english-vectors.html 
    or https://www.kaggle.com/yesbutwhatdoesitmean/wikinews300d1mvec'''
model_path = 'model\\'
model_org_filename = 'wiki-news-300d-1M.vec'
model_pickle_filename = "model.pickle"
word_embedding_matrix_filename = "word_embedding_matrix.pickle"

# to avoid words that are used less that threshold value
threshold = 2

# Dimension size as per pre-trained data
embedding_dim = 300
max_text_length = 500
max_summary_length = 20
min_length = 2
unk_text_limit = 100

# Set the Hyperparameters
epochs = 100
batch_size = 32
rnn_size = 256
num_layers = 2
learning_rate = 0.005
keep_probability = 0.75
beam_width = 3

# Training Hyperparameters
start = 0
learning_rate_decay = 0.95
min_learning_rate = 0.0005
display_step = 10  # Check training loss after every 10 batches
stop_early = 0
stop = 3  # If the update loss does not decrease in 3 consecutive update checks, stop training
per_epoch = 3  # Make 3 update checks per epoch


# Stopword list and Initialize Lemmatizer

In [None]:
nltk.download('stopwords')

stop_words = nltk.corpus.stopwords.words('english')
lmtzr = nltk.WordNetLemmatizer().lemmatize


# Read files and load into memory

In [None]:
def load_files(filename):
    # open the file as read only
    file = open(filename, encoding='utf-8')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# Split a document into news article body and headlines

In [None]:
def split_data(doc):
    # find first headlines
    index = doc.find('@highlight')
    # split into story and headlines
    article, headlines = doc[:index], doc[index:].split('@highlight')
    # strip extra white space around each highlight
    headlines = [h.strip() for h in headlines if len(h) > 0]
    return article, headlines

# Clean a list of lines
This section is used to remove unwanted words and return cleaned articles and headlines.

In [None]:
def clean_text(lines, remove_stopwords=True):
    
    cleaned = list()
    for line in lines:
        # strip source cnn office if it exists
        index = line.find('(CNN)  -- ')
        if index > -1:
            line = line[index + len('(CNN)'):]
        else:
            index = line.find('(CNN)')
            if index > -1:
                line = line[index + len('(CNN)'):]

        # tokenize on white space
        line = line.split()

        # convert to lower case
        line = [word.lower() for word in line]

        # Optionally, remove stop words
        if remove_stopwords:
            line = [w for w in line if w not in stop_words]

        # remove punctuation from each token
        #line = [w.translate(table) for w in line]

        # remove tokens with numbers in them
        line = [word for word in line if word.isalpha()]

        # Format words and remove unwanted characters
        text = " ".join(line)
        text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
        text = re.sub(r'\<a href', ' ', text)
        text = re.sub(r'&amp;', '', text)
        text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
        text = re.sub(r'<br />', ' ', text)
        text = re.sub(r'\'', ' ', text)

        # remove empty strings
        if len(text )> 0 :
            cleaned.append(text)

    return cleaned

# Normalization of data using Lemmatization
Lemmatization is used as it returns better words choice than stemming as Lemmatization returns only valid dictionary(wordnet) words. Trade is it takes more time.

In [None]:
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def normalize_text(text):
    cleaned = list()

    for line in text :
        word_pos = nltk.pos_tag(nltk.word_tokenize(line))
        lemm_words = [lmtzr(sw[0], get_wordnet_pos(sw[1])) for sw in word_pos]

        word = [x.lower() for x in lemm_words]
        cleaned.append(' '.join(word))

    return cleaned

# Load all stories in a directory
This is used to load and clean the learn and test dataset. After cleaning data it returns two list cleaned articles and cleaned headlines.

In [None]:
def load_stories(location):
    stories = list()
    file_list = listdir(location)
    total_files = len (file_list)
    count = 0
    print ("Total Files : {total_files}".format(total_files= total_files))
    clean_articles = []
    clean_headlines = []
    for name in file_list:
        count = count + 1
        filename = location + '/' + name
        # load document
        print('Loading  - {filename}, files number  - {count},  out of - {total_files}'
              .format(filename=filename, count=count, total_files=total_files))
        doc = load_files(filename)
        # split into story and highlights
        article, headlines = split_data(doc)
        # store
        #stories.append({'article': article, 'headlines' : headlines})

        article = clean_text(article.split('\n'))
        article = normalize_text(article)
        clean_articles.append(' '.join(article))
        headlines = clean_text(headlines, remove_stopwords=False)
        headlines = normalize_text(headlines)
        clean_headlines.append(' '.join(headlines))

    return clean_articles, clean_headlines

# Main Program
Start point of data cleaning, once the articles and headlines are cleaned, they dumped so that can be reused for vectorization and then running model directly. This is becasue cleaning is an expensive operation in terms of time and resources. 

In [None]:
def main():
    start = time.perf_counter()
    clean_articles, clean_headlines = load_stories(path)
    print("Total Articles  : {len_articles} , Total Headlines : {len_headlines}- Time Taken : {time_taken}"
          .format(len_articles=len(clean_articles), len_headlines =
                  len(clean_headlines), time_taken = (time.perf_counter()-start)/60))

    print ("Serialization of articles")
    # Store Articles (serialize)
    with open(base_path + articles_pickle_filename, 'wb') as handle:
        pickle.dump(clean_articles, handle, protocol=pickle.HIGHEST_PROTOCOL)

    print("Serialization of headlines")
    # Store Articles (serialize)
    with open(base_path + headlines_pickle_filename, 'wb') as handle:
        pickle.dump(clean_headlines, handle, protocol=pickle.HIGHEST_PROTOCOL)


'''-------------------------main------------------------------'''
main()


# Load Pre-trained English word embedding

This is used to load pre-trained english word embedding 'fast text' provided facebook. First it checks if pre-trained model dump already exists, if not it load model and put in it dump. Dump is created becasue it loads faster than actual word embedding model.
https://fasttext.cc/docs/en/english-vectors.html

In [None]:
def create_or_load_model():
    model_pickle = Path(model_path + model_pickle_filename)
    if model_pickle.exists():
        print("Loading Pre-Trained Model Pickle..... ")
        start = time.perf_counter()
        with open(model_path + model_pickle_filename, 'rb') as handle:
            model = pickle.load(handle)
        print("Loaded Pre-Trained Model Pickle, time taken", ((time.perf_counter() - start) / 60))
    else:
        print("Loading Pre-Trained Model  ..... ")
        start = time.perf_counter()
        model = KeyedVectors.load_word2vec_format(model_path + model_org_filename, binary=False)
        with open(base_path + model_pickle_filename, 'wb') as handle:
            pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print("Loaded Pre-Trained Model, time taken", ((time.perf_counter() - start) / 60))
    return model


# count_words

This is a utility method used to count how many times a word is used. 

In [None]:
def count_words(count_dict, text):
    ''' Count the number of occurrences of each word in a set of text'''
    for sentence in text:
        for word in sentence.split():
            if word not in count_dict:
                count_dict[word] = 1
            else:
                count_dict[word] += 1

# vectorization

This is used to get word embedding for each word from pre-trained model

In [None]:
def vectorization(text, embeddings_index, model):
    for sentence in text:
        try:
            for vocab_word in sentence.split():
                embeddings_index[vocab_word] = model[vocab_word]              
                # print("Work : {vocab_word} , vector value : {vector_value}".
                #format(vocab_word=vocab_word, vector_value =vector_value))
        except KeyError:
            '''ignore'''
            # print("{vocab_word} not in vocabulary".format(vocab_word=vocab_word))

# missing_word_ratio

Find the number of words that are missing from CN, and are used more than our threshold.

In [None]:
def missing_word_ratio(word_counts, embeddings_index):
    ''' Find the number of words that are missing from CN, and are used more than our threshold.'''
    missing_words_count = 0
    missing_words = list()

    for word, count in word_counts.items():
        if word not in embeddings_index and word not in missing_words and count > threshold:
            missing_words_count += 1
            missing_words.append(word)
            # print("{word} is missing ".format(word=word))

    missing_ratio = round(missing_words_count / len(word_counts), 4) * 100
    return missing_ratio, missing_words_count

# covert_vocab_to_int

This is used to covert each word in training set to word vector. This is important as ML algorithm can only understand numbers. This integer representation of word is later passed encoder for word processing.

In [None]:
def covert_vocab_to_int(word_counts, embeddings_index):
    # dictionary to convert words to integers
    vocab_to_int = {}

    value = 0
    for word, count in word_counts.items():
        if count > threshold or word in embeddings_index:
            vocab_to_int[word] = value
            value += 1

    # Special tokens that will be added to our vocab
    codes = ["<UNK>", "<PAD>", "<EOS>", "<GO>"]

    # Add codes to vocab
    for code in codes:
        vocab_to_int[code] = len(vocab_to_int)

    # Dictionary to convert integers to words
    int_to_vocab = {}
    for word, value in vocab_to_int.items():
        int_to_vocab[value] = word

    usage_ratio = round(len(vocab_to_int) / len(word_counts), 4) * 100

    print("Total number of unique words:", len(word_counts))
    print("Number of words we will use:", len(vocab_to_int))
    print("Percent of words we will use: {}%".format(usage_ratio))

    return vocab_to_int

# create_combine_word_matrix

Need to use 300 for embedding dimensions to match corpus(input data) vectors.
This will return cobine matriz that would have 'embeddings_index' for from pre-trained word embedding plus 
random embedding generated for words missing in pre-trained word embedding.

In [None]:
def create_combine_word_matrix(vocab_to_int, embeddings_index):
    
    nb_words = len(vocab_to_int)
    # Create matrix with default values of zero
    word_embedding_matrix = np.zeros((nb_words, embedding_dim), dtype=np.float32)
    for word, i in vocab_to_int.items():
        if word in embeddings_index:
            word_embedding_matrix[i] = embeddings_index[word]
        else:
            # If word not in CN, create a random embedding for it
            new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
            embeddings_index[word] = new_embedding
            word_embedding_matrix[i] = new_embedding

    # Check if value matches len(vocab_to_int)
    print("word_embedding_matrix length : ", len(word_embedding_matrix))
    return word_embedding_matrix

# Finding unknown words

Convert words in text to an integer. If word is not in vocab_to_int, use UNK's integer.
Total the number of words and UNKs. Add EOS token to the end of texts.

In [None]:
def convert_to_ints(text, vocab_to_int, eos=False):    
    ints = []
    word_count = 0
    unk_count = 0
    for sentence in text:
        sentence_ints = []
        for word in sentence.split():
            word_count += 1
            if word in vocab_to_int:
                sentence_ints.append(vocab_to_int[word])
            else:
                sentence_ints.append(vocab_to_int["<UNK>"])
                # print("UNK Word : ", word)
                unk_count += 1
        if eos:
            sentence_ints.append(vocab_to_int["<EOS>"])
        ints.append(sentence_ints)

    unk_percent = round(unk_count / word_count, 4) * 100

    print("Total number of words : ", word_count)
    print("Total number of UNKs : ", unk_count)
    print("Percent of words that are UNK: {}%".format(unk_percent))

    return ints, word_count, unk_count


def create_dataFrame(text):
    '''Create a data frame of the sentence lengths from a text'''
    lengths = []
    for sentence in text:
        lengths.append(len(sentence))
    return pd.DataFrame(lengths, columns=['counts'])


def unk_counter(sentence, vocab_to_int):
    '''Counts the number of time UNK appears in a sentence.'''
    unk_count = 0
    for word in sentence:
        if word == vocab_to_int["<UNK>"]:
            unk_count += 1
    return unk_count

# Sorting training dataset

Sort the summaries and texts by the length of the texts, shortest to longest. 
This is required so that batch provided to tensorflow will have lesser padding as sentences would be of same size.
Limit the length of summaries and texts based on the min and max ranges. This is to avoid out of range data.
Remove reviews that include too many UNKs as they would not provide much of learning experience.

In [None]:
def sort_corplus(lengths_articles, int_rep_articles, int_rep_headlines, vocab_to_int):

    sorted_articles = []
    sorted_headlines = []
    #max_text_length = max_text_length
    #max_summary_length = max_summary_length
    #min_length = config.min_length
    #unk_text_limit = config.unk_text_limit
    unk_summary_limit = 0

    for count, words in enumerate(int_rep_articles):
        if (len(int_rep_articles[count]) >= min_length and len(int_rep_articles[count]) <= max_text_length
            and unk_counter(int_rep_headlines[count], vocab_to_int) <= unk_summary_limit and
                    unk_counter(int_rep_articles[count], vocab_to_int) <= unk_text_limit):
            sorted_headlines.append(int_rep_headlines[count])
            sorted_articles.append(int_rep_articles[count])

    # Compare lengths to ensure they match
    print(len(sorted_headlines))
    print(len(sorted_articles))

    return sorted_articles, sorted_headlines

# Create input for Tensorflow graph

For using tensorflow we need to provide below input paramters and create_input_for_graph() is used to generate these variables.

clean_articles -> articles after removing impurities

sorted_articles -> articles sorted as the thr length

sorted_headline -> headlines (sorted as per article length) as the thr length

vocab_to_int -> interger values of all vocab words

word_embedding_matrix -> 300 dim matrix for each word in vocab

In [None]:
def create_input_for_graph():
    # Load data (deserialize)
    with open(base_path + articles_pickle_filename, 'rb') as handle:
        clean_articles = pickle.load(handle)

    with open(base_path + headlines_pickle_filename, 'rb') as handle:
        clean_headlines = pickle.load(handle)

    pre_trained_model = create_or_load_model()

    word_counts = {}
    print("counting  Articles")
    count_words(word_counts, clean_articles)
    print("counting  Headlines")
    count_words(word_counts, clean_headlines)

    print("Total Stories : ", len(clean_headlines))
    print("Size of Vocabulary:", len(word_counts))

    print("creating embedding index .....")
    embeddings_index = {};
    vectorization(clean_articles, embeddings_index, pre_trained_model)
    vectorization(clean_headlines, embeddings_index, pre_trained_model)
    print('Word embeddings:', len(embeddings_index))

    # find out missing words and thr %
    missing_ratio, missing_words_count = missing_word_ratio(word_counts, embeddings_index)

    print("Number of words missing :", missing_words_count)
    print("Percent of words that are missing from vocabulary: {}%".format(missing_ratio))

    '''dictionary to convert words to integers - This is to found total words count that 
    we get from aur corpus(input date) and out of that what % of words we would be using. 
    This is after removing words that count less than threshold'''
    vocab_to_int = covert_vocab_to_int(word_counts, embeddings_index)

    word_embedding_matrix = create_combine_word_matrix(vocab_to_int, embeddings_index)

    # Apply convert_to_ints to clean_articles and clean_headlines
    print("Article Data")
    int_repr_articles, word_article_count, unk_article_count = convert_to_ints(clean_articles, 
                                                                               vocab_to_int, eos=True)

    print("Headline Data")
    int_repr_headlines, word_headline_count, unk_headline_count = convert_to_ints(clean_headlines,
                                                                                  vocab_to_int)

    lengths_articles = create_dataFrame(int_repr_articles)
    # lengths_headlines = create_dataFrame(int_repr_headlines)

    sorted_articles, sorted_headlines = sort_corplus(lengths_articles, int_repr_articles,
                                                     int_repr_headlines, vocab_to_int)

    return sorted_articles, sorted_headlines, vocab_to_int, word_embedding_matrix

# Define placeholders

In [79]:
def model_inputs():
    '''Create palceholders for inputs to the model'''

    input_data = tf.placeholder(tf.int32, [None, None], name='input_data')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    lr = tf.placeholder(tf.float32, name='learning_rate')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    headline_length = tf.placeholder(tf.int32, (None,), name='headline_length')
    max_headline_length = tf.reduce_max(headline_length, name='max_headline_length')
    article_length = tf.placeholder(tf.int32, (None,), name='article_length')

    return input_data, targets, lr, keep_prob, headline_length, max_headline_length, article_length

In [71]:
def process_encoding_input(target_data, vocab_to_int, batch_size):
    '''Remove the last word id from each batch and concat the <GO> to the begining of each batch'''
    # sample = [[11, 12, 13], [31, 32, 33], [51, 52, 53], [61, 62,63]]
    # slice = tf.strided_slice(sample, begin=[0,0], end=[4,4], strides=[1,1])
    # process_input = tf.concat([tf.fill([4, 1], 9999), slice], 1)
    # process_input = [[9999   11   12   13], [9999   31   32   33] , [9999   51   52   53], [9999   61   62   63]]

    # target data has batch_size rows, -1 means everything, so first elect of each row
    slice = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1])

    # tf.fill creates array of batch_size X 1 and then fill in value of '<GO>'
    # create matrix that has first column as value vocab_to_int['<GO>'] and second as index [first column of each row)
    process_input = tf.concat([tf.fill([batch_size, 1], vocab_to_int['<GO>']), slice], 1)

    return process_input

In [80]:
def encoding_layer(rnn_size, article_length, num_layers, rnn_inputs, keep_prob):
    '''Create the encoding layer'''

    # Number of layer inside neural network
    for layer in range(num_layers):
        with tf.variable_scope('encoder_{}'.format(layer)):

            # forward direction cell with random weights with seed value for reproduce random value
            cell_fw = tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))

            # Dropout to kills cells that are not changing.
            cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw, input_keep_prob=keep_prob)

            cell_bw = tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw, input_keep_prob=keep_prob)

            # Bidirectional as it is more optimized, spl with Dropouts
            enc_output, enc_state = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, rnn_inputs, article_length, dtype=tf.float32)

    # Join outputs since we are using a bidirectional RNN
    enc_output = tf.concat(enc_output, 2)

    return enc_output, enc_state

In [81]:
def train_decoding_layer(dec_embed_input, headline_length, dec_cell, initial_state, output_layer, max_headline_length):
    '''Create the training logits'''

    # for training : read inputs from dense ground truth vector
    training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_embed_input,
                                                        sequence_length=headline_length,
                                                        time_major=False)

    training_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell,
                                                       training_helper,
                                                       initial_state,
                                                       output_layer)

    training_logits, _, _ = tf.contrib.seq2seq.dynamic_decode(training_decoder, impute_finished=True,
                                                              output_time_major=False,
                                                              maximum_iterations = max_headline_length)

    return training_logits

In [82]:
def inference_decoding_layer(embeddings, start_token, end_token, dec_cell, initial_state, output_layer,
                             max_headline_length, batch_size):
    '''Create the inference logits'''

    start_tokens = tf.tile(tf.constant([start_token], dtype=tf.int32), [batch_size], name='start_tokens')
    '''
    # For Basic decoder
    # GreedyEmbeddingHelper - > Select top probability output
    inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embeddings,
                                                                start_tokens,
                                                                end_token)

    inference_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell,
                                                        inference_helper,
                                                        initial_state,
                                                        output_layer)

    inference_logits, _, _ = tf.contrib.seq2seq.dynamic_decode(inference_decoder,
                                                               output_time_major=False,
                                                               impute_finished=True,
                                                               maximum_iterations=max_headline_length)
    '''
    beam_initial_state = dec_cell.zero_state(batch_size * beam_width, tf.float32)

    inference_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
        cell=dec_cell,
        embedding=embeddings,
        start_tokens=start_tokens,
        end_token=end_token,
        initial_state=beam_initial_state,
        beam_width=beam_width,
        output_layer=output_layer,
        length_penalty_weight=0.0)

    inference_logits, _, _ = tf.contrib.seq2seq.dynamic_decode(
        decoder=inference_decoder,
        impute_finished=False,
        maximum_iterations=2 * max_headline_length)

    return inference_logits


In [83]:
def decoding_layer(dec_embed_input, embeddings, enc_output, enc_state, vocab_size,  article_length, headline_length,
                   max_headline_length, rnn_size, vocab_to_int, keep_prob, batch_size, num_layers):
    '''Create the decoding cell and attention for the training and inference decoding layers'''

    # creating layer and Dropout layers
    for layer in range(num_layers):
        with tf.variable_scope('decoder_{}'.format(layer)):
            lstm = tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            dec_cell = tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob=keep_prob)

    # creating Dense- This is also called output layer. This will produce the summary.
    output_layer = Dense(vocab_size, activation='relu', kernel_initializer =
        tf.truncated_normal_initializer(mean=0.0, stddev=0.1))

    # Using BahdanauAttention as one of the widely used Attention Algorithms
    attn_mech = tf.contrib.seq2seq.BahdanauAttention(rnn_size, enc_output, article_length,
                                                     normalize=False, name='BahdanauAttention')

    dec_cell = tf.contrib.seq2seq.AttentionWrapper(dec_cell, attn_mech, rnn_size)
    initial_state = dec_cell.zero_state(batch_size=batch_size, dtype=enc_state[0].dtype).clone(cell_state=enc_state[0])

    # Creating training logits - which would be used during training dataset
    with tf.variable_scope("decode"):
        training_logits = train_decoding_layer(dec_embed_input,
                                               headline_length,
                                               dec_cell,
                                               initial_state,
                                               output_layer,
                                               max_headline_length)

    # Creating inference logits - which would produce output using train model
    with tf.variable_scope("decode", reuse=True):
        inference_logits = inference_decoding_layer(embeddings,
                                                    vocab_to_int['<GO>'],
                                                    vocab_to_int['<EOS>'],
                                                    dec_cell,
                                                    initial_state,
                                                    output_layer,
                                                    max_headline_length,
                                                    batch_size)

    return training_logits, inference_logits

In [76]:
def seq2seq_model(input_data, target_data, keep_prob, article_length, headline_length, max_headliney_length,
                      vocab_size, rnn_size, num_layers, vocab_to_int, batch_size, word_embedding_matrix):
    '''Use the previous functions to create the training and inference logits'''

    # Use fasttext's embeddings and the newly created ones as our embeddings
    embeddings = word_embedding_matrix

    # embedding_lookup returns embedding values of input_data that we have provided
    print("Geting embedding for encoder input")
    enc_embed_input = tf.nn.embedding_lookup(embeddings, input_data)

    # Define encoder layers - with respect to size of neurons, hidden layers and design (such as bi-directional)
    print("Initializing encoder layers")
    enc_output, enc_state = encoding_layer(rnn_size, article_length, num_layers, enc_embed_input, keep_prob)

    print("Adding 'GO' to start text")
    dec_input = process_encoding_input(target_data, vocab_to_int, batch_size)

    print("Getting embedding for encoder input")
    dec_embed_input = tf.nn.embedding_lookup(embeddings, dec_input)

    print("Getting decoding_layer logits ... ")
    # Train: Learn model parameters.
    # Inference: Apply model on unseen data to assess performance.
    training_logits, inference_logits = decoding_layer(dec_embed_input,
                                                       embeddings,
                                                       enc_output,
                                                       enc_state,
                                                       vocab_size,
                                                       article_length,
                                                       headline_length,
                                                       max_headliney_length,
                                                       rnn_size,
                                                       vocab_to_int,
                                                       keep_prob,
                                                       batch_size,
                                                       num_layers)

    return training_logits, inference_logits

In [84]:
def build_graph(vocab_to_int, word_embedding_matrix):
    # Build the graph
    train_graph = tf.Graph()
    # Set the graph to default to ensure that it is ready for training
    with train_graph.as_default():
        # Load the model inputs
        print("Load input parameter ...")
        input_data, targets, lr, keep_prob, headline_length, max_headline_length, \
            article_length = model_inputs()

        # Create the training and inference logits
        print("Create instance of seq2seq model parameter ...")

        # training_logits gives us matrix of possibilities when we trained the system whereas
        # inference_logits are used when we are trying to predict summary out of it.
        training_logits, inference_logits = seq2seq_model(tf.reverse(input_data, [-1]),
                                                          targets,
                                                          keep_prob,
                                                          article_length,
                                                          headline_length,
                                                          max_headline_length,
                                                          len(vocab_to_int) + 1,
                                                          rnn_size,
                                                          num_layers,
                                                          vocab_to_int,
                                                          batch_size,
                                                          word_embedding_matrix)

        # Create tensors for the training logits and inference logits
        training_logits = tf.identity(training_logits.rnn_output, 'logits')

        # inference_logits would be used while predicting the summary
        # used for basic decoder
        # inference_logits = tf.identity(inference_logits.sample_id, name='predictions')
        inference_logits = tf.identity(inference_logits.predicted_ids, name='predictions')

        # Create the weights for sequence_loss
        masks = tf.sequence_mask(headline_length, max_headline_length, dtype=tf.float32, name='masks')

        with tf.name_scope("optimization"):
            # Loss function
            cost = tf.contrib.seq2seq.sequence_loss(training_logits, targets, masks)

            # Optimizer
            optimizer = tf.train.AdamOptimizer(learning_rate)

            # Gradient Clipping
            gradients = optimizer.compute_gradients(cost)
            capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var)
                                for grad, var in gradients if grad is not None]
            train_op = optimizer.apply_gradients(capped_gradients)
    print("Graph is built.")
    # input_data, targets, lr, keep_prob, headline_length, max_headline_length, article_length
    return train_graph, train_op, cost, input_data, targets, lr, keep_prob, \
           headline_length, max_headline_length, article_length


In [89]:
# This could later be improved as tensorflow provide that put padding by it owns.
def pad_sentence_batch(sentence_batch, vocab_to_int):
    """Pad sentences with <PAD> so that each sentence of a batch has the same length"""
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    padded_batch  = [sentence + [vocab_to_int['<PAD>']] * (max_sentence - len(sentence))
                     for sentence in sentence_batch]
    # print ("padded  ==== > ", padded_batch)
    return padded_batch

In [90]:
def get_batches(headlines, articles, batch_size, vocab_to_int):
    """Batch headlines, articles, and the lengths of their sentences together"""
    for batch_i in range(0, len(articles) // batch_size):
        start_i = batch_i * batch_size
        headlines_batch = headlines[start_i:start_i + batch_size]
        articles_batch = articles[start_i:start_i + batch_size]
        pad_headlines_batch = np.array(pad_sentence_batch(headlines_batch, vocab_to_int))
        pad_articles_batch = np.array(pad_sentence_batch(articles_batch, vocab_to_int))

        # Need the lengths for the _lengths parameters
        pad_headlines_lengths = []
        for headline in pad_headlines_batch:
            pad_headlines_lengths.append(len(headline))

        pad_articles_lengths = []
        for article in pad_articles_batch:
            pad_articles_lengths.append(len(article))

        yield pad_headlines_batch, pad_articles_batch, pad_headlines_lengths, pad_articles_lengths

In [100]:
def train_model(train_graph, train_op, cost, gen_input_data, gen_targets, gen_lr, gen_keep_prob,
                gen_headline_length, gen_max_headline_length, gen_article_length,
                sorted_headlines_short, sorted_articles_short, vocab_to_int):
    # Record the update losses for saving improvements in the model
    headlines_update_loss = []

    # name given to checkpoint
    checkpoint = "best_model.ckpt"

    # This make sures that in one epoch it only checked as per value specified of per_epoch
    # e.g if length of article is 4000 the => 4000 / 32 (bath size) = > 125
    # (it means we will have 125 loops in 1 epoch)  then 125 / 3 - 1 = 40
    # (so while covering 125 iteration per epoch after 40 iteration
    # it will check and print the loss)
    update_check = (len(sorted_articles_short) // batch_size
                    // per_epoch) - 1
    print("init value of update_check", update_check)
    gr_learning_rate = learning_rate

    with tf.Session(graph=train_graph) as sess:
        # This is to show graph in tensorboard
        # project path tensorboard --logdir = logs - -port 6006
        # TensorBoard 1.10.0 at http: // Sam: 6006(Press CTRL + C to quit)
        writer = tf.summary.FileWriter('logs', graph=sess.graph)
        sess.run(tf.global_variables_initializer())

        # If we want to continue training a previous session
        # loader = tf.train.import_meta_graph("./" + checkpoint + '.meta')
        # loader.restore(sess, checkpoint)

        for epoch_i in range(1, epochs + 1):
            update_loss = 0
            batch_loss = 0
            for batch_i, (headlines_batch, articles_batch, headlines_lengths,
                        articles_lengths) in enumerate(get_batches(sorted_headlines_short,
                        sorted_articles_short, batch_size, vocab_to_int)):
                print("batch_i ==== ", batch_i)
                start_time = time.time()
                _, loss = sess.run(
                    [train_op, cost],
                    {gen_input_data: articles_batch,
                     gen_targets: headlines_batch,
                     gen_lr: gr_learning_rate,
                     gen_headline_length: headlines_lengths,
                     gen_article_length: articles_lengths,
                     gen_keep_prob: keep_probability})

                batch_loss += loss
                update_loss += loss
                end_time = time.time()
                batch_time = end_time - start_time

                # This prints status after value specified in display_step.
                # Helps to to see progress
                if batch_i % display_step == 0 and batch_i > 0:
                    print('Epoch {}/{} Batch {}/{} - Loss: {:>6.3f}, Seconds: {:>4.2f}'
                          .format(epoch_i,
                                  epochs,
                                  batch_i,
                                  len(sorted_articles_short) // batch_size,
                                  batch_loss / display_step,
                                  batch_time * display_step))
                    batch_loss = 0

                # print loss value after after steps specified in update_check
                if batch_i % update_check == 0 and batch_i > 0:
                    print("Average loss for this update:", round(update_loss / update_check, 3))
                    headlines_update_loss.append(update_loss)

                    # If the update loss is at a new minimum, save the model
                    if update_loss <= min(headlines_update_loss):
                        print('New Record!')
                        stop_early = 0
                        saver = tf.train.Saver()
                        saver.save(sess, checkpoint)

                    else:
                        print("No Improvement.")
                        stop_early += 1
                        if stop_early == stop:
                            break
                    update_loss = 0

            # Reduce learning rate, but not below its minimum value
            gr_learning_rate *= learning_rate_decay
            if gr_learning_rate < min_learning_rate:
                gr_learning_rate = min_learning_rate

            if stop_early == stop:
                print("Stopping Training.")
                break

In [101]:
def main():
    print("Prepare input parameters ...")
    sorted_articles, sorted_headlines, vocab_to_int, word_embedding_matrix \
        = create_input_for_graph()
    print("Build Graph parameters ...")
    train_graph, train_op, cost, gen_input_data, gen_targets, gen_lr, gen_keep_prob, \
        gen_headline_length, gen_max_headline_length, \
        gen_article_length = build_graph(vocab_to_int, word_embedding_matrix)

    # Subset the data for training, this is used to check if steps are working fine.
    # In actual run whole data should be taken
    #start = start
    end = start + 4000

    print("Total Articles that we have for this run :", len(sorted_articles))
    # Train the Model
    sorted_headlines_short = sorted_headlines[start:end]
    sorted_articles_short = sorted_articles[start:end]
    print("Total Articles samples taken for this run :", len(sorted_articles_short))
    print("The shortest text length:", len(sorted_articles_short[0]))
    print("The longest text length:", len(sorted_articles_short[-1]))

    train_model(train_graph, train_op, cost, gen_input_data, gen_targets,
                gen_lr, gen_keep_prob, gen_headline_length, gen_max_headline_length,
                gen_article_length, sorted_headlines_short, sorted_articles_short, vocab_to_int)


'''-------------------------main------------------------------'''
main()

Prepare input parameters ...
Loading Pre-Trained Model Pickle..... 
Loaded Pre-Trained Model Pickle, time taken 1.5636331900000036
counting  Articles
counting  Headlines
Total Stories :  5607
Size of Vocabulary: 47067
creating embedding index .....
Word embeddings: 23102
Number of words missing : 5787
Percent of words that are missing from vocabulary: 12.3%
Total number of unique words: 47067
Number of words we will use: 28893
Percent of words we will use: 61.39%
word_embedding_matrix length :  28893
Article Data
Total number of words :  1643175
Total number of UNKs :  21983
Percent of words that are UNK: 1.34%
Headline Data
Total number of words :  182202
Total number of UNKs :  574
Percent of words that are UNK: 0.32%
4538
4538
Build Graph parameters ...
Load input parameter ...
Create instance of seq2seq model parameter ...
Geting embedding for encoder input
Initializing encoder layers
Adding 'GO' to start text
Getting embedding for encoder input
Getting decoding_layer logits ... 
G

KeyboardInterrupt: 