In [1]:
#https://stackoverflow.com/questions/38088652/pandas-convert-categories-to-numbers
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [2]:
#ex = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

def get_pos_tags_dict(words):
    """Tag ``words`` with NLTK and map every word to an integer code of its tag.

    Args:
        words: iterable of tokens (the notebook passes ``word_dict.keys()``).

    Returns:
        A ``(pos_list, post_tags_for_words)`` tuple where ``pos_list`` maps each
        distinct word to an integer code and ``post_tags_for_words`` is the raw
        ``[(word, tag), ...]`` list returned by ``nltk.pos_tag``. Codes are the
        positions of the tags in the sorted list of distinct tags, which is the
        numbering ``pandas.Categorical`` produced in the previous implementation.
        Empty input yields ``({}, [])`` instead of raising.
    """
    post_tags_for_words = nltk.pos_tag(list(words))

    # A repeated word keeps the tag of its last occurrence (plain dict semantics).
    tag_by_word = dict(post_tags_for_words)

    # Number the distinct tags in sorted order so the codes are deterministic.
    code_by_tag = {
        tag: code for code, tag in enumerate(sorted(set(tag_by_word.values())))
    }

    pos_list = {word: code_by_tag[tag] for word, tag in tag_by_word.items()}
    return pos_list , post_tags_for_words

In [3]:
# In[231]:


from nltk.tokenize import word_tokenize
import re
import collections
import pickle
import numpy as np
from gensim.models.keyedvectors import KeyedVectors
from gensim.test.utils import get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec

# default_path = "drive/My Drive/NLP/project/rnn-lstm/data/"
# Root directory for the dataset files, the pickled vocabulary, the GloVe pickle
# and the saved checkpoints (every other path below is built from it).
# NOTE(review): absolute, machine-specific paths — adjust before running elsewhere.
default_path = "/media/mount/Users/Ruchit Modi/Documents/CSE538/project/Model2/dataset/"
# Directory that holds the '<step>_article_scores.pickle' TextRank score files.
textrank_base_path = "/media/mount/Users/Ruchit Modi/Documents/CSE538/project/Model2"

# Article / title text files for the training and validation splits.
train_article_path = default_path + "sumdata/train/train.article.txt"
train_title_path   = default_path + "sumdata/train/train.title.txt"
valid_article_path = default_path + "sumdata/train/valid.article.filter.txt"
valid_title_path   = default_path + "sumdata/train/valid.title.filter.txt"

# Line caps applied by get_text_list(): the first when toy is False, the second when toy is True.
textrank_train_len = 200000
textrank_valid_len = 50

def clean_str(sentence):
    """Collapse every run of '#' and '.' characters in ``sentence`` into a single '#'."""
    return re.sub("[#.]+", "#", sentence)


def get_text_list(data_path, toy):
    """Read the first lines of a UTF-8 text file, stripped and passed through ``clean_str``.

    Args:
        data_path: path of the text file, one example per line.
        toy: when falsy at most ``textrank_train_len`` lines are returned,
            otherwise at most ``textrank_valid_len`` lines.

    Returns:
        A list of cleaned lines in file order.
    """
    from itertools import islice

    max_lines = textrank_valid_len if toy else textrank_train_len
    with open (data_path, "r", encoding="utf-8") as f:
        # Stop reading once the cap is reached instead of loading the whole
        # file into memory and slicing afterwards.
        return [clean_str(line.strip()) for line in islice(f, max_lines)]
        
def build_dict(step, toy=False):
    """Create (``step == "train"``) or load (``step == "valid"``) the vocabulary.

    For "train" the training articles and titles are tokenized, words are
    numbered by descending frequency after the four special tokens, and the
    mapping is pickled to ``default_path + "word_dict.pickle"``. For "valid"
    that pickle is read back.

    Args:
        step: "train" or "valid".
        toy: forwarded to ``get_text_list`` when building the vocabulary.

    Returns:
        ``(word_dict, reversed_dict, article_max_len, summary_max_len)`` where
        ``reversed_dict`` maps ids back to words and the two lengths are the
        fixed values 50 and 15.

    Raises:
        NotImplementedError: if ``step`` is neither "train" nor "valid"
            (same convention as ``build_dataset``).
    """
    if step == "train":
        sentences = get_text_list(train_article_path, toy) + get_text_list(train_title_path, toy)
        token_counts = collections.Counter(
            token for sentence in sentences for token in word_tokenize(sentence)
        )

        # Ids 0-3 are reserved for the special tokens.
        word_dict = {"<padding>": 0, "<unk>": 1, "<s>": 2, "</s>": 3}
        for word, _ in token_counts.most_common():
            # setdefault keeps a reserved id intact if a corpus token happens
            # to be spelled like a special token.
            word_dict.setdefault(word, len(word_dict))

        with open(default_path + "word_dict.pickle", "wb") as f:
            pickle.dump(word_dict, f)

    elif step == "valid":
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # load a vocabulary written by a trusted training run.
        with open(default_path + "word_dict.pickle", "rb") as f:
            word_dict = pickle.load(f)

    else:
        raise NotImplementedError

    reversed_dict = dict(zip(word_dict.values(), word_dict.keys()))

    article_max_len = 50
    summary_max_len = 15

    return word_dict, reversed_dict, article_max_len, summary_max_len


def build_dataset(step, word_dict, article_max_len, summary_max_len, toy=False):
    """Turn the article (and, for training, title) texts into lists of word ids.

    Articles are tokenized, mapped through ``word_dict`` (unknown words become
    the "<unk>" id), cut to ``article_max_len`` and right-padded with the
    "<padding>" id. Titles are mapped the same way and cut to
    ``summary_max_len - 1`` ids, but are not padded.

    Returns ``x`` alone for step "valid" and ``(x, y)`` for step "train";
    any other step raises NotImplementedError.
    """
    def to_ids(text, limit):
        # Tokenize, look every token up (falling back to <unk>), then truncate.
        ids = [word_dict.get(token, word_dict["<unk>"]) for token in word_tokenize(text)]
        return ids[:limit]

    if step == "train":
        article_list = get_text_list(train_article_path, toy)
        title_list = get_text_list(train_title_path, toy)
    elif step == "valid":
        # Only when args.toy == True
        article_list = get_text_list(valid_article_path, toy)
    else:
        raise NotImplementedError

    x = []
    for article in article_list:
        ids = to_ids(article, article_max_len)
        missing = article_max_len - len(ids)
        x.append(ids + missing * [word_dict["<padding>"]])

    if step == "valid":
        return x

    y = [to_ids(title, summary_max_len - 1) for title in title_list]
    return x, y


def _as_array_allowing_ragged(rows):
    """Convert ``rows`` to an ndarray, using a 1-D object array when rows differ in length."""
    try:
        return np.array(rows)
    except ValueError:
        # Recent NumPy versions refuse to build an array from ragged nested
        # sequences; store each row as one object element instead.
        ragged = np.empty(len(rows), dtype=object)
        for position, row in enumerate(rows):
            ragged[position] = row
        return ragged


def batch_iter(inputs, outputs, batch_size, num_epochs):
    """Yield consecutive mini-batches of ``inputs`` / ``outputs`` for ``num_epochs`` passes.

    Args:
        inputs: sequence of equally long id lists (converted to an ndarray).
        outputs: sequence aligned with ``inputs``; rows may differ in length
            (the training titles are not padded).
        batch_size: maximum number of examples per batch.
        num_epochs: number of passes over the data.

    Yields:
        ``(input_batch, output_batch, sentence_ids)`` where ``sentence_ids`` is
        a list of one-element lists holding each example's index in ``inputs``.
        Batches follow the original order; no shuffling is performed.
    """
    inputs = np.array(inputs)
    outputs = _as_array_allowing_ragged(outputs)

    sentence_ids = [[i] for i in range(inputs.shape[0])]
    num_batches_per_epoch = (len(inputs) - 1) // batch_size + 1
    for epoch in range(num_epochs):
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, len(inputs))
            yield inputs[start_index:end_index], outputs[start_index:end_index], sentence_ids[start_index:end_index]


def get_init_embedding(word_dict , reversed_dict, embedding_size):
    """Build the initial embedding matrix: pretrained vector plus two tiled scalar features.

    For every vocabulary id (in ascending order) the pickled word-vector model
    is queried. When the word is known, the weight returned by
    ``tf_idf_generate`` and the POS code from ``get_pos_tags_dict`` (0 when
    missing) are each repeated 10 times and appended to the vector. Words the
    model does not know get an all-zero row of length ``embedding_size``.
    Rows 2 and 3 ("<s>" and "</s>") are finally replaced by N(0, 1) noise.

    Args:
        word_dict: word -> id mapping (its keys are POS-tagged).
        reversed_dict: id -> word mapping; defines the row order.
        embedding_size: width of every row; must equal the pretrained vector
            length plus the 20 appended feature values.

    Returns:
        ``np.ndarray`` with one row per entry of ``reversed_dict``.

    Raises:
        ValueError: if an enriched vector is not ``embedding_size`` wide
            (previously this produced ragged rows).
    """
    feature_repeat = 10  # each scalar feature is tiled this many times

    print("Loading Lists...")
    train_article_list = get_text_list(train_article_path, False)
    train_title_list = get_text_list(train_title_path, False)

    print("Loading TF-IDF...")
    tf_idf_list = tf_idf_generate(train_article_list+train_title_list)

    print("Loading Pos Tags...")
    pos_list , _ = get_pos_tags_dict(word_dict.keys())

    print("Loading Glove vectors...")

    # SECURITY: pickle.load executes arbitrary code from the file; only load a
    # model pickle that you created yourself.
    with open( default_path + "glove/model_glove_300.pkl", 'rb') as handle:
        word_vectors = pickle.load(handle)

    used_words = 0
    word_vec_list = list()
    for _, word in sorted(reversed_dict.items()):
        try:
            word_vec = word_vectors.word_vec(word)
        except KeyError:
            # Word unknown to the pretrained model (e.g. <padding>, <unk>).
            word_vec = np.zeros([embedding_size], dtype=np.float32)
        else:
            # Append the TF-IDF weight block, then the POS-code block.
            for feature_value in (tf_idf_list.get(word, 0), pos_list.get(word, 0)):
                word_vec = np.append(word_vec, np.array([feature_value] * feature_repeat))
            if word_vec.shape[0] != embedding_size:
                raise ValueError(
                    "embedding_size is {} but the enriched vector for {!r} has length {}".format(
                        embedding_size, word, word_vec.shape[0]))
            used_words += 1

        word_vec_list.append(np.array(word_vec))

    print("words found in glove percentage = " + str((used_words/len(word_vec_list))*100) )

    # Assign random vector to <s>, </s> token
    word_vec_list[2] = np.random.normal(0, 1, embedding_size)
    word_vec_list[3] = np.random.normal(0, 1, embedding_size)

    return np.array(word_vec_list)

In [4]:
# In[232]:


# _____TF-IDF libraries_____
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# _____helper Libraries_____
import pickle  # would be used for saving temp files
import csv     # used for accessing the dataset
import timeit  # to measure time of training
import random  # used to get a random number


def tf_idf_generate(sentences):
    """Map every term of ``sentences`` to its inverse-document-frequency weight.

    The corpus is vectorized with a default ``CountVectorizer`` and a
    ``TfidfTransformer`` is fitted on the counts. Note that the returned value
    per term is the transformer's ``idf_`` weight (one global number per term),
    not a per-document tf-idf score.

    Args:
        sentences: iterable of raw text documents.

    Returns:
        dict mapping each vocabulary term to its IDF weight.
    """
    #https://stackoverflow.com/questions/30976120/find-the-tf-idf-score-of-specific-words-in-documents-using-sklearn
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

    cv = CountVectorizer()

    # convert text data into term-frequency matrix
    term_counts = cv.fit_transform(sentences)

    # Only the fitted idf_ vector is needed, so the tf-idf matrix itself is
    # not materialised.
    tfidf_transformer = TfidfTransformer()
    tfidf_transformer.fit(term_counts)

    # get_feature_names() was removed from recent scikit-learn releases in
    # favour of get_feature_names_out(); support both.
    if hasattr(cv, "get_feature_names_out"):
        feature_names = cv.get_feature_names_out()
    else:
        feature_names = cv.get_feature_names()

    word2tfidf = dict(zip(feature_names, tfidf_transformer.idf_))

    return word2tfidf

# https://nlpforhackers.io/named-entity-extraction/
from nltk import word_tokenize, pos_tag, ne_chunk


def named_entity(post_tags_for_words):
    """Map single-word named entities to an integer code of their entity label.

    ``ne_chunk`` is run on the tagged tokens; every chunk subtree that holds
    exactly one ``(word, tag)`` leaf contributes ``word -> label``. Multi-word
    entities and plain (non-entity) tokens are ignored. Labels are then
    numbered by their position in the sorted list of distinct labels.

    Args:
        post_tags_for_words: list of ``(word, pos_tag)`` tuples.

    Returns:
        dict mapping each single-word entity to its label code; empty when no
        such entity is found.
    """
    label_by_word = {}
    for node in ne_chunk(post_tags_for_words):
        # Entity chunks are subtrees exposing label(); ordinary tokens are
        # plain (word, tag) tuples. Reading the leaf directly avoids parsing
        # the printed tree, which broke on words containing '/'.
        if hasattr(node, "label") and len(node) == 1:
            label_by_word[node[0][0]] = node.label()
    print(label_by_word)

    code_by_label = {
        label: code for code, label in enumerate(sorted(set(label_by_word.values())))
    }
    names_dict = {word: code_by_label[label] for word, label in label_by_word.items()}
    print(names_dict)
    return names_dict



## Generating dependencies

# Build the vocabulary from the training articles/titles (this also writes word_dict.pickle).
print("Building dictionary...")
word_dict, reversed_dict, article_max_len, summary_max_len = build_dict("train", False)

# Convert the training articles and titles to lists of word ids.
print("Loading training dataset...")
train_x, train_y = build_dataset("train", word_dict, article_max_len, summary_max_len, False)

# NOTE(review): get_init_embedding() below reloads the text lists and recomputes the
# POS-tag and TF-IDF dictionaries itself, so the next three steps repeat that work
# (the cell output shows "Loading Lists..." / "Loading TF-IDF..." twice).
print("Building pos-tags")
pos_list , postags_for_named_entity = get_pos_tags_dict(word_dict.keys())

print("Loading Lists...")
train_article_list = get_text_list(train_article_path, False)
train_title_list = get_text_list(train_title_path, False)

print("Loading TF-IDF...")
tf_idf_list = tf_idf_generate(train_article_list+train_title_list)
    
print("Currently assuming: article_max_len : " + str(article_max_len))
print("Currently assuming: summary_max_len : " + str(summary_max_len))

# 320 is the embedding width passed to get_init_embedding; it has to match
# args.embedding_size used when the Model is built.
print("Building input embeddings...")
input_embedding = get_init_embedding(word_dict , reversed_dict, 320)

def generate_article_scores(step, toy=False):
    """Load the pickled TextRank scores for ``step`` and re-key them by word id.

    The file ``textrank_base_path + '/<step>_article_scores.pickle'`` is
    expected to map an article id to an iterable of ``(word, score)`` pairs.
    Words missing from the module-level ``word_dict`` are dropped and counted
    as cache misses; the miss percentage is printed.

    Args:
        step: name used to select the score file (e.g. "train" or "valid").
        toy: unused; kept for call compatibility.

    Returns:
        dict mapping article id -> {word id: score}.
    """
    import pickle

    scores_path = textrank_base_path + '/{}_article_scores.pickle'.format(step)
    # SECURITY: pickle.load executes arbitrary code from the file; only load
    # score files produced by a trusted run.
    with open(scores_path, 'rb') as f:
        article_text_rank_scores = pickle.load(f)

    sentence_to_word_dic = {}
    cache_miss = 0
    total_found = 0
    for article_id, scored_words in article_text_rank_scores.items():
        word_dic = {}
        for word, score in scored_words:
            word_id = word_dict.get(word, None)
            if word_id is not None:
                word_dic[word_id] = score
                total_found += 1
            else:
                cache_miss += 1
        sentence_to_word_dic[article_id] = word_dic

    # Guard against a score file without any (word, score) pair.
    total_lookups = cache_miss + total_found
    miss_percentage = 100*cache_miss/total_lookups if total_lookups else 0.0
    print('Cache miss percentage: {} %'.format(miss_percentage))
    return sentence_to_word_dic


def build_train_sentence_textrank_mat(step, word_dict, article_max_len, summary_max_len, toy=False):
    """Return a (num_articles, article_max_len) matrix of per-token TextRank scores.

    Row ``i`` holds, for each token of article ``i``, the score stored for that
    token's word id in ``generate_article_scores(step, toy)[i]`` (0.0 when the
    token or its score is unknown), truncated / zero-padded to
    ``article_max_len``. ``summary_max_len`` is accepted but not used.
    Steps other than "train" and "valid" raise NotImplementedError.
    """
    if step == "train":
        source_path = train_article_path
    elif step == "valid":
        source_path = valid_article_path
    else:
        raise NotImplementedError

    article_list = get_text_list(source_path, toy)
    sentence_to_word_dic = generate_article_scores(step, toy)

    rows = []
    for idx, article in enumerate(article_list):
        scores_by_word_id = sentence_to_word_dic[idx]
        row = [
            scores_by_word_id.get(word_dict.get(token, None), 0.0)
            for token in word_tokenize(article)
        ][:article_max_len]
        # Right-pad short articles with zeros.
        row += [0.0] * (article_max_len - len(row))
        rows.append(np.array(row))

    return np.array(rows)

Building dictionary...
Loading training dataset...
Building pos-tags
Loading Lists...
Loading TF-IDF...
Currently assuming: article_max_len : 50
Currently assuming: summary_max_len : 15
Building input embeddings...
Loading Lists...
Loading TF-IDF...
Loading Pos Tags...
Loading Glove vectors...
words found in glove percentage = 91.75771029889796


In [5]:
# Per-token TextRank scores for every training article; the Model constructor
# reads this module-level array when it builds its graph.
print("Building training sentence textrank matrix")
train_sentence_textrank_mat = build_train_sentence_textrank_mat("train", word_dict, article_max_len, summary_max_len, False)

# Sanity output: largest score and the matrix shape.
print("Max value in train_sentence_textrank_mat: " + str(np.max(train_sentence_textrank_mat)) + " Shape: " + str(train_sentence_textrank_mat.shape))




import tensorflow as tf
from tensorflow.contrib import rnn
#from utils import get_init_embedding

In [45]:
class Model(object):
    """Attention-based encoder/decoder summarisation graph (TensorFlow 1.x, ``tf.contrib``).

    The encoder is a stacked bidirectional LSTM; the decoder is a single LSTM
    cell wrapped with Bahdanau attention. With ``forward_only=False`` the
    decoder is driven by a ``TrainingHelper`` and a masked cross-entropy loss
    plus an Adam update op are built. With ``forward_only=True`` a
    ``BeamSearchDecoder`` is built and exposed through ``self.prediction``.

    Besides its arguments, the constructor reads the module-level array
    ``train_sentence_textrank_mat`` and, when training with ``args.glove``
    set, the module-level ``input_embedding``.
    """
    def __init__(self, reversed_dict, article_max_len, summary_max_len, args, forward_only=False):
        """Create all placeholders, variables and ops in the current default graph.

        Args:
            reversed_dict: id -> word mapping; only its length (vocabulary size) is used.
            article_max_len: width of the encoder input placeholder ``X``.
            summary_max_len: width of the decoder placeholders and the beam-search step limit.
            args: object exposing ``embedding_size``, ``num_hidden``, ``num_layers``,
                ``learning_rate``, ``beam_width``, ``glove`` and (training only) ``keep_prob``.
            forward_only: build the inference (beam search) graph instead of the training graph.
        """
        self.vocabulary_size = len(reversed_dict)
        self.embedding_size = args.embedding_size
        self.num_hidden = args.num_hidden
        self.num_layers = args.num_layers
        self.learning_rate = args.learning_rate
        self.beam_width = args.beam_width
        # Dropout is disabled (keep probability 1.0) when only running inference.
        if not forward_only:
            self.keep_prob = args.keep_prob
        else:
            self.keep_prob = 1.0
        self.cell = tf.nn.rnn_cell.BasicLSTMCell
        # Projection from decoder output to vocabulary logits; used by both decoding branches.
        with tf.variable_scope("decoder/projection"):
            self.projection_layer = tf.layers.Dense(self.vocabulary_size, use_bias=False)

        self.batch_size = tf.placeholder(tf.int32, (), name="batch_size")
        self.X = tf.placeholder(tf.int32, [None, article_max_len])
        self.X_len = tf.placeholder(tf.int32, [None])

        # Fed by the training / evaluation loops, but only consumed by the
        # commented-out TextRank lookup further down.
        self.sentence_ids = tf.placeholder(tf.int32, [None, 1])

        self.decoder_input = tf.placeholder(tf.int32, [None, summary_max_len])
        self.decoder_len = tf.placeholder(tf.int32, [None])
        self.decoder_target = tf.placeholder(tf.int32, [None, summary_max_len])
        self.global_step = tf.Variable(0, trainable=False)

        # Debugging variables
        self.my_debug_inp = None
        self.my_debug_inp2 = None

        with tf.name_scope("embedding"):
            # NOTE(review): this constant is only referenced by the commented-out
            # lines below, yet it is created for both the training and the
            # inference graph.
            train_sentence_textrank_mat_tf = tf.constant(train_sentence_textrank_mat, dtype=tf.float32)
            if not forward_only and args.glove: #training
                #init_embeddings = tf.constant(get_init_embedding(word_dict ,reversed_dict, self.embedding_size), dtype=tf.float32)
                init_embeddings = tf.constant(input_embedding, dtype=tf.float32)

            else: #testing
                init_embeddings = tf.random_uniform([self.vocabulary_size, self.embedding_size], -1.0, 1.0)
                # TODO: Handle sentence-textrank matrix case here (Test case)


            self.embeddings = tf.get_variable("embeddings", initializer=init_embeddings)
#             self.sentence_text_rank_mat = tf.get_variable("sentence_text_rank_mat", initializer=train_sentence_textrank_mat_tf)
#             self.encoder_textrank_inp = tf.transpose(tf.nn.embedding_lookup(self.sentence_text_rank_mat, self.sentence_ids), perm=[2, 0, 1])
#             self.encoder_textrank_inp = tf.tile(self.encoder_textrank_inp, multiples=[1, 1, 10])

            # The transpose with perm=[1, 0, 2] swaps the first two axes of the
            # looked-up embeddings; the RNN calls below use time_major=True.
            self.encoder_emb_inp = tf.transpose(tf.nn.embedding_lookup(self.embeddings, self.X), perm=[1, 0, 2])
            self.decoder_emb_inp = tf.transpose(tf.nn.embedding_lookup(self.embeddings, self.decoder_input), perm=[1, 0, 2])

#             self.encoder_emb_inp = tf.concat([self.encoder_emb_inp, self.encoder_textrank_inp], axis=2)

#             self.my_debug_inp = self.final_encoder_emb_inp

#             self.my_debug_inp2 = self.encoder_emb_inp


        with tf.name_scope("encoder"):
            fw_cells = [self.cell(self.num_hidden) for _ in range(self.num_layers)]
            bw_cells = [self.cell(self.num_hidden) for _ in range(self.num_layers)]
            # TODO: Dropout
            fw_cells = [rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob) for cell in fw_cells]
            bw_cells = [rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob) for cell in bw_cells]

            encoder_outputs, encoder_state_fw, encoder_state_bw = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
                fw_cells, bw_cells, self.encoder_emb_inp,
                sequence_length=self.X_len, time_major=True, dtype=tf.float32)
            self.encoder_output = tf.concat(encoder_outputs, 2)
            # Only element [0] of the forward / backward state sequences seeds the decoder.
            # NOTE(review): presumably these sequences hold one state per layer, in which
            # case this is the first layer's state — confirm the last layer was not intended.
            encoder_state_c = tf.concat((encoder_state_fw[0].c, encoder_state_bw[0].c), 1)
            encoder_state_h = tf.concat((encoder_state_fw[0].h, encoder_state_bw[0].h), 1)
            self.encoder_state = rnn.LSTMStateTuple(c=encoder_state_c, h=encoder_state_h)

        with tf.name_scope("decoder"), tf.variable_scope("decoder") as decoder_scope:
            # Twice num_hidden units, matching the concatenated forward+backward encoder state.
            decoder_cell = self.cell(self.num_hidden * 2)

            if not forward_only: #trainig
                attention_states = tf.transpose(self.encoder_output, [1, 0, 2])
                attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                    self.num_hidden * 2, attention_states, memory_sequence_length=self.X_len, normalize=True)
                decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism,
                                                                   attention_layer_size=self.num_hidden * 2)
                # Start from a zero attention state whose cell state is the encoder state.
                initial_state = decoder_cell.zero_state(dtype=tf.float32, batch_size=self.batch_size)
                initial_state = initial_state.clone(cell_state=self.encoder_state)
                helper = tf.contrib.seq2seq.TrainingHelper(self.decoder_emb_inp, self.decoder_len, time_major=True)
                decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, helper, initial_state)
                outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, output_time_major=True, scope=decoder_scope)
                self.decoder_output = outputs.rnn_output
                self.logits = tf.transpose(
                    self.projection_layer(self.decoder_output), perm=[1, 0, 2])
                # Pad the logits with zeros along axis 1 up to summary_max_len so they
                # line up with decoder_target in the loss below.
                self.logits_reshape = tf.concat(
                    [self.logits, tf.zeros([self.batch_size, summary_max_len - tf.shape(self.logits)[1], self.vocabulary_size])], axis=1)
            else: #testing
                # Beam search needs every encoder tensor repeated beam_width times.
                tiled_encoder_output = tf.contrib.seq2seq.tile_batch(
                    tf.transpose(self.encoder_output, perm=[1, 0, 2]), multiplier=self.beam_width)
                tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(self.encoder_state, multiplier=self.beam_width)
                tiled_seq_len = tf.contrib.seq2seq.tile_batch(self.X_len, multiplier=self.beam_width)
                attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                    self.num_hidden * 2, tiled_encoder_output, memory_sequence_length=tiled_seq_len, normalize=True)
                decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism,
                                                                   attention_layer_size=self.num_hidden * 2)
                initial_state = decoder_cell.zero_state(dtype=tf.float32, batch_size=self.batch_size * self.beam_width)
                initial_state = initial_state.clone(cell_state=tiled_encoder_final_state)
                # Token ids 2 and 3 are the ids build_dict assigns to "<s>" and "</s>".
                decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                    cell=decoder_cell,
                    embedding=self.embeddings,
                    start_tokens=tf.fill([self.batch_size], tf.constant(2)),
                    end_token=tf.constant(3),
                    initial_state=initial_state,
                    beam_width=self.beam_width,
                    output_layer=self.projection_layer
                )
                outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder, output_time_major=True, maximum_iterations=summary_max_len, scope=decoder_scope)
                # Callers index the result as prediction[:, 0, :] to pick one beam per example.
                self.prediction = tf.transpose(outputs.predicted_ids, perm=[1, 2, 0])

        with tf.name_scope("loss"):
            if not forward_only: #training
                crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.logits_reshape, labels=self.decoder_target)
                # Mask out positions beyond each target's length, then average over the batch.
                weights = tf.sequence_mask(self.decoder_len, summary_max_len, dtype=tf.float32)
                self.loss = tf.reduce_sum(crossent * weights / tf.to_float(self.batch_size))
                # TODO: Regularization
                params = tf.trainable_variables()
                gradients = tf.gradients(self.loss, params)
                # Clip the global gradient norm to 5.0 before the Adam update.
                clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
                optimizer = tf.train.AdamOptimizer(self.learning_rate)
                self.update = optimizer.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step)

In [None]:
print("Starting Training")
                
from tqdm import tqdm
import time
# Reference point for the "Elapsed" time printed after every epoch.
start = time.perf_counter()
import tensorflow as tf
import argparse
import pickle
import os

# Plain namespace standing in for parsed command-line arguments; the
# hyper-parameters are attached as class attributes below.
class args:
    pass
  
args.num_hidden=150
args.num_layers=2
args.beam_width=10
# The string "store_true" is only tested for truthiness by Model, so this enables
# the pretrained-embedding branch.
args.glove="store_true"
# Must match the width of the rows in input_embedding (built with 320 above).
args.embedding_size=320

args.learning_rate=1e-3
args.batch_size=64
args.num_epochs=10
args.keep_prob = 0.8

args.toy=False #"store_true"

# Also a truthy string; tested when deciding whether to read the old checkpoint index.
args.with_model="store_true"

# Directory that receives the checkpoints written by the training loop.
saved_model_dir = default_path + "saved_model_textrank"

# Checkpoint name recorded by a previous run, or None when there is none.
# The directory can exist without an index file (e.g. a run that was stopped
# before its first save), which previously raised FileNotFoundError here.
old_model_checkpoint_path = None
if not os.path.exists(saved_model_dir):
    os.mkdir(saved_model_dir)
elif args.with_model:
    checkpoint_index = saved_model_dir + "/checkpoint"
    if os.path.exists(checkpoint_index):
        with open(checkpoint_index, 'r') as checkpoint_file:
            index_lines = checkpoint_file.read().splitlines()
        # The first line is expected to look like: model_checkpoint_path: "<name>"
        quoted_parts = index_lines[0].split('"') if index_lines else []
        if len(quoted_parts) > 1:
            old_model_checkpoint_path = quoted_parts[1]


tf.reset_default_graph()

# NOTE(review): old_model_checkpoint_path is never passed to saver.restore in this
# cell, so training always starts from freshly initialised variables.
with tf.Session() as sess:
    model = Model(reversed_dict, article_max_len, summary_max_len, args)
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver(tf.global_variables())

    batches = batch_iter(train_x, train_y, args.batch_size, args.num_epochs)
    num_batches_per_epoch = (len(train_x) - 1) // args.batch_size + 1

    print("\nIteration starts.")
    print("Number of batches per epoch :", num_batches_per_epoch)
    total_batches_done = 0
    
    
    for batch_x, batch_y, sentence_ids in tqdm(batches):
        # Sequence lengths are the counts of non-zero ids (0 is the "<padding>" id).
        batch_x_len = list(map(lambda x: len([y for y in x if y != 0]), batch_x))
        # Decoder input is the title prefixed with "<s>"; the target is the title followed by "</s>".
        batch_decoder_input = list(map(lambda x: [word_dict["<s>"]] + list(x), batch_y))
        batch_decoder_len = list(map(lambda x: len([y for y in x if y != 0]), batch_decoder_input))
        batch_decoder_output = list(map(lambda x: list(x) + [word_dict["</s>"]], batch_y))

        # Right-pad both decoder sequences to summary_max_len.
        batch_decoder_input = list(
            map(lambda d: d + (summary_max_len - len(d)) * [word_dict["<padding>"]], batch_decoder_input))
        batch_decoder_output = list(
            map(lambda d: d + (summary_max_len - len(d)) * [word_dict["<padding>"]], batch_decoder_output))
        
        total_batches_done += 1
        
        train_feed_dict = {
            model.batch_size: len(batch_x),
            model.X: batch_x,
            model.X_len: batch_x_len,
            model.decoder_input: batch_decoder_input,
            model.decoder_len: batch_decoder_len,
            model.decoder_target: batch_decoder_output,
            model.sentence_ids: sentence_ids
        }

        _, step, loss = sess.run([model.update, model.global_step, model.loss], feed_dict=train_feed_dict)
        
        if step % 1000 == 0:
            print("step {0}: loss = {1}".format(step, loss))

        # A checkpoint is written whenever the global step is a multiple of the
        # number of batches per epoch, i.e. once per completed epoch.
        if step % num_batches_per_epoch == 0:
            hours, rem = divmod(time.perf_counter() - start, 3600)
            minutes, seconds = divmod(rem, 60)
            saver.save(sess, default_path + "saved_model_textrank/model.ckpt", global_step=step)
            print(" Epoch {0}: Model is saved.".format(step // num_batches_per_epoch),
            "Elapsed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds) , "\n")

Starting Training
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.


0it [00:00, ?it/s]


Iteration starts.
Number of batches per epoch : 3125


1000it [05:33,  2.86it/s]

step 1000: loss = 52.490299224853516


1961it [10:58,  3.10it/s]

In [43]:
print("Building validation sentence textrank matrix")

# toy=True makes get_text_list cap the validation articles at textrank_valid_len lines.
sentence_textrank_valid_mat = build_train_sentence_textrank_mat("valid", word_dict, article_max_len, summary_max_len, True)

print("Max value in sentence_textrank_valid_mat: " + str(np.max(sentence_textrank_valid_mat)) + " Shape: " + str(sentence_textrank_valid_mat.shape))


# Append all-zero rows until the matrix has textrank_train_len rows.
# NOTE(review): presumably this is meant to match the row count of the training
# matrix; the padded matrix is not used by any cell visible here — confirm.
if sentence_textrank_valid_mat.shape[0] < textrank_train_len:
    padding_rows = np.zeros((textrank_train_len - sentence_textrank_valid_mat.shape[0], sentence_textrank_valid_mat.shape[1]))
    sentence_textrank_valid_mat = np.concatenate([sentence_textrank_valid_mat, padding_rows])

print("Shape of sentence_textrank_valid_mat after padding: " + str(sentence_textrank_valid_mat.shape))

In [47]:
import tensorflow as tf
import pickle

# Start from an empty graph so the inference Model can be built from scratch.
tf.reset_default_graph()

# Redefinition of the args namespace used for training; only args.toy differs.
class args:
    pass

args.num_hidden=150
args.num_layers=2
args.beam_width=10
args.glove="store_true"
args.embedding_size=320

args.learning_rate=1e-3
args.batch_size=64
args.num_epochs=10
args.keep_prob = 0.8

# toy=True makes get_text_list cap every file at textrank_valid_len lines.
args.toy=True

args.with_model="store_true"



# Reload the vocabulary pickled by build_dict("train", ...).
print("Loading dictionary...")
word_dict, reversed_dict, article_max_len, summary_max_len = build_dict("valid", args.toy)
print("Loading validation dataset...")
valid_x = build_dataset("valid", word_dict, article_max_len, summary_max_len, args.toy)
# Lengths are the counts of non-zero ids (0 is the "<padding>" id).
valid_x_len = [len([y for y in x if y != 0]) for x in valid_x]
print("Loading article and reference...")
article = get_text_list(valid_article_path, args.toy)
reference = get_text_list(valid_title_path, args.toy)


# Separators used in the human-readable results file.
separator = "==========================================="
example_end = "++++++++++++++++++++++++++++++++++++++++++++++++++++"

# Collected across all batches so that summary_array[k] lines up with
# article[k] / reference[k] in the evaluation cell.
summary_array = []

# NOTE(review): the training cell saves to "saved_model_textrank/", while this
# cell restores from a different directory — confirm which checkpoint is intended.
checkpoint_dir = default_path + "saved_model_baseline_2epoch_200000/"

# The context manager closes the results file even if decoding fails; UTF-8
# matches the encoding used to read the articles.
with open(default_path + "textrank_results.txt", "w", encoding="utf-8") as f, tf.Session() as sess:
    print("Loading saved model...")
    model = Model(reversed_dict, article_max_len, summary_max_len, args, forward_only=True)
    saver = tf.train.Saver(tf.global_variables())
    ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
    if ckpt is None:
        raise FileNotFoundError("No checkpoint found in " + checkpoint_dir)
    saver.restore(sess, ckpt.model_checkpoint_path)

    batches = batch_iter(valid_x, [0] * len(valid_x), args.batch_size, 1)

    print("Writing summaries to 'textrank_results.txt'...")
    i = 0  # index of the first example of the current batch
    for batch_x, _, sentence_ids in batches:
        batch_x_len = [len([y for y in x if y != 0]) for x in batch_x]

        valid_feed_dict = {
            model.batch_size: len(batch_x),
            model.X: batch_x,
            model.X_len: batch_x_len,
            model.sentence_ids: sentence_ids
        }

        prediction = sess.run(model.prediction, feed_dict=valid_feed_dict)
        # Keep beam index 0 of every example and map the ids back to words.
        prediction_output = [[reversed_dict[y] for y in x] for x in prediction[:, 0, :]]
        for j, line in enumerate(prediction_output):
            # Keep words up to the end token, dropping repeated words.
            summary = list()
            for word in line:
                if word == "</s>":
                    break
                if word not in summary:
                    summary.append(word)
            summary_text = " ".join(summary)
            summary_array.append(summary_text)

            write_line = article[i+j] + "\n" + separator + "\n" + reference[i+j] + "\n" + separator + "\n" + summary_text + "\n"
            write_line = write_line + example_end + "\n" + "\n"
            f.write(write_line)
        i += len(batch_x)

    print('Summaries have been generated')

Loading dictionary...
Loading validation dataset...
Loading article and reference...
Loading saved model...
INFO:tensorflow:Restoring parameters from /media/mount/Users/Ruchit Modi/Documents/CSE538/project/Model2/dataset/saved_model_baseline/model.ckpt-6250
Writing summaries to 'textrank_results.txt'...
Summaries have been generated


In [7]:
# Rebuild the summaries from the most recent prediction_output: for every
# predicted line keep the words before "</s>", skipping words already kept.
summary_array = list()
for line in prediction_output:
    summary = []
    for word in line:
        if word == "</s>":
            break
        if word in summary:
            continue
        summary.append(word)
    summary_array += [" ".join(summary)]

In [55]:
#https://github.com/chakki-works/sumeval
#https://github.com/Tian312/awesome-text-summarization

from sumeval.metrics.rouge import RougeCalculator
from sumeval.metrics.bleu import BLEUCalculator

def eval_rouges(refrence_summary,model_summary):
    """Score ``model_summary`` against a single reference with ROUGE and BLEU.

    Args:
        refrence_summary: the reference (gold) summary text.
        model_summary: the generated summary text.

    Returns:
        ``(rouge_1, rouge_2, rouge_l, rouge_be, bleu_score)``. The four ROUGE
        values are also printed, one per line.
    """
    # Every metric receives the reference in the same form: a one-element list.
    references = [refrence_summary]

    rouge = RougeCalculator(stopwords=True, lang="en")

    rouge_1 = rouge.rouge_n(summary=model_summary, references=references, n=1)
    rouge_2 = rouge.rouge_n(summary=model_summary, references=references, n=2)
    rouge_l = rouge.rouge_l(summary=model_summary, references=references)

    # You need spaCy to calculate ROUGE-BE
    rouge_be = rouge.rouge_be(summary=model_summary, references=references)

    bleu = BLEUCalculator()
    bleu_score = bleu.bleu(summary=model_summary, references=references)

    print("ROUGE-1: {}, ROUGE-2: {}, ROUGE-L: {}, ROUGE-BE: {}".format(
        rouge_1, rouge_2, rouge_l, rouge_be
    ).replace(", ", "\n"))

    return rouge_1, rouge_2,rouge_l,rouge_be,bleu_score

# Smoke test of eval_rouges on one hand-written reference / summary pair.
refrence_summary = "two egyptian guards killed on border with gaza"
model_summary = "two egyptian border guards killed in clashes"
eval_rouges(refrence_summary,model_summary)
#rouge_1, rouge_2,rouge_l,rouge_be = eval_rouges( "tokyo shares close up #.## percent",  "tokyo stocks close up # percent to fresh record high")

#print("ROUGE-1: {}, ROUGE-2: {}, ROUGE-L: {}, ROUGE-BE: {}".format(     rouge_1, rouge_2, rouge_l, rouge_be).replace(", ", "\n"))

b.egyptian=(amod)=>guards
a.guards=(nsubj)=>killed
<BasicElement: guards-[nsubj]->kill>
b.egyptian=(amod)=>guards
a.guards=(nsubj)=>killed
<BasicElement: guards-[nsubj]->kill>
ROUGE-1: 0.8000000000000002
ROUGE-2: 0.25
ROUGE-L: 0.6
ROUGE-BE: 1.0


(0.8000000000000002, 0.25, 0.6, 1.0, 2.8634401465295505)

In [48]:
#https://pymotw.com/2/xml/etree/ElementTree/create.html

# Per-example metric scores; filled by the evaluation loop later in this cell
# and averaged once the loop has finished.
bleu_arr = []
rouge_1_arr  = []
rouge_2_arr  = []
rouge_L_arr  = []
rouge_be_arr = []

from xml.etree import ElementTree
from xml.dom import minidom
from functools import reduce

def prettify(elem):
    """Serialise ``elem`` to UTF-8 XML and return it re-indented with two spaces per level."""
    serialized = ElementTree.tostring(elem, 'utf-8')
    return minidom.parseString(serialized).toprettyxml(indent="  ")
  
from xml.etree.ElementTree import Element, SubElement, Comment

# json is needed to serialise the aggregate scores at the end of this cell and
# is not imported by any cell visible above.
import json

# NOTE(review): 'TextRank Evaluation' contains a space, which is not a valid XML
# element name; serialising `top` (e.g. through prettify) would not yield
# well-formed XML. Left unchanged because the tree is not written out here.
top = Element('TextRank Evaluation')

comment = Comment('Generated by Amr Zaki')
top.append(comment)

if not summary_array:
    raise ValueError("summary_array is empty; there is nothing to evaluate")

for i, summ in enumerate(summary_array):
    example = SubElement(top, 'example')
    article_element   = SubElement(example, 'article')
    article_element.text = article[i]

    reference_element = SubElement(example, 'reference')
    reference_element.text = reference[i]

    summary_element   = SubElement(example, 'summary')
    summary_element.text = summ

    rouge_1, rouge_2,rouge_L,rouge_be,bleu_score = eval_rouges(reference[i],summ )

    # One child element per metric, carrying the score as an attribute.
    eval_element = SubElement(example, 'eval')
    for tag, score in (('BLEU', bleu_score), ('ROUGE_1', rouge_1), ('ROUGE_2', rouge_2),
                       ('ROUGE_l', rouge_L), ('ROUGE_be', rouge_be)):
        SubElement(eval_element, tag, {'score': str(score)})

    bleu_arr.append(bleu_score)
    rouge_1_arr.append(rouge_1)
    rouge_2_arr.append(rouge_2)
    rouge_L_arr.append(rouge_L)
    rouge_be_arr.append(rouge_be)

# Aggregate every metric once: the average goes on the root element, and
# avg / min / max go into the JSON report.
scores_by_metric = {
    'bleu': bleu_arr,
    'rouge_1': rouge_1_arr,
    'rouge_2': rouge_2_arr,
    'rouge_L': rouge_L_arr,
    'rouge_be': rouge_be_arr,
}

results = {}
for metric, scores in scores_by_metric.items():
    average = sum(scores) / len(scores)
    top.set(metric, str(average))
    results[metric] = {'avg': average, 'min': min(scores), 'max': max(scores)}

with open(default_path + 'baseline_evaluation_2.json', 'w') as f:
    f.write(json.dumps(results) + '\n')

b.skating=(compound)=>championships
a.injury=(nsubj)=>leaves
<BasicElement: injury-[nsubj]->leave>
a.hopes=(dobj)=>leaves
<BasicElement: hopes-[dobj]->leave>
a.leaders=(nsubj)=>lash
<BasicElement: leaders-[nsubj]->lash>
b.illegal=(amod)=>immigrants
b.tough=(amod)=>law
a.sales=(nsubj)=>fall
<BasicElement: sales-[nsubj]->fall>
a.percent=(npadvmod)=>fall
<BasicElement: percent-[npadvmod]->fall>
a.thousands=(nsubj)=>celebrate
<BasicElement: thousands-[nsubj]->celebrate>
b.liberian=(amod)=>president
a.president=(dobj)=>attend
<BasicElement: president-[dobj]->attend>
b.attend=(advcl)=>rice
a.inauguration=(dobj)=>attend
<BasicElement: inauguration-[dobj]->attend>
b.top=(amod)=>lobbyist
b.republican=(amod)=>lobbyist
a.lobbyist=(nsubj)=>pleads
<BasicElement: lobbyist-[nsubj]->plead>
a.speaker=(nsubj)=>agrees
<BasicElement: speaker-[nsubj]->agree>
a.workers=(nsubj)=>strike
<BasicElement: workers-[nsubj]->strike>
b.portuguese=(amod)=>workers
a.strike=(nsubj)=>ground
<BasicElement: strike-[nsubj]-