# Import required libraries

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
from nltk.corpus import stopwords
import time
from tensorflow.python.layers.core import Dense
from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors
print('TensorFlow Version: {}'.format(tf.__version__))
import nltk
nltk.download('stopwords')
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import json

# Read reviews and summaries csv file

In [None]:
# Mount Google Drive so the dataset stored there is readable from this runtime.
from google.colab import drive
drive.mount("/content/drive")

# Load the reviews dataset; later cells read its 'Summary' and 'Text' columns.
data = pd.read_csv("/content/drive/MyDrive/reviews.csv")

In [None]:
# Keep only the two columns the model uses, then drop rows where either one is
# missing. Selecting first matters: calling dropna() on the full frame would
# also discard reviews whose only null sits in a column that is never read.
data = data[['Summary', 'Text']].dropna().reset_index(drop=True)

In [None]:
# Preview the first rows to sanity-check the Summary/Text columns.
data.head()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
training_data, testing_data = train_test_split(data, test_size=0.2, random_state=25)

In [None]:
# Report how many rows ended up in each split.
print(f"Training Data : {len(training_data)}")
print(f"Testing Data  : {len(testing_data)}")

# Clean and prepare the data for model training

In [None]:
# Lower-case contraction -> expansion table used by clean_text().
# Keys must stay lower-case because clean_text() lower-cases its input before
# looking words up here.
contractions = {
  "ain't": "am not",
  "aren't": "are not",
  "can't": "cannot",
  "can't've": "cannot have",
  "'cause": "because",
  "could've": "could have",
  "couldn't": "could not",
  "couldn't've": "could not have",
  "didn't": "did not",
  "doesn't": "does not",
  "don't": "do not",
  "hadn't": "had not",
  "hadn't've": "had not have",
  "hasn't": "has not",
  "haven't": "have not",
  "he'd": "he would",
  "he'd've": "he would have",
  "he'll": "he will",
  "he's": "he is",
  "how'd": "how did",
  "how'll": "how will",
  "how's": "how is",
  "i'd": "i would",
  "i'll": "i will",
  "i'm": "i am",
  "i've": "i have",
  "isn't": "is not",
  "it'd": "it would",
  "it'll": "it will",
  "it's": "it is",
  "let's": "let us",
  "ma'am": "madam",
  "mayn't": "may not",
  "might've": "might have",
  "mightn't": "might not",
  "must've": "must have",
  "mustn't": "must not",
  "needn't": "need not",
  "oughtn't": "ought not",
  "shan't": "shall not",
  "sha'n't": "shall not",
  "she'd": "she would",
  "she'll": "she will",
  "she's": "she is",
  "should've": "should have",
  "shouldn't": "should not",
  "that'd": "that would",
  "that's": "that is",
  "there'd": "there had",
  "there's": "there is",
  "they'd": "they would",
  "they'll": "they will",
  "they're": "they are",
  "they've": "they have",
  "wasn't": "was not",
  "we'd": "we would",
  "we'll": "we will",
  "we're": "we are",
  "we've": "we have",
  "weren't": "were not",
  "what'll": "what will",
  "what're": "what are",
  "what's": "what is",
  "what've": "what have",
  "where'd": "where did",
  "where's": "where is",
  "who'll": "who will",
  "who's": "who is",
  "won't": "will not",
  "wouldn't": "would not",
  "you'd": "you would",
  "you'll": "you will",
  "you're": "you are",
  # Added: the other "...'ve" pronoun forms were present but this one was not.
  "you've": "you have",
}

In [None]:
def _english_stopwords():
    """Return the NLTK English stopword set, reading the corpus only once."""
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words("english"))
        _english_stopwords._cache = cached
    return cached


def clean_text(text, remove_stopwords = True):
    """
    Normalise a review or summary into lower-case, space-separated words.

    Steps, in order:
      1. Lower-case the text and expand contractions via the `contractions` table.
      2. Remove links, HTML remnants (`<a href`, `<br />`, `&amp;`) and punctuation.
      3. Optionally remove English stopwords.

    Args:
        text: Raw string to clean.
        remove_stopwords: When True, drop NLTK English stopwords.

    Returns:
        The cleaned string.
    """
    # Expand contractions word by word; split/join also collapses whitespace
    # (including newlines) into single spaces.
    words = text.lower().split()
    text = " ".join(contractions.get(word, word) for word in words)

    # Remove only the URL token. The text is a single line at this point, so a
    # greedy ".*" after the scheme would delete the rest of the review.
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)
    # Must run before the punctuation pass: that pass replaces '/' with a
    # space, after which '<br />' can no longer match.
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'\'', ' ', text)

    if remove_stopwords:
        stops = _english_stopwords()
        text = " ".join(w for w in text.split() if w not in stops)

    return text

In [None]:
# Accumulate word frequencies into an existing dictionary
def count_words(count_dict, text):
  """Add the occurrences of every whitespace-separated word in `text` to `count_dict`.

  Args:
    count_dict: dict mapping word -> count; updated in place.
    text: iterable of sentence strings.
  """
  for sentence in text:
    for token in sentence.split():
      count_dict[token] = count_dict.get(token, 0) + 1

In [None]:
# Load ConceptNet Numberbatch (CN) embeddings
def get_embeddings(path='text_summarizer/data/numberbatch-en-17.04b.txt'):
  """Read a space-separated text embedding file into a {word: vector} dict.

  Each data line is expected to be `word v1 v2 ... vn`.

  Args:
    path: Location of the embedding file. Defaults to the Numberbatch file
      this notebook was written against.

  Returns:
    dict mapping each word to a float32 numpy array.
  """
  embeddings_index = {}
  with open(path, encoding='utf-8') as f:
    for line in f:
      values = line.rstrip().split(' ')
      # Skip blank lines and a word2vec-style "<count> <dim>" header line.
      # Storing the header would create a 1-element "vector" that numpy
      # would silently broadcast across a whole embedding-matrix row.
      if len(values) < 3:
        continue
      embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
  return embeddings_index

In [None]:
# Translate every word of every sentence into its integer id from vocab_to_int.
# Words that are not in vocab_to_int are replaced by the id of <UNK>.
# Running totals of words and UNKs are updated, and <EOS> is optionally appended.
def convert_to_ints(text, word_count, unk_count, eos=False):
    """Convert sentences to lists of vocabulary ids.

    Args:
        text: iterable of sentence strings.
        word_count: running total of words seen so far.
        unk_count: running total of out-of-vocabulary words seen so far.
        eos: when True, append the <EOS> id to every converted sentence.

    Returns:
        (list of id lists, updated word_count, updated unk_count)
    """
    converted = []
    for sentence in text:
        tokens = sentence.split()
        word_count += len(tokens)
        ids = []
        for token in tokens:
            token_id = vocab_to_int.get(token)
            if token_id is None:
                token_id = vocab_to_int["<UNK>"]
                unk_count += 1
            ids.append(token_id)
        if eos:
            ids.append(vocab_to_int["<EOS>"])
        converted.append(ids)
    return converted, word_count, unk_count

In [None]:
# Build a one-column dataframe holding the length of every sequence
def create_lengths(text):
    """Return a DataFrame with a single 'counts' column: len() of each item in `text`."""
    return pd.DataFrame([len(sequence) for sequence in text], columns=['counts'])

In [None]:
# Count how many ids in a converted sentence are the <UNK> id
def unk_counter(sentence):
    """Return the number of elements of `sentence` equal to vocab_to_int["<UNK>"]."""
    return sum(1 for token_id in sentence if token_id == vocab_to_int["<UNK>"])

In [None]:
# Clean the training summaries (stopwords kept) and the review texts (stopwords removed)
clean_summaries = [clean_text(summary, remove_stopwords=False)
                   for summary in training_data.Summary]

clean_texts = [clean_text(text) for text in training_data.Text]

In [None]:
# Despite its name, word_idx maps each word to its occurrence count across
# the cleaned summaries and texts (it is filled by count_words).
word_idx = {}

count_words(word_idx, clean_summaries)
count_words(word_idx, clean_texts)

print("Size of Vocabulary:", len(word_idx))

In [None]:
# Load the pre-trained word vectors into a {word: vector} dict.
embeddings_idx = get_embeddings()
print('Word embeddings:', len(embeddings_idx))

In [None]:
# Count the frequent words that have no pre-trained CN embedding.
# A word is "frequent" when it occurs at least `threshold` times; this is the
# same `>=` test the vocabulary-building cell applies, so the number reported
# here matches the words that will later receive a random embedding.
missing_words = 0
threshold = 20

for word, count in word_idx.items():
    if count >= threshold and word not in embeddings_idx:
        missing_words += 1

# Scale to a percentage before rounding to avoid artifacts such as
# 0.8500000000000001, and guard against an empty vocabulary.
missing_ratio = round(100 * missing_words / len(word_idx), 2) if word_idx else 0.0

print("Number of words missing from CN:", missing_words)
print("Percent of words that are missing from vocabulary: {}%".format(missing_ratio))

In [None]:
# Word -> integer id. A word is kept when it is frequent enough or has a
# pre-trained embedding; ids are handed out in insertion order.
vocab_to_int = {}

for word, count in word_idx.items():
    if count >= threshold or word in embeddings_idx:
        vocab_to_int[word] = len(vocab_to_int)

# Special tokens, appended after the regular words
codes = ["<UNK>","<PAD>","<EOS>","<GO>"]
for code in codes:
    vocab_to_int[code] = len(vocab_to_int)

# Reverse lookup: integer id -> word
int_to_vocab = {index: token for token, index in vocab_to_int.items()}

usage_ratio = round(len(vocab_to_int) / len(word_idx),4)*100

print("Total number of unique words:", len(word_idx))
print("Number of words we will use:", len(vocab_to_int))
print("Percent of words we will use: {}%".format(round(usage_ratio, 3)))

In [None]:
# The embedding width must be 300 to match CN's vectors.
embedding_dim = 300
nb_words = len(vocab_to_int)

# Start from an all-zero matrix and fill one row per vocabulary id
word_embedding_matrix = np.zeros((nb_words, embedding_dim), dtype=np.float32)
for token, row in vocab_to_int.items():
    vector = embeddings_idx.get(token)
    if vector is None:
        # No CN vector for this word: draw a random one and remember it
        vector = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
        embeddings_idx[token] = vector
    word_embedding_matrix[row] = vector

print(len(word_embedding_matrix))  # Should equal len(vocab_to_int)

In [None]:
# Apply convert_to_ints to clean_summaries and clean_texts.
# The two counters are threaded through both calls, so the totals printed
# below cover summaries and texts together (the "headlines" label notwithstanding).
# Only the texts get a trailing <EOS>; the summaries are converted without one.
word_count = 0
unk_count = 0

int_summaries, word_count, unk_count = convert_to_ints(clean_summaries, word_count, unk_count)
int_texts, word_count, unk_count = convert_to_ints(clean_texts, word_count, unk_count, eos=True)

unk_percent = round(unk_count/word_count,4)*100

print("Total number of words in headlines:", word_count)
print("Total number of UNKs in headlines:", unk_count)
print("Percent of words that are UNK: {}%".format(unk_percent))

In [None]:
# Per-example token counts (one 'counts' column each) for summaries and texts.
lengths_summaries = create_lengths(int_summaries)
lengths_texts = create_lengths(int_texts)

In [None]:
# IPython magic: enable the TensorBoard notebook extension.
%load_ext tensorboard

In [None]:
# Sort the summaries and texts by the length of the texts, shortest to longest
# Limit the length of summaries and texts based on the min and max ranges.
# Remove reviews that include too many UNKs

max_text_length = 84
max_summary_length = 13
min_length = 2
unk_text_limit = 1
unk_summary_limit = 0

# Filter every (summary, text) pair exactly once. Texts must be shorter than
# max_text_length (exclusive upper bound); summaries may be up to and
# including max_summary_length.
kept_pairs = [
    (summary_ints, text_ints)
    for summary_ints, text_ints in zip(int_summaries, int_texts)
    if min_length <= len(summary_ints) <= max_summary_length
    and min_length <= len(text_ints) < max_text_length
    and unk_counter(summary_ints) <= unk_summary_limit
    and unk_counter(text_ints) <= unk_text_limit
]

# list.sort is stable, so pairs with equal text length keep their original
# relative order. That is the order a scan per candidate length would produce,
# without rescanning the whole dataset for every length.
kept_pairs.sort(key=lambda pair: len(pair[1]))

sorted_summaries = [summary_ints for summary_ints, _ in kept_pairs]
sorted_texts = [text_ints for _, text_ints in kept_pairs]

# Compare lengths to ensure they match
print(len(sorted_summaries))
print(len(sorted_texts))

In [None]:
# Persist both vocabulary mappings so they can be reloaded without rebuilding.
# JSON object keys are always strings, so the integer keys of int_to_vocab are
# written as strings and have to be converted back with int() after loading.
with open('vocab2int.json', 'w') as fp:
    json.dump(vocab_to_int, fp)
with open('int2vocab.json', 'w') as fp:
    json.dump(int_to_vocab, fp)

# Model Creation

In [None]:
def model_inputs():
    """Create the graph's input placeholders.

    The explicit op names matter: get_summary_from_S2S_model looks up
    'input:0', 'text_length:0', 'summary_length:0' and 'keep_prob:0' by name
    after reloading the saved graph.

    Returns:
        Tuple (input_data, targets, lr, keep_prob, summary_length,
        max_summary_length, text_length). All are placeholders except
        max_summary_length, which is the reduce_max of summary_length.
    """
    input_data = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    lr = tf.placeholder(tf.float32, name='learning_rate')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    summary_length = tf.placeholder(tf.int32, (None,), name='summary_length')
    max_summary_length = tf.reduce_max(summary_length, name='max_dec_len')
    text_length = tf.placeholder(tf.int32, (None,), name='text_length')

    return input_data, targets, lr, keep_prob, summary_length, max_summary_length, text_length

In [None]:
# Remove the last word index from each row of the batch and then concat <GO> to the beginning of every row
def process_encoding_input(target_data, vocab_to_int, batch_size):
    """Build the decoder input from the target batch.

    Despite the name, the result is used as the decoder's input
    (see seq2seq_model).

    Args:
        target_data: 2-D int tensor of target ids.
        vocab_to_int: vocabulary dict; only the '<GO>' id is read.
        batch_size: number of rows to slice and to fill with '<GO>'.

    Returns:
        Tensor with '<GO>' prepended to, and the last column removed from,
        the first `batch_size` rows of target_data.
    """
    # Slice [0:batch_size, 0:-1], i.e. every row without its last column
    ending = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1])
    dec_input = tf.concat([tf.fill([batch_size, 1], vocab_to_int['<GO>']), ending], 1)

    return dec_input

In [None]:
# Creating the encoding layer
def encoding_layer(rnn_size, sequence_length, num_layers, rnn_inputs, keep_prob):
    """Build a stack of bidirectional LSTM encoder layers.

    Each layer consumes the concatenated forward/backward outputs of the layer
    below it. Feeding the raw `rnn_inputs` to every layer would leave all
    layers except the last one disconnected from the returned tensors.

    NOTE: for num_layers > 1 this changes the kernel shapes of the upper
    layers, so variables trained with a graph that fed `rnn_inputs` to every
    layer cannot be restored into a graph built by this function.

    Args:
        rnn_size: number of units in each LSTM cell.
        sequence_length: per-example input lengths passed to the RNN.
        num_layers: number of bidirectional layers to stack.
        rnn_inputs: embedded encoder inputs.
        keep_prob: dropout keep probability applied to each cell's input.

    Returns:
        (enc_output, enc_state): the last layer's forward and backward outputs
        concatenated on axis 2, and the last layer's (forward, backward) state
        pair.
    """
    layer_inputs = rnn_inputs
    for layer in range(num_layers):
        with tf.variable_scope('encoder_{}'.format(layer)):
            cell_fw = tf.contrib.rnn.LSTMCell(
                rnn_size,
                initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw,
                                                    input_keep_prob=keep_prob)

            cell_bw = tf.contrib.rnn.LSTMCell(
                rnn_size,
                initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw,
                                                    input_keep_prob=keep_prob)

            enc_output, enc_state = tf.nn.bidirectional_dynamic_rnn(cell_fw,
                                                                    cell_bw,
                                                                    layer_inputs,
                                                                    sequence_length,
                                                                    dtype=tf.float32)

            # Join forward and backward outputs; this is both the next layer's
            # input and, after the last layer, the encoder output.
            layer_inputs = tf.concat(enc_output, 2)

    return layer_inputs, enc_state

In [None]:
# Creating the training logits
def training_decoding_layer(dec_embed_input, summary_length, dec_cell, initial_state, output_layer,
                            vocab_size, max_summary_length):
    """Build the training-time decoder, which reads its inputs from dec_embed_input.

    Args:
        dec_embed_input: embedded decoder inputs handed to TrainingHelper.
        summary_length: per-example sequence lengths for the helper.
        dec_cell: decoder cell passed to BasicDecoder.
        initial_state: initial decoder state.
        output_layer: layer applied by BasicDecoder to the cell outputs.
        vocab_size: not used inside this function.
        max_summary_length: cap on the number of decoding iterations.

    Returns:
        The first element of dynamic_decode's result (the decoder output).
    """
    # The print calls below run once, while the graph is being built.
    print("Training Started.....")
    training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_embed_input,
                                                        sequence_length=summary_length,
                                                        time_major=False)
    print("Helper : ", training_helper)
    print("Basic Decoder Started.....")
    training_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell,
                                                       training_helper,
                                                       initial_state,
                                                       output_layer)
    print("Decoder : ", training_decoder)
    print("Dynamic Decoder Started.....")
    training_logits,_,_= tf.contrib.seq2seq.dynamic_decode(training_decoder,
                                                           output_time_major=False,
                                                           impute_finished=True,
                                                           maximum_iterations=max_summary_length)

    print("Training Ended........")
    return training_logits

In [None]:
# Creating the inference decoding layer
def inference_decoding_layer(embeddings, start_token, end_token, dec_cell, initial_state, output_layer,
                             max_summary_length, batch_size):
    """Build the inference-time decoder around a GreedyEmbeddingHelper.

    Args:
        embeddings: embedding matrix handed to the helper.
        start_token: id fed as the first input of every sequence.
        end_token: id passed to the helper as the end-of-sequence token.
        dec_cell: decoder cell passed to BasicDecoder.
        initial_state: initial decoder state.
        output_layer: layer applied by BasicDecoder to the cell outputs.
        max_summary_length: cap on the number of decoding iterations.
        batch_size: number of start tokens to create.

    Returns:
        The first element of dynamic_decode's result (the decoder output).
    """

    # One start token per batch row
    start_tokens = tf.tile(tf.constant([start_token], dtype=tf.int32), [batch_size], name='start_tokens')
    print("Inference Started.....")
    inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embeddings,
                                                                start_tokens,
                                                                end_token)
    print("Helper : ", inference_helper)
    print("Basic Decoder Started.....")
    inference_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell,
                                                        inference_helper,
                                                        initial_state,
                                                        output_layer)
    print("Decoder : ", inference_decoder)
    print("Dynamic Decoder Started.....")

    inference_logits,_,_ = tf.contrib.seq2seq.dynamic_decode(inference_decoder,
                                                            output_time_major=False,
                                                            impute_finished=True,
                                                            maximum_iterations=max_summary_length)
    print("Inference Ended........")
    return inference_logits

In [None]:
# Create the decoding cell and attention for the training and inference decoding layers

def decoding_layer(dec_embed_input, embeddings, enc_output, enc_state, vocab_size, text_length, summary_length,
                   max_summary_length, rnn_size, vocab_to_int, keep_prob, batch_size, num_layers):
    """Build the attention decoder and return its training and inference outputs.

    Args:
        dec_embed_input: embedded decoder inputs for the training branch.
        embeddings: embedding matrix for the inference branch.
        enc_output: encoder outputs, used as the attention memory.
        enc_state: encoder state; element [0] seeds the decoder cell state.
        vocab_size: number of units of the output Dense layer.
        text_length: per-example encoder lengths for the attention mechanism.
        summary_length: per-example target lengths for the training branch.
        max_summary_length: cap on decoding iterations for both branches.
        rnn_size: LSTM units; also the attention depth and attention layer size.
        vocab_to_int: vocabulary dict; the '<GO>' and '<EOS>' ids are read.
        keep_prob: dropout keep probability on the decoder cell input.
        batch_size: batch size used for the initial state and start tokens.
        num_layers: number of iterations of the cell-creation loop.

    Returns:
        (training_logits, inference_logits) as returned by the two decoding
        layer builders.
    """

    # NOTE(review): each iteration overwrites `dec_cell`, so only the cell
    # created in the last iteration reaches AttentionWrapper below; the cells
    # are not stacked. Confirm whether a multi-layer decoder was intended.
    for layer in range(num_layers):
        with tf.variable_scope('decoder_{}'.format(layer)):
            lstm = tf.contrib.rnn.LSTMCell(rnn_size,
                                           initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            dec_cell = tf.contrib.rnn.DropoutWrapper(lstm,
                                                     input_keep_prob = keep_prob)

    output_layer = Dense(vocab_size,
                         kernel_initializer = tf.truncated_normal_initializer(mean = 0.0, stddev=0.1))

    attn_mech = tf.contrib.seq2seq.BahdanauAttention(rnn_size,
                                                  enc_output,
                                                  text_length,
                                                  normalize=False,
                                                  name='BahdanauAttention')

    dec_cell = tf.contrib.seq2seq.AttentionWrapper(dec_cell,
                                                          attn_mech,
                                                          rnn_size)



    # Start from the wrapper's default state, with the cell state replaced by
    # the first element of the encoder state.
    initial_state = dec_cell.get_initial_state(batch_size=batch_size, dtype=tf.float32)
    initial_state = initial_state.clone(cell_state=enc_state[0])


    with tf.variable_scope("decode"):
        training_logits = training_decoding_layer(dec_embed_input,
                                                  summary_length,
                                                  dec_cell,
                                                  initial_state,
                                                  output_layer,
                                                  vocab_size,
                                                  max_summary_length)
        print("Training Logics : ", training_logits)
    # reuse=True: the inference branch shares the variables created above
    with tf.variable_scope("decode", reuse=True):
        inference_logits = inference_decoding_layer(embeddings,
                                                    vocab_to_int['<GO>'],
                                                    vocab_to_int['<EOS>'],
                                                    dec_cell,
                                                    initial_state,
                                                    output_layer,
                                                    max_summary_length,
                                                    batch_size)
        print("Inference Logics : ", inference_logits)

    return training_logits, inference_logits

In [None]:
def seq2seq_model(input_data, target_data, keep_prob, text_length, summary_length, max_summary_length,
                  vocab_size, rnn_size, num_layers, vocab_to_int, batch_size):
    """Wire the encoder and decoder together.

    Reads the module-level `word_embedding_matrix` as the embedding table for
    both the encoder and the decoder inputs.

    Returns:
        (training_logits, inference_logits) from decoding_layer.
    """


    # Use Numberbatch's embeddings and the newly created ones as our embeddings
    embeddings = word_embedding_matrix

    enc_embed_input = tf.nn.embedding_lookup(embeddings, input_data)
    enc_output, enc_state = encoding_layer(rnn_size, text_length, num_layers, enc_embed_input, keep_prob)

    # Decoder input = targets with <GO> prepended and the last column removed
    dec_input = process_encoding_input(target_data, vocab_to_int, batch_size)
    dec_embed_input = tf.nn.embedding_lookup(embeddings, dec_input)
    print("Starting Decoding Layer")
    training_logits, inference_logits  = decoding_layer(dec_embed_input,
                                                        embeddings,
                                                        enc_output,
                                                        enc_state,
                                                        vocab_size,
                                                        text_length,
                                                        summary_length,
                                                        max_summary_length,
                                                        rnn_size,
                                                        vocab_to_int,
                                                        keep_prob,
                                                        batch_size,
                                                        num_layers)

    # NOTE(review): this is len() of the decoder output structure, not a sequence length
    print("Training Length : ", len(training_logits))
    return training_logits, inference_logits

In [None]:
# Right-pad every sentence of a batch with <PAD> up to the longest sentence

def pad_sentence_batch(sentence_batch):
    """Return new lists in which each sentence is extended with the <PAD> id to a common length."""
    longest = max(map(len, sentence_batch))
    padded = []
    for tokens in sentence_batch:
        filler = [vocab_to_int['<PAD>']] * (longest - len(tokens))
        padded.append(tokens + filler)
    return padded

In [None]:
# Yield padded batches of summaries and texts together with their row lengths

def get_batches(summaries, texts, batch_size):
    """Generate full batches only; a trailing partial batch is dropped.

    Yields:
        (padded summaries array, padded texts array,
         summary lengths, text lengths).
        The lengths are measured on the padded rows, so within one batch they
        all equal that batch's longest sequence.
    """
    full_batches = len(texts) // batch_size
    for start_i in range(0, full_batches * batch_size, batch_size):
        stop_i = start_i + batch_size
        pad_summaries_batch = np.array(pad_sentence_batch(summaries[start_i:stop_i]))
        pad_texts_batch = np.array(pad_sentence_batch(texts[start_i:stop_i]))

        # Lengths for the *_length placeholders, taken after padding
        pad_summaries_lengths = [len(row) for row in pad_summaries_batch]
        pad_texts_lengths = [len(row) for row in pad_texts_batch]

        yield pad_summaries_batch, pad_texts_batch, pad_summaries_lengths, pad_texts_lengths

In [None]:
# Set the Hyperparameters
epochs = 10
# get_summary_from_S2S_model hard-codes the same batch size (64); keep them in sync.
batch_size = 64
rnn_size = 256
num_layers = 2
# Initial value; the training loop multiplies it by learning_rate_decay after every epoch.
learning_rate = 0.005
# Fed to the keep_prob placeholder during training (inference feeds 1.0).
keep_probability = 0.75

In [None]:
# Build the graph
train_graph = tf.Graph()
# Set the graph to default to ensure that it is ready for training
with train_graph.as_default():

    # Load the model inputs
    input_data, targets, lr, keep_prob, summary_length, max_summary_length, text_length = model_inputs()

    # Create the training and inference logits.
    # The encoder sees each input row reversed along its last axis.
    training_logits, inference_logits = seq2seq_model(tf.reverse(input_data, [-1]),
                                                      targets,
                                                      keep_prob,
                                                      text_length,
                                                      summary_length,
                                                      max_summary_length,
                                                      len(vocab_to_int)+1,
                                                      rnn_size,
                                                      num_layers,
                                                      vocab_to_int,
                                                      batch_size)


    # Create named tensors for the training logits and inference predictions;
    # 'predictions' is looked up by name when the saved graph is reloaded.
    training_logits = tf.identity(training_logits.rnn_output, 'logits')
    inference_logits = tf.identity(inference_logits.sample_id, name='predictions')

    # Create the weights for sequence_loss
    masks = tf.sequence_mask(summary_length, max_summary_length, dtype=tf.float32, name='masks')

    with tf.name_scope("optimization"):
        # Loss function
        cost = tf.contrib.seq2seq.sequence_loss(
            training_logits,
            targets,
            masks)

        # Optimizer. It must read the `lr` placeholder: building it from the
        # Python float `learning_rate` would bake the initial value into the
        # graph, and the decayed rate fed on every training step would be
        # ignored.
        optimizer = tf.train.AdamOptimizer(lr)

        # Gradient Clipping
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)
print("Graph is built.")

# Model training and results

In [None]:
# Subset the data for training: up to 30,000 examples starting at position
# 200,000 of the length-sorted lists.
# NOTE(review): if sorted_texts holds 200,000 examples or fewer, both slices
# are empty and the [0] / [-1] lookups below raise IndexError.
start = 200000
end = start + 30000
sorted_summaries_short = sorted_summaries[start:end]
sorted_texts_short = sorted_texts[start:end]
print("The shortest text length:", len(sorted_texts_short[0]))
print("The longest text length:",len(sorted_texts_short[-1]))

In [None]:
# Train the Model
train_model_start=time.time()
learning_rate_decay = 0.95
min_learning_rate = 0.0005
display_step = 20
stop_early = 0   # consecutive update checks without improvement
stop = 2         # stop training after this many checks without improvement
per_epoch = 6    # target number of update checks per epoch
# Number of batches between update checks. Clamped to at least 1 so a small
# training subset cannot yield a zero (ZeroDivisionError in the modulus below)
# or negative interval.
update_check = max(1, (len(sorted_texts_short)//batch_size//per_epoch)-1)

update_loss = 0
batch_loss = 0
summary_update_loss = [] # Record the update losses for saving improvements in the model

checkpoint = "seq2seq_model.ckpt"
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())

    # Create the Saver once. Constructing a new one for every checkpoint adds
    # another set of save/restore ops to the graph each time.
    saver = tf.train.Saver()

    # If we want to continue training a previous session
    #loader = tf.train.import_meta_graph("./" + checkpoint + '.meta')
    #loader.restore(sess, checkpoint)

    for epoch_i in range(1, epochs+1):
        update_loss = 0
        batch_loss = 0
        for batch_i, (summaries_batch, texts_batch, summaries_lengths, texts_lengths) in enumerate(
                get_batches(sorted_summaries_short, sorted_texts_short, batch_size)):
            start_time = time.time()
            _, loss = sess.run(
                [train_op, cost],
                {input_data: texts_batch,
                 targets: summaries_batch,
                 lr: learning_rate,
                 summary_length: summaries_lengths,
                 text_length: texts_lengths,
                 keep_prob: keep_probability})

            batch_loss += loss
            update_loss += loss
            end_time = time.time()
            batch_time = end_time - start_time

            if batch_i % display_step == 0 and batch_i > 0:
                # The time shown is the last batch's duration scaled by
                # display_step, i.e. an estimate rather than a measurement.
                print('Epoch {:>3}/{} Batch {:>4}/{} - Loss: {:>6.3f}, Seconds: {:>4.2f}'
                      .format(epoch_i,
                              epochs,
                              batch_i,
                              len(sorted_texts_short) // batch_size,
                              batch_loss / display_step,
                              batch_time*display_step))
                batch_loss = 0

            if batch_i % update_check == 0 and batch_i > 0:
                print("Average loss for this update:", round(update_loss/update_check,3))
                summary_update_loss.append(update_loss)

                # If the update loss is at a new minimum, save the model
                if update_loss <= min(summary_update_loss):
                    print('New Record!')
                    stop_early=0
                    saver.save(sess, checkpoint)

                else:
                    print("No Improvement.")
                    stop_early += 1
                    if stop_early == stop:
                        break
                update_loss = 0


        # Reduce learning rate, but not below its minimum value
        learning_rate *= learning_rate_decay
        if learning_rate < min_learning_rate:
            learning_rate = min_learning_rate

        # The break above only leaves the batch loop; leave the epoch loop too
        if stop_early == stop:
            print("Stopping Training.")
            break
train_model_end=time.time()
print(f"Training Model Took : {train_model_end-train_model_start}")

In [None]:
def text_to_seq(text):
    '''Clean `text` and map each remaining word to its vocabulary id (<UNK> id when unknown).'''
    sequence = []
    for token in clean_text(text).split():
        sequence.append(vocab_to_int.get(token, vocab_to_int['<UNK>']))
    return sequence

In [None]:
def get_summary_from_S2S_model(reviews, generated_summary, summary):
  """Generate a summary for every review with the saved seq2seq model.

  Restores the graph and weights from ./seq2seq_model.ckpt, decodes each
  review, appends the decoded text to `generated_summary` and prints the
  review, the reference, the generated text and their ROUGE scores.

  Requires the module-level names `vocab_to_int`, `int_to_vocab` and the
  ROUGE metric object `r` to exist before it is called.

  Args:
    reviews: sequence of raw review strings.
    generated_summary: list that receives one generated string per review
      (mutated in place).
    summary: reference summaries, aligned with `reviews`.
  """
  # Must match the batch size the graph was built with.
  batch_size=64
  checkpoint = "./seq2seq_model.ckpt"
  loaded_graph = tf.Graph()
  with tf.Session(graph=loaded_graph) as sess:
      # Load saved model
      loader = tf.train.import_meta_graph(checkpoint + '.meta')
      loader.restore(sess, checkpoint)

      input_data = loaded_graph.get_tensor_by_name('input:0')
      logits = loaded_graph.get_tensor_by_name('predictions:0')
      text_length = loaded_graph.get_tensor_by_name('text_length:0')
      summary_length = loaded_graph.get_tensor_by_name('summary_length:0')
      keep_prob = loaded_graph.get_tensor_by_name('keep_prob:0')

      for i, data in enumerate(reviews):
        try:
          text = text_to_seq(data)
          # The single review is repeated batch_size times to match the
          # model's input parameters; only the first output row is used.
          answer_logits = sess.run(logits, {input_data: [text]*batch_size,
                                            summary_length: [np.random.randint(5,8)],
                                            text_length: [len(text)]*batch_size,
                                            keep_prob: 1.0})[0]
          pad = vocab_to_int["<PAD>"]
          words = [int_to_vocab[token] for token in answer_logits if token != pad]
          # Drop repeated words, keeping the first occurrence of each
          generated = " ".join(dict.fromkeys(words))
        except Exception as err:
          # Stay best-effort, but never substitute the reference summary as
          # the prediction: that would score as a perfect match and inflate
          # the ROUGE results. An empty prediction scores as a miss instead.
          print("Summary generation failed for review", i + 1, ":", repr(err))
          generated = ""
        generated_summary.append(generated)

        print("Generated Summary : ", i+1)
        print("Review : ", data)
        print("Summary : ", summary[i])
        print("Generated : ", generated)
        rouge_matrix = r.compute(predictions=[generated], references=[summary[i]])
        print("Rouge : ", rouge_matrix)

        print("-------------------------------------------------------------------------")


In [None]:
# IPython magic: open TensorBoard on the `logs` directory.
# NOTE(review): no visible cell writes event files to `logs` — confirm a summary writer exists elsewhere.
%tensorboard --logdir logs

In [None]:
def result_analysis(number_of_records = 10):
  """Summarize a random sample of the test split with the saved model.

  Args:
    number_of_records: how many distinct test rows to sample. Values larger
      than the test split are clamped to its size.

  Returns:
    (reviews, summary, generated_summary): the sampled review texts, their
    reference summaries and the generated summaries, index-aligned.
  """
  reviews = []
  summary = []
  generated_summary = []

  # Sample without replacement in a single call. A "draw until enough distinct
  # rows were seen" loop never terminates when more rows are requested than
  # the test split contains.
  sample_size = max(0, min(number_of_records, len(testing_data)))
  positions = np.random.choice(len(testing_data), size=sample_size, replace=False)
  for position in positions:
    # Positional access: the split keeps the original (non-contiguous) index
    summary.append(testing_data['Summary'].iloc[position])
    reviews.append(testing_data['Text'].iloc[position])

  get_summary_from_S2S_model(reviews, generated_summary, summary)

  return reviews, summary, generated_summary

In [None]:
# Reload both vocabulary mappings from disk
with open('vocab2int.json') as f:
  vocab_to_int = json.load(f)
with open('int2vocab.json') as f:
  int_to_vocab_ = json.load(f)
print(vocab_to_int)

# JSON stored the ids as string keys; turn them back into integers
int_to_vocab = {int(raw_id): token for raw_id, token in int_to_vocab_.items()}
print(int_to_vocab)

In [None]:
# get_summary_from_S2S_model (called by result_analysis) reads the global
# ROUGE metric `r` for its per-review printout, so the metric has to exist
# before the evaluation starts.
from datasets import load_metric
r = load_metric("rouge")

reviews, summary, generated_summary = result_analysis(200)

In [None]:
# Load the ROUGE metric used to score generated summaries against references.
# NOTE(review): `load_metric` is reportedly deprecated in recent `datasets`
# releases in favour of the `evaluate` package — confirm against the installed version.
from datasets import load_metric
r = load_metric("rouge")

In [None]:
# Corpus-level ROUGE. The model output is the prediction and the human-written
# summary is the reference; passing them the other way round swaps the
# reported precision and recall.
rouge_matrix = r.compute(predictions=generated_summary, references=summary)
rouge_matrix

In [None]:
key, p,re,f=[],[],[],[]
for k, value in rouge_matrix.items():
  key+=[k]
  p+=[value.high.precision]
  re+=[value.high.recall]
  f+=[value.high.fmeasure]


In [None]:
data = pd.DataFrame([key,p,re,f])
data = data.transpose()
data.columns=['Rouge','Precision','Recall','F-1']