# WebApp for Title Generation of Research Articles

#### **Downloading and Importing required libraries**

In [None]:
# Runtime dependencies for this notebook, installed into the current environment.
!pip install compress-pickle
!pip install rouge
# spaCy's medium English model, imported below as `en_core_web_md`.
!python -m spacy download en_core_web_md
# Install Java 8 and select it as the system `java`.
# NOTE(review): presumably required by language-check (LanguageTool) — confirm.
!sudo apt install openjdk-8-jdk
!sudo update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
!pip install language-check
!pip install flask-ngrok

In [None]:
%tensorflow_version 1.x
import tensorflow as tf
from tensorflow.contrib import rnn

import nltk
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu,SmoothingFunction
from rouge import Rouge 

from flask_ngrok import run_with_ngrok
from flask import Flask, request, render_template, redirect, url_for, send_from_directory, flash

import random
import collections
import compress_pickle as pickle
import re
import bz2
import os 
import time
import warnings

import numpy as np
import pandas as pd

from tqdm.notebook import trange,tqdm

import spacy
import en_core_web_md

# Fetch NLTK's 'punkt' data.
# NOTE(review): presumably what `word_tokenize` relies on — confirm.
nltk.download('punkt')
# Load the spaCy English pipeline imported above as `en_core_web_md`.
nlp = en_core_web_md.load()

#### **Necessary Utility functions**

In [None]:
# Directory holding the pickled vocabulary ("word_dict.bz") and the "saved_model/"
# checkpoint folder.
default_path = "/Testing/"
# Directory holding the bz2-compressed dataset files.
dataset_path = "/Dataset/"

# Test split files (abstracts and their titles), read line by line by `get_text_list`.
test_article_path = dataset_path + "abstract.test.bz2"
test_title_path   = dataset_path + "title.test.bz2"

def clean_str(sentence):
    """Return `sentence` with every run of '#' and '.' characters replaced by one space."""
    hash_or_dot_run = re.compile("[#.]+")
    return hash_or_dot_run.sub(" ", sentence)


def get_text_list(data_path, toy=False,clean=True):
    """Read a bz2-compressed text file and return its lines as a list of strings.

    Args:
        data_path: Path of the bz2 file; every line is decoded and stripped.
        toy: When true (and `clean` is true), only the first 20000 lines are used.
        clean: When true, each line is passed through `clean_str`. When false,
            `toy` is ignored and the uncleaned lines at indices 5000..9999 with a
            stride of 5 are returned.

    Returns:
        list[str]: The selected lines.
    """
    with bz2.open(data_path, "r") as handle:
        raw_lines = handle.readlines()

    if not clean:
        return [raw.decode().strip() for raw in raw_lines[5000:10000:5]]

    selected = raw_lines[:20000] if toy else raw_lines
    return [clean_str(raw.decode().strip()) for raw in tqdm(selected)]


def build_dict(step, toy=False,train_article_list=[],train_title_list=[]):
    """Load the cached vocabulary, or build and cache it from the training text.

    Args:
        step: "test" always loads the cached vocabulary file; "train" loads it if
            the file already exists and otherwise builds it from the given lists.
        toy: Unused by this function; kept for interface compatibility.
        train_article_list: Training abstracts (only read when building).
        train_title_list: Training titles (only read when building).
            Both list defaults are shared objects but are never mutated here.

    Returns:
        tuple: (word_dict, reversed_dict, article_max_len, summary_max_len) where
        word_dict maps token -> id, reversed_dict maps id -> token, and the two
        lengths are the fixed values 250 and 15.

    Raises:
        ValueError: If `step` is neither "test" nor "train" and no cached
            vocabulary exists (previously this surfaced as a NameError).
    """
    vocab_file = default_path + "word_dict.bz"

    if step == "test" or os.path.exists(vocab_file):
        # NOTE(security): unpickling can execute arbitrary code; only load a
        # vocabulary file that comes from a trusted source.
        with open(vocab_file, "rb") as f:
            word_dict = pickle.load(f,compression='bz2')

    elif step == "train":
        words = [word
                 for sentence in tqdm(train_article_list + train_title_list)
                 for word in word_tokenize(sentence)]

        word_counter = collections.Counter(words).most_common(500000)
        # Ids 0-3 are reserved for the special tokens; corpus words start at 4.
        word_dict = {"<padding>": 0, "<unk>": 1, "<s>": 2, "</s>": 3}
        for word_id, (word, _) in enumerate(tqdm(word_counter), start=4):
            word_dict[word] = word_id

        pickle.dump(word_dict, default_path+"word_dict",compression='bz2')

    else:
        raise ValueError(
            "build_dict: step must be 'test' or 'train' (got %r) when no cached "
            "vocabulary exists at %s" % (step, vocab_file))

    reversed_dict = dict(zip(word_dict.values(), word_dict.keys()))

    article_max_len = 250
    summary_max_len = 15

    return word_dict, reversed_dict, article_max_len, summary_max_len


def batch_iter(inputs, outputs, batch_size, num_epochs):
    """Yield aligned (inputs, outputs) mini-batches as NumPy arrays.

    The data is walked in order, `num_epochs` times; the last batch of an epoch
    may be shorter than `batch_size`.
    """
    input_arr = np.array(inputs)
    output_arr = np.array(outputs)
    total = len(input_arr)

    batches_per_epoch = (total - 1) // batch_size + 1
    for _ in range(num_epochs):
        for batch_idx in range(batches_per_epoch):
            lo = batch_idx * batch_size
            hi = min(lo + batch_size, total)
            yield input_arr[lo:hi], output_arr[lo:hi]


#### **Title Modification (OOV replacement and Grammar Check)**

In [None]:
# `language_check` is pip-installed in the setup cell but is not imported anywhere
# in the import cell; import it here so the names below (and `correct()`) resolve.
import language_check

# Shared LanguageTool instance for US English, used by `correct()`.
tool = language_check.LanguageTool('en-US')
# Smoothing function passed to `sentence_bleu` in `calculate_bleu()`.
smoothing = SmoothingFunction().method0

def get_unk_tokens(word_dict, article, max_tokens=250, top_k=5):
    """Rank the out-of-vocabulary tokens of `article` by positional weight.

    Only the first `max_tokens` tokens are inspected. Each occurrence of a token
    that is missing from `word_dict` adds `get_weight(position, n)` to that
    token's score, where `n` is the number of inspected tokens.

    Args:
        word_dict: Vocabulary mapping (token -> id); used for membership tests.
        article: Raw abstract text.
        max_tokens: How many leading tokens to inspect (default 250).
        top_k: Maximum number of candidates to return (default 5).

    Returns:
        list[tuple[float, str]]: Up to `top_k` (score, token) pairs, highest
        score first.
    """
    # `collections` is imported at the top of the notebook; the bare name
    # `defaultdict` used previously was never imported.
    unk = collections.defaultdict(float)
    tokens = word_tokenize(article)[:max_tokens]
    n = len(tokens)
    for i, token in enumerate(tokens):
        if token not in word_dict:
            unk[token] += get_weight(i, n)

    # Rank first, then truncate: slicing before sorting returned the first few
    # OOV tokens encountered instead of the highest-scoring ones.
    ranked = sorted(((score, token) for token, score in unk.items()), reverse=True)
    return ranked[:top_k]

def get_weight(index, token_len):
    """Return the weight for a token at `index` out of `token_len` tokens.

    The relative position index/token_len is bucketed: earlier positions get a
    larger weight (0.35 down to 0.05).
    """
    position = index/token_len
    # (inclusive upper bound of the relative position, weight)
    buckets = ((0.1, 0.35), (0.2, 0.3), (0.4, 0.2), (0.7, 0.1))
    for upper_bound, weight in buckets:
        if position <= upper_bound:
            return weight
    return 0.05

def correct(text):
    """Run `text` through the shared LanguageTool instance and apply its suggestions."""
    return language_check.correct(text, tool.check(text))

def update_title(word_dict,article, title):
    """Fill '<unk>' placeholders in `title` with OOV tokens from `article`.

    The candidates come from `get_unk_tokens`; the i-th '<unk>' (left to right)
    is replaced by the i-th candidate. The result is then passed through
    `correct` for grammar correction.

    Args:
        word_dict: Vocabulary mapping (token -> id).
        article: The abstract the title was generated from.
        title: Generated title, possibly containing '<unk>' placeholders.

    Returns:
        tuple: (updated_title, unk_list, replace_count) — the corrected title,
        the (score, token) candidates that were considered, and how many
        placeholders were replaced. `get_title` unpacks exactly these three
        values; returning only the title string made that unpacking fail.
    """
    unk_list = get_unk_tokens(word_dict, article)
    replace_count = min(title.count('<unk>'), len(unk_list))
    for _, token in unk_list[:replace_count]:
        title = title.replace('<unk>', token, 1)
    return correct(title), unk_list, replace_count

def calculate_bleu(title, reference):
    """Return the sentence-level BLEU score of `title` against a single `reference`."""
    hypothesis_tokens = word_tokenize(title)
    reference_token_lists = [word_tokenize(reference)]
    return sentence_bleu(
        reference_token_lists,
        hypothesis_tokens,
        smoothing_function=smoothing,
    )

#### **RNN Model Implementation**

In [None]:
class Model(object):
    """Sequence-to-sequence title generator built with TF1 `tf.contrib` ops.

    The graph consists of an embedding layer, a stacked bidirectional LSTM
    encoder and an LSTM decoder wrapped with Bahdanau attention. In training
    mode (`forward_only=False`) it exposes `logits`, `loss` and `update`; in
    inference mode (`forward_only=True`) it exposes `prediction`, produced by a
    beam-search decoder.
    """

    def __init__(self, reversed_dict, article_max_len, summary_max_len, args, forward_only=False):
        """Build the whole graph in the current default graph.

        Args:
            reversed_dict: id -> token mapping; its length is the vocabulary size.
            article_max_len: Fixed width of the encoder input placeholder.
            summary_max_len: Fixed width of the decoder placeholders and the
                maximum number of beam-search decoding steps.
            args: Object providing `embedding_size`, `num_hidden`, `num_layers`,
                `learning_rate`, `beam_width`, and (training only) `keep_prob`
                and `glove`.
            forward_only: True builds the inference (beam search) graph, False
                builds the training graph with loss and optimizer.
        """
        self.vocabulary_size = len(reversed_dict)
        self.embedding_size = args.embedding_size
        self.num_hidden = args.num_hidden
        self.num_layers = args.num_layers
        self.learning_rate = args.learning_rate
        self.beam_width = args.beam_width
        if not forward_only:
            self.keep_prob = args.keep_prob
        else:
            self.keep_prob = 1.0
        self.cell = tf.nn.rnn_cell.LSTMCell

        # Output projection onto the vocabulary; shared by the training logits and
        # the beam-search decoder's `output_layer`.
        with tf.variable_scope("decoder/projection"):
            self.projection_layer = tf.layers.Dense(self.vocabulary_size, use_bias=False)

        # Inputs fed at run time: token-id matrices plus their true lengths.
        self.batch_size = tf.placeholder(tf.int32, (), name="batch_size")
        self.X = tf.placeholder(tf.int32, [None, article_max_len])
        self.X_len = tf.placeholder(tf.int32, [None])
        self.decoder_input = tf.placeholder(tf.int32, [None, summary_max_len])
        self.decoder_len = tf.placeholder(tf.int32, [None])
        self.decoder_target = tf.placeholder(tf.int32, [None, summary_max_len])
        self.global_step = tf.Variable(0, trainable=False)

        with tf.name_scope("embedding"):
            if not forward_only and args.glove:
                # NOTE(review): `get_init_embedding` is not defined in the visible part
                # of this notebook; it is only reached when training with args.glove.
                init_embeddings = tf.constant(get_init_embedding(reversed_dict, self.embedding_size), dtype=tf.float32)
            else:
                init_embeddings = tf.random_uniform([self.vocabulary_size, self.embedding_size], -1.0, 1.0)
            self.embeddings = tf.get_variable("embeddings", initializer=init_embeddings)
            # Swap the first two axes so the RNN ops below can run with time_major=True.
            self.encoder_emb_inp = tf.transpose(tf.nn.embedding_lookup(self.embeddings, self.X), perm=[1, 0, 2])
            self.decoder_emb_inp = tf.transpose(tf.nn.embedding_lookup(self.embeddings, self.decoder_input), perm=[1, 0, 2])

        with tf.name_scope("encoder"):
            fw_cells = [self.cell(self.num_hidden) for _ in range(self.num_layers)]
            bw_cells = [self.cell(self.num_hidden) for _ in range(self.num_layers)]
            # NOTE(review): `self.keep_prob` is never passed to DropoutWrapper, so the
            # wrappers use the library's default keep probabilities — confirm whether
            # dropout was meant to be applied during training.
            fw_cells = [rnn.DropoutWrapper(cell) for cell in fw_cells]
            bw_cells = [rnn.DropoutWrapper(cell) for cell in bw_cells]

            encoder_outputs, encoder_state_fw, encoder_state_bw = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
                fw_cells, bw_cells, self.encoder_emb_inp,
                sequence_length=self.X_len, time_major=True, dtype=tf.float32)
            self.encoder_output = tf.concat(encoder_outputs, 2)
            # The decoder's initial state concatenates the forward and backward states
            # taken at index [0] of the per-layer state lists.
            encoder_state_c = tf.concat((encoder_state_fw[0].c, encoder_state_bw[0].c), 1)
            encoder_state_h = tf.concat((encoder_state_fw[0].h, encoder_state_bw[0].h), 1)
            self.encoder_state = rnn.LSTMStateTuple(c=encoder_state_c, h=encoder_state_h)

        with tf.name_scope("decoder"), tf.variable_scope("decoder") as decoder_scope:
            # Twice the encoder width, matching the concatenated fw/bw encoder state.
            decoder_cell = self.cell(self.num_hidden * 2)

            if not forward_only:
                # Training: teacher forcing via TrainingHelper over the gold decoder input.
                attention_states = tf.transpose(self.encoder_output, [1, 0, 2])
                attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                    self.num_hidden * 2, attention_states, memory_sequence_length=self.X_len, normalize=True)
                decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism,
                                                                   attention_layer_size=self.num_hidden * 2)
                initial_state = decoder_cell.zero_state(dtype=tf.float32, batch_size=self.batch_size)
                initial_state = initial_state.clone(cell_state=self.encoder_state)
                helper = tf.contrib.seq2seq.TrainingHelper(self.decoder_emb_inp, self.decoder_len, time_major=True)
                decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, helper, initial_state)
                outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, output_time_major=True, scope=decoder_scope)
                self.decoder_output = outputs.rnn_output
                self.logits = tf.transpose(
                    self.projection_layer(self.decoder_output), perm=[1, 0, 2])
                # Pad the logits with zeros along axis 1 up to summary_max_len so they
                # line up with the fixed-width `decoder_target` placeholder.
                self.logits_reshape = tf.concat(
                    [self.logits, tf.zeros([self.batch_size, summary_max_len - tf.shape(self.logits)[1], self.vocabulary_size])], axis=1)
            else:
                # Inference: encoder outputs, state and lengths are tiled beam_width
                # times before being handed to the beam-search decoder.
                tiled_encoder_output = tf.contrib.seq2seq.tile_batch(
                    tf.transpose(self.encoder_output, perm=[1, 0, 2]), multiplier=self.beam_width)
                tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(self.encoder_state, multiplier=self.beam_width)
                tiled_seq_len = tf.contrib.seq2seq.tile_batch(self.X_len, multiplier=self.beam_width)
                attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                    self.num_hidden * 2, tiled_encoder_output, memory_sequence_length=tiled_seq_len, normalize=True)
                decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism,
                                                                   attention_layer_size=self.num_hidden * 2)
                initial_state = decoder_cell.zero_state(dtype=tf.float32, batch_size=self.batch_size * self.beam_width)
                initial_state = initial_state.clone(cell_state=tiled_encoder_final_state)
                # Token ids 2 and 3 are the ids `build_dict` assigns to "<s>" and "</s>".
                decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                    cell=decoder_cell,
                    embedding=self.embeddings,
                    start_tokens=tf.fill([self.batch_size], tf.constant(2)),
                    end_token=tf.constant(3),
                    initial_state=initial_state,
                    beam_width=self.beam_width,
                    output_layer=self.projection_layer
                )
                outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder, output_time_major=True, maximum_iterations=summary_max_len, scope=decoder_scope)
                # Axes are reordered so that `generate_title` can pick a beam with
                # prediction[:, 0, :].
                self.prediction = tf.transpose(outputs.predicted_ids, perm=[1, 2, 0])

        with tf.name_scope("loss"):
            if not forward_only:
                crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.logits_reshape, labels=self.decoder_target)
                # Mask out positions beyond each target's length, then sum and divide
                # by the batch size.
                weights = tf.sequence_mask(self.decoder_len, summary_max_len, dtype=tf.float32)
                self.loss = tf.reduce_sum(crossent * weights / tf.cast(self.batch_size,tf.float32))

                # Adam with gradients clipped to a global norm of 5.0.
                params = tf.trainable_variables()
                gradients = tf.gradients(self.loss, params)
                clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
                optimizer = tf.train.AdamOptimizer(self.learning_rate)
                self.update = optimizer.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step)

#### **Cell for Title Generation**

In [None]:
class args:
    """Hyper-parameters and run options, read as plain class attributes."""

    # Network shape and decoding
    num_hidden = 200
    num_layers = 3
    beam_width = 10
    embedding_size = 300
    glove = True

    # Optimisation
    learning_rate = 1e-3
    batch_size = 64
    num_epochs = 5
    keep_prob = 0.8

    # Run options
    toy = True
    with_model = "store_true"

# Load the cached vocabulary ("test" mode always reads the pickled word_dict)
# together with the fixed article/summary lengths.
word_dict, reversed_dict, article_max_len, summary_max_len = build_dict("test", args.toy)
# Test-set abstracts and titles (cleaned with `clean_str`); the web app's
# `random_sample` route draws matching entries from these two lists.
abstracts = get_text_list(test_article_path)
titles = get_text_list(test_title_path)
    
def generate_title(article):
    """Generate a title for one abstract with the saved seq2seq model.

    The abstract is cleaned, tokenized, mapped to vocabulary ids (unknown words
    become "<unk>"), truncated/padded to `article_max_len`, and decoded with
    beam search. The first beam is turned back into words, stopping at "</s>"
    and skipping words that already appear in the title.

    Note: the graph is rebuilt and the checkpoint restored on every call.

    Args:
        article: Raw abstract text.

    Returns:
        str: The generated title (may contain "<unk>" placeholders).

    Raises:
        FileNotFoundError: If no checkpoint is found under
            `default_path + "saved_model/"` (previously an opaque AttributeError).
    """
    unk_id = word_dict["<unk>"]
    pad_id = word_dict["<padding>"]

    # Encode the single article as a fixed-width row of token ids.
    tokens = word_tokenize(clean_str(article))
    token_ids = [word_dict.get(tok, unk_id) for tok in tokens][:article_max_len]
    padded_ids = token_ids + [pad_id] * (article_max_len - len(token_ids))
    # Length fed to the encoder = number of non-padding ids in the row.
    article_len = sum(1 for tok_id in padded_ids if tok_id != pad_id)

    tf.reset_default_graph()
    model = Model(reversed_dict, article_max_len, summary_max_len, args, forward_only=True)
    saver = tf.train.Saver(tf.global_variables())
    checkpoint_dir = default_path + "saved_model/"
    ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
    if ckpt is None or not ckpt.model_checkpoint_path:
        raise FileNotFoundError("No model checkpoint found in " + checkpoint_dir)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        with tf.Session() as sess:
            saver.restore(sess, ckpt.model_checkpoint_path)
            test_feed_dict = {
                model.batch_size: 1,
                model.X: [padded_ids],
                model.X_len: [article_len],
            }
            prediction = sess.run(model.prediction, feed_dict=test_feed_dict)

    # prediction is indexed [article, beam, step]; take beam 0 of the only article.
    summary = []
    for token_id in prediction[0, 0, :]:
        word = reversed_dict[token_id]
        if word == "</s>":
            break
        if word not in summary:
            summary.append(word)
    return " ".join(summary)

def get_title(text):
    """Generate a title for `text` and post-process it.

    Args:
        text: Raw abstract; must contain at least 10 space characters.

    Returns:
        dict with keys:
            'text': the first 700 characters of the abstract followed by '...'
            'title': the raw generated title
            'updated_title': the title after OOV replacement and grammar check
            'unk_list': the OOV candidates returned by `update_title`
            'replace_count': the number of '<unk>' placeholders replaced

    Raises:
        ValueError: If the abstract is too short. ValueError is a subclass of
            Exception, so callers that caught the previous generic Exception
            keep working.
    """
    if text.count(' ')<10:
        raise ValueError("The length of the abstract is very short. Output will not be good")

    title = generate_title(clean_str(text))
    updated_title, unk_list, replace_count = update_title(word_dict, text, title)
    return {
        'text': text[:700]+'...',
        'title': title,
        'updated_title': updated_title,
        'unk_list': unk_list,
        'replace_count': replace_count,
    }


def evaluate_title(text,title):
    """Generate a title for `text` and score it against the reference `title`.

    Returns a dict with the truncated abstract ('text'), the reference title
    ('original'), the generated title ('generated') and its BLEU score ('bleu').
    """
    generated = get_title(text)['updated_title']
    return {
        'text': text[:700]+'...',
        'original': title,
        'generated': generated,
        'bleu': calculate_bleu(generated, title),
    }


In [1]:
#### **Cell for WebApp**

In [None]:
# Flask application; templates are looked up under 'drive/My Drive/templates'.
app = Flask(__name__,template_folder='drive/My Drive/templates')
# Index of the most recently served random sample (written by `random_sample`).
app.config['N'] = 0

# Session-signing key, needed for `flash`. A literal key committed in a notebook is
# effectively public, so read it from the FLASK_SECRET_KEY environment variable and
# fall back to a random per-process key when it is not set.
app.config['SECRET_KEY'] = os.environ.get('FLASK_SECRET_KEY') or os.urandom(32).hex()
run_with_ngrok(app)   #starts ngrok when the app is run


@app.route('/', methods=['GET', "POST"])
def home():
    """Home page: show the form (GET) or generate a title for the posted abstract.

    On POST the 'text' form field is passed to `get_title`. On success the page
    is rendered with the updated title, the OOV list, the replace count and the
    elapsed time; on failure a message is flashed and the user is redirected
    back to the empty form.
    """
    if request.method != 'POST':
        return render_template('home2.html')

    abstract = request.form['text']

    started = time.time()
    try:
        response = get_title(abstract)
    except Exception:
        # Keep the best-effort behaviour, but log the traceback so that real
        # failures (missing checkpoint, model errors) are not silently hidden
        # behind the "too small input" message.
        app.logger.exception("Title generation failed")
        response = None
    time_taken = str(round(time.time() - started, 3)) + " secs."

    if not response:
        flash('Too small Input for Title Generation','danger')
        return redirect(url_for("home"))

    result = {'time_taken': time_taken,
              'title': response['updated_title'],
              'unk-list': response['unk_list'],
              'replace-count': response['replace_count']}
    return render_template('home2.html', abstract=abstract, result=result)


@app.route("/clear")
def clear():
    """Serve the 'home2.html' page with no abstract or result filled in."""
    blank_page = render_template('home2.html')
    return blank_page


@app.route("/random_sample/<route>")
def random_sample(route):
    """Pick a random test abstract and pre-fill the requested page with it.

    The chosen index is stored in app.config['N']. When `route` is "evaluate"
    the evaluation page is rendered with the abstract and its reference title;
    any other value renders the home page with the abstract only.
    """
    sample_index = random.randrange(len(abstracts))
    app.config['N'] = sample_index
    sample_abstract = abstracts[sample_index]
    sample_title = titles[sample_index]

    if route != "evaluate":
        return render_template("home2.html", abstract=sample_abstract)
    return render_template("evaluate2.html", abstract=sample_abstract, input_title = sample_title)




@app.route("/evaluate", methods=['GET', "POST"])
def evaluate():
    """Evaluation page: generate a title and score it against the given one.

    GET redirects to a random evaluation sample. POST reads the abstract and the
    reference title from the form, runs `evaluate_title`, and renders the
    generated title with its BLEU score and the elapsed time. On failure a
    message is flashed and the user is redirected back to this route.
    """
    if request.method != 'POST':
        return redirect(url_for('random_sample', route="evaluate"))

    form = request.form
    # NOTE(review): 'asbtract' looks like a typo, but the template is not visible
    # here. The original field name is tried first and the correctly spelled one is
    # accepted as a fallback; a missing field still raises the usual 400 error.
    abstract = form['asbtract'] if 'asbtract' in form else form['abstract']
    input_title = form['title']

    started = time.time()
    try:
        response = evaluate_title(abstract,input_title)
    except Exception:
        # Best-effort as before, but record the traceback instead of hiding it.
        app.logger.exception("Title evaluation failed")
        response = None
    time_taken = str(round(time.time() - started, 3)) + " secs."
    app.logger.info("evaluate took %s", time_taken)

    if not response:
        flash('Too small Input for Title Generation','danger')
        return redirect(url_for("evaluate"))

    result = {'time_taken': time_taken, 'title': response['generated'], 'bleu': response['bleu']}
    return render_template('evaluate2.html', abstract=abstract, input_title= input_title, result=result)


if __name__ == '__main__':

    # TODO: initialise the model here; at the moment `generate_title` rebuilds the
    # graph and restores the checkpoint on every request.
    # Start the Flask server (`run_with_ngrok(app)` was applied to `app` above).
    app.run()

