In [1]:
!sudo apt-get install libpoppler-cpp-dev
!pip install pdftotext
!pip install pdfminer.six
!pip install -U textblob
!pip install multi_rake
!pip install rouge-score
!pip install transformers

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  libpoppler-cpp0v5
The following NEW packages will be installed:
  libpoppler-cpp-dev libpoppler-cpp0v5
0 upgraded, 2 newly installed, 0 to remove and 39 not upgraded.
Need to get 36.7 kB of archives.
After this operation, 188 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 libpoppler-cpp0v5 amd64 0.62.0-2ubuntu2.12 [28.0 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 libpoppler-cpp-dev amd64 0.62.0-2ubuntu2.12 [8,676 B]
Fetched 36.7 kB in 0s (90.1 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 2.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
deb

In [5]:
!mkdir GloVe
!curl -Lo GloVe/glove.840B.300d.zip http://nlp.stanford.edu/data/glove.840B.300d.zip
!unzip GloVe/glove.840B.300d.zip -d GloVe/

!mkdir encoder
!curl -Lo encoder/infersent1.pkl https://dl.fbaipublicfiles.com/infersent/infersent1.pkl

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0   315    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0   352    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 2075M  100 2075M    0     0  5145k      0  0:06:53  0:06:53 --:--:-- 5565k
Archive:  GloVe/glove.840B.300d.zip
  inflating: GloVe/glove.840B.300d.txt  
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  146M  100  146M    0     0  18.4M      0  0:00:07  0:00:07 --:--:-- 25.1M


In [6]:
#Facebook infersent official helper class
import numpy as np
import time
from transformers import pipeline
import torch
import torch.nn as nn
from multi_rake import Rake



class InferSent(nn.Module):

    def __init__(self, config):
        super(InferSent, self).__init__()
        self.bsize = config['bsize']
        self.word_emb_dim = config['word_emb_dim']
        self.enc_lstm_dim = config['enc_lstm_dim']
        self.pool_type = config['pool_type']
        self.dpout_model = config['dpout_model']
        self.version = 1 if 'version' not in config else config['version']

        self.enc_lstm = nn.LSTM(self.word_emb_dim, self.enc_lstm_dim, 1,
                                bidirectional=True, dropout=self.dpout_model)

        assert self.version in [1, 2]
        if self.version == 1:
            self.bos = '<s>'
            self.eos = '</s>'
            self.max_pad = True
            self.moses_tok = False
        elif self.version == 2:
            self.bos = '<p>'
            self.eos = '</p>'
            self.max_pad = False
            self.moses_tok = True

    def is_cuda(self):
        # either all weights are on cpu or they are on gpu
        return self.enc_lstm.bias_hh_l0.data.is_cuda

    def forward(self, sent_tuple):
        # sent_len: [max_len, ..., min_len] (bsize)
        # sent: (seqlen x bsize x worddim)
        sent, sent_len = sent_tuple

        # Sort by length (keep idx)
        sent_len_sorted, idx_sort = np.sort(sent_len)[::-1], np.argsort(-sent_len)
        sent_len_sorted = sent_len_sorted.copy()
        idx_unsort = np.argsort(idx_sort)

        idx_sort = torch.from_numpy(idx_sort).cuda() if self.is_cuda() \
            else torch.from_numpy(idx_sort)
        sent = sent.index_select(1, idx_sort)

        # Handling padding in Recurrent Networks
        sent_packed = nn.utils.rnn.pack_padded_sequence(sent, sent_len_sorted)
        sent_output = self.enc_lstm(sent_packed)[0]  # seqlen x batch x 2*nhid
        sent_output = nn.utils.rnn.pad_packed_sequence(sent_output)[0]

        # Un-sort by length
        idx_unsort = torch.from_numpy(idx_unsort).cuda() if self.is_cuda() \
            else torch.from_numpy(idx_unsort)
        sent_output = sent_output.index_select(1, idx_unsort)

        # Pooling
        if self.pool_type == "mean":
            sent_len = torch.FloatTensor(sent_len.copy()).unsqueeze(1).cuda()
            emb = torch.sum(sent_output, 0).squeeze(0)
            emb = emb / sent_len.expand_as(emb)
        elif self.pool_type == "max":
            if not self.max_pad:
                sent_output[sent_output == 0] = -1e9
            emb = torch.max(sent_output, 0)[0]
            if emb.ndimension() == 3:
                emb = emb.squeeze(0)
                assert emb.ndimension() == 2

        return emb

    def set_w2v_path(self, w2v_path):
        self.w2v_path = w2v_path

    def get_word_dict(self, sentences, tokenize=True):
        # create vocab of words
        word_dict = {}
        sentences = [s.split() if not tokenize else self.tokenize(s) for s in sentences]
        for sent in sentences:
            for word in sent:
                if word not in word_dict:
                    word_dict[word] = ''
        word_dict[self.bos] = ''
        word_dict[self.eos] = ''
        return word_dict

    def get_w2v(self, word_dict):
        assert hasattr(self, 'w2v_path'), 'w2v path not set'
        # create word_vec with w2v vectors
        word_vec = {}
        with open(self.w2v_path, encoding='utf-8') as f:
            for line in f:
                word, vec = line.split(' ', 1)
                if word in word_dict:
                    word_vec[word] = np.fromstring(vec, sep=' ')
        print('Found %s(/%s) words with w2v vectors' % (len(word_vec), len(word_dict)))
        return word_vec

    def get_w2v_k(self, K):
        assert hasattr(self, 'w2v_path'), 'w2v path not set'
        # create word_vec with k first w2v vectors
        k = 0
        word_vec = {}
        with open(self.w2v_path, encoding='utf-8') as f:
            for line in f:
                word, vec = line.split(' ', 1)
                if k <= K:
                    word_vec[word] = np.fromstring(vec, sep=' ')
                    k += 1
                if k > K:
                    if word in [self.bos, self.eos]:
                        word_vec[word] = np.fromstring(vec, sep=' ')

                if k > K and all([w in word_vec for w in [self.bos, self.eos]]):
                    break
        return word_vec

    def build_vocab(self, sentences, tokenize=True):
        assert hasattr(self, 'w2v_path'), 'w2v path not set'
        word_dict = self.get_word_dict(sentences, tokenize)
        self.word_vec = self.get_w2v(word_dict)
        print('Vocab size : %s' % (len(self.word_vec)))

    # build w2v vocab with k most frequent words
    def build_vocab_k_words(self, K):
        assert hasattr(self, 'w2v_path'), 'w2v path not set'
        self.word_vec = self.get_w2v_k(K)
        print('Vocab size : %s' % (K))

    def update_vocab(self, sentences, tokenize=True):
        assert hasattr(self, 'w2v_path'), 'warning : w2v path not set'
        assert hasattr(self, 'word_vec'), 'build_vocab before updating it'
        word_dict = self.get_word_dict(sentences, tokenize)

        # keep only new words
        for word in self.word_vec:
            if word in word_dict:
                del word_dict[word]

        # udpate vocabulary
        if word_dict:
            new_word_vec = self.get_w2v(word_dict)
            self.word_vec.update(new_word_vec)
        else:
            new_word_vec = []
        print('New vocab size : %s (added %s words)'% (len(self.word_vec), len(new_word_vec)))

    def get_batch(self, batch):
        # sent in batch in decreasing order of lengths
        # batch: (bsize, max_len, word_dim)
        embed = np.zeros((len(batch[0]), len(batch), self.word_emb_dim))

        for i in range(len(batch)):
            for j in range(len(batch[i])):
                embed[j, i, :] = self.word_vec[batch[i][j]]

        return torch.FloatTensor(embed)

    def tokenize(self, s):
        from nltk.tokenize import word_tokenize
        if self.moses_tok:
            s = ' '.join(word_tokenize(s))
            s = s.replace(" n't ", "n 't ")  # HACK to get ~MOSES tokenization
            return s.split()
        else:
            return word_tokenize(s)

    def prepare_samples(self, sentences, bsize, tokenize, verbose):
        sentences = [[self.bos] + s.split() + [self.eos] if not tokenize else
                     [self.bos] + self.tokenize(s) + [self.eos] for s in sentences]
        n_w = np.sum([len(x) for x in sentences])

        # filters words without w2v vectors
        for i in range(len(sentences)):
            s_f = [word for word in sentences[i] if word in self.word_vec]
            if not s_f:
                import warnings
                warnings.warn('No words in "%s" (idx=%s) have w2v vectors. \
                               Replacing by "</s>"..' % (sentences[i], i))
                s_f = [self.eos]
            sentences[i] = s_f

        lengths = np.array([len(s) for s in sentences])
        n_wk = np.sum(lengths)
        if verbose:
            print('Nb words kept : %s/%s (%.1f%s)' % (
                        n_wk, n_w, 100.0 * n_wk / n_w, '%'))

        # sort by decreasing length
        lengths, idx_sort = np.sort(lengths)[::-1], np.argsort(-lengths)
        sentences = np.array(sentences)[idx_sort]

        return sentences, lengths, idx_sort

    def encode(self, sentences, bsize=64, tokenize=True, verbose=False):
        tic = time.time()
        sentences, lengths, idx_sort = self.prepare_samples(
                        sentences, bsize, tokenize, verbose)

        embeddings = []
        for stidx in range(0, len(sentences), bsize):
            batch = self.get_batch(sentences[stidx:stidx + bsize])
            if self.is_cuda():
                batch = batch.cuda()
            with torch.no_grad():
                batch = self.forward((batch, lengths[stidx:stidx + bsize])).data.cpu().numpy()
            embeddings.append(batch)
        embeddings = np.vstack(embeddings)

        # unsort
        idx_unsort = np.argsort(idx_sort)
        embeddings = embeddings[idx_unsort]

        if verbose:
            print('Speed : %.1f sentences/s (%s mode, bsize=%s)' % (
                    len(embeddings)/(time.time()-tic),
                    'gpu' if self.is_cuda() else 'cpu', bsize))
        return embeddings

    def visualize(self, sent, tokenize=True):

        sent = sent.split() if not tokenize else self.tokenize(sent)
        sent = [[self.bos] + [word for word in sent if word in self.word_vec] + [self.eos]]

        if ' '.join(sent[0]) == '%s %s' % (self.bos, self.eos):
            import warnings
            warnings.warn('No words in "%s" have w2v vectors. Replacing \
                           by "%s %s"..' % (sent, self.bos, self.eos))
        batch = self.get_batch(sent)

        if self.is_cuda():
            batch = batch.cuda()
        output = self.enc_lstm(batch)[0]
        output, idxs = torch.max(output, 0)
        # output, idxs = output.squeeze(), idxs.squeeze()
        idxs = idxs.data.cpu().numpy()
        argmaxs = [np.sum((idxs == k)) for k in range(len(sent[0]))]

        # visualize model
        import matplotlib.pyplot as plt
        x = range(len(sent[0]))
        y = [100.0 * n / np.sum(argmaxs) for n in argmaxs]
        plt.xticks(x, sent[0], rotation=45)
        plt.bar(x, y)
        plt.ylabel('%')
        plt.title('Visualisation of words importance')
        plt.show()

        return output, idxs

In [8]:
from nltk.corpus import stopwords
import re
from collections import Counter
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')
nltk.download('punkt')

def clean(s):
    lemmatizer = nltk.wordnet.WordNetLemmatizer()
    stop_words = stopwords.words('english')
    words = set(nltk.corpus.words.words())
    stopwords_dict = Counter(stop_words)

    s = re.sub(r"http\S+", '', s)
    s = re.sub('\n', ' ', s)
    s = re.sub("[\(\[].*?[\)\]]", "", s)
    s = re.sub(r'.*?(?=Abstract)','',s,1)
    s = re.sub("Conclusion.*", "", s)
    s = s.replace('- ', '')
    s = s.replace('– ', ' ')
    s = re.sub("\s\s+" , " ", s)
    s = ' '.join([word for word in s.split() if word not in stopwords_dict])
    s = " ".join([lemmatizer.lemmatize(w) for w in s.split(' ')]).strip()
    s = " ".join(w for w in nltk.wordpunct_tokenize(s) if w.lower() in words or not w.isalpha() or len(w.lower()) > 2)

    return s

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [7]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

def convert_pdf_to_txt(path):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos=set()

    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
        interpreter.process_page(page)

    text = retstr.getvalue()

    fp.close()
    device.close()
    retstr.close()
    return text

In [9]:
x = convert_pdf_to_txt("data.pdf")
text_str = clean(x)
print(text_str)

Abstract In paper , propose novel neural network model called RNN Encoder Decoder consists two recurrent neural network . One RNN encodes sequence symbol ﬁxedlength vector representation , decodes representation another sequence symbols . The encoder decoder proposed model jointly trained maximize conditional probability target sequence given source sequence . The performance statistical machine translation system empirically found improve using conditional probability phrase pair computed RNN Encoder – Decoder additional feature existing log - linear model . Qualitatively , show proposed model learns semantically syntactically meaningful representation linguistic phrases . 1 Introduction Deep neural network shown great success various application objection recognition ) speech recognition ). Furthermore , many recent work showed neural network successfully used number task natural language processing . These include , limited to , language modeling , paraphrase detection word embeddin

In [20]:
from textblob import TextBlob
blob_object = TextBlob(text_str)
sent = list(blob_object.sentences)
print(sent)
print(len(sent))

[Sentence("Abstract In paper , propose novel neural network model called RNN Encoder Decoder consists two recurrent neural network ."), Sentence("One RNN encodes sequence symbol ﬁxedlength vector representation , decodes representation another sequence symbols ."), Sentence("The encoder decoder proposed model jointly trained maximize conditional probability target sequence given source sequence ."), Sentence("The performance statistical machine translation system empirically found improve using conditional probability phrase pair computed RNN Encoder – Decoder additional feature existing log - linear model ."), Sentence("Qualitatively , show proposed model learns semantically syntactically meaningful representation linguistic phrases ."), Sentence("1 Introduction Deep neural network shown great success various application objection recognition ) speech recognition )."), Sentence("Furthermore , many recent work showed neural network successfully used number task natural language process

In [21]:
import re
import numpy as np
from nltk import sent_tokenize, word_tokenize
from nltk.cluster.util import cosine_distance

MULTIPLE_WHITESPACE_PATTERN = re.compile(r"\s+", re.UNICODE)

def normalize_whitespace(text):
    return MULTIPLE_WHITESPACE_PATTERN.sub(_replace_whitespace, text)


def _replace_whitespace(match):
    text = match.group()
    if "\n" in text or "\r" in text:
        return "\n"
    else:
        return " "


def is_blank(string):
    return not string or string.isspace()


def get_symmetric_matrix(matrix):
    return matrix + matrix.T - np.diag(matrix.diagonal())


def core_cosine_similarity(vector1, vector2):
    return 1 - cosine_distance(vector1, vector2)


class TextRank4Sentences():
    def __init__(self):
        self.damping = 0.85 
        self.min_diff = 1e-5
        self.steps = 100 
        self.text_str = None
        self.sentences = None
        self.pr_vector = None

    def _sentence_similarity(self, sent1, sent2, stopwords=None):
        if stopwords is None:
            stopwords = []

        sent1 = [w.lower() for w in sent1]
        sent2 = [w.lower() for w in sent2]

        all_words = list(set(sent1 + sent2))

        vector1 = [0] * len(all_words)
        vector2 = [0] * len(all_words)

        for w in sent1:
            if w in stopwords:
                continue
            vector1[all_words.index(w)] += 1

        for w in sent2:
            if w in stopwords:
                continue
            vector2[all_words.index(w)] += 1

        return core_cosine_similarity(vector1, vector2)

    def _build_similarity_matrix(self, sentences, stopwords=None):
        sm = np.zeros([len(sentences), len(sentences)])

        for idx1 in range(len(sentences)):
            for idx2 in range(len(sentences)):
                if idx1 == idx2:
                    continue

                sm[idx1][idx2] = self._sentence_similarity(sentences[idx1], sentences[idx2], stopwords=stopwords)

        sm = get_symmetric_matrix(sm)

        norm = np.sum(sm, axis=0)
        sm_norm = np.divide(sm, norm, where=norm != 0)

        return sm_norm

    def _run_page_rank(self, similarity_matrix):

        pr_vector = np.array([1] * len(similarity_matrix))

        previous_pr = 0
        for epoch in range(self.steps):
            pr_vector = (1 - self.damping) + self.damping * np.matmul(similarity_matrix, pr_vector)
            if abs(previous_pr - sum(pr_vector)) < self.min_diff:
                break
            else:
                previous_pr = sum(pr_vector)

        return pr_vector

    def _get_sentence(self, index):

        try:
            return self.sentences[index]
        except IndexError:
            return ""

    def get_top_sentences(self, number=5):

        top_sentences = []

        if self.pr_vector is not None:

            sorted_pr = np.argsort(self.pr_vector)
            sorted_pr = list(sorted_pr)
            sorted_pr.reverse()

            index = 0
            for epoch in range(number):
                sent = self.sentences[sorted_pr[index]]
                sent = normalize_whitespace(sent)
                top_sentences.append(sent)
                index += 1

        return top_sentences

    def analyze(self, text, stop_words=None):
        self.text_str = text
        self.sentences = sent_tokenize(self.text_str)

        tokenized_sentences = [word_tokenize(sent) for sent in self.sentences]

        similarity_matrix = self._build_similarity_matrix(tokenized_sentences, stop_words)

        self.pr_vector = self._run_page_rank(similarity_matrix)


tr4sh = TextRank4Sentences()
tr4sh.analyze(text_str)
print(len(sent))
sentences = tr4sh.get_top_sentences(len(sent))
print(len(sentences))

280
280


In [22]:
model_version = 1
MODEL_PATH = "encoder/infersent%s.pkl" % model_version
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
model = InferSent(params_model)
model.load_state_dict(torch.load(MODEL_PATH))
use_cuda = True
model = model.cuda() if use_cuda else model
W2V_PATH = 'GloVe/glove.840B.300d.txt' if model_version == 1 else 'fastText/crawl-300d-2M.vec'
model.set_w2v_path(W2V_PATH)
model.build_vocab_k_words(K=100000)

Vocab size : 100000


In [23]:
imp = []
pow = ["accordingly", "furthermore", "moreover", "similarly", "also", "hence", "namely", "still", "anyway", "however", "nevertheless", "then", "besides", "incidentally", "next", "thereafter", "certainly", "indeed", "nonetheless", "therefore", "consequently", "instead", "now", "thus", "finally", "likewise", "otherwise", "undoubtedly", "further", "meanwhile"]

for i in pow:
  for j in sentences:
    if i in j:
      imp.append(j)

In [24]:
print(imp)
print(len(imp))

['For source phrase , generated 50 sample show top - ﬁve phrase accordingly scores .', '2 These probability considered additional feature log - linear model ) weighted accordingly maximize BLEU score .', 'similarly .', 'Interestingly , many phrase pair scored similarly translation model RNN Encoder – Decoder , many phrase pair scored radically different .', 'p similarly , conditional distribution next symbol P = g , − 1 , c .', 'This act similarly memory cell LSTM network help RNN remember longterm information .', '2 Neural Language Model In order ass effectiveness scoring phrase pair proposed RNN Encoder Decoder , also tried traditional approach using neural network learning target language model .', 'Since proposed RNN Encoder – Decoder also project map back sequence word continuous space vector , expect see similar property proposed model well .', '1 , also conditioned − 1 summary c input sequence .', '3 Hidden Unit Adaptively Remembers Forgets In addition novel model architecture ,

In [25]:
embeddings = model.encode(imp, bsize=128, tokenize=False, verbose=True)
print('nb sentences encoded : {0}'.format(len(embeddings)))

Nb words kept : 517/548 (94.3%)
Speed : 211.5 sentences/s (gpu mode, bsize=128)
nb sentences encoded : 27




In [26]:
def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

In [42]:
research_problem = {}
research_kw =[]
research_stmt = []
for i in sentences:
  research_problem[i] = round(cosine(model.encode(i)[0], model.encode(['research problem issue'])[0]), 2)

sorted_x = dict(sorted(research_problem.items(), key=lambda kv: kv[1]))
for i, j in sorted_x.items():
  if j > list(set(sorted_x.values()))[-3]:
    rake = Rake()
    keywords = rake.apply(i)
    research_kw.append(keywords[:3])
    research_stmt.append(i)

print(research_kw)
print(research_stmt)



[[], [('source phrase', 4.0), ('rnn encoder', 4.0), ('case', 1.0)], [('cases', 1.0)], [('output decoder', 4.0), ('starting input', 4.0), ('case', 1.0)], [('interestingly', 1.0), ('decoder', 1.0)], [], [('case', 1.0)], [('importantly', 1.0)], [('rescore translation hypothesis', 9.0), ('neural network', 4.0), ('cases', 1.0)], [('task', 1.0)], [('shortlisted word', 4.0), ('paper', 1.0)], [], [('unit gating units', 9.0), ('preliminary experiments', 4.0), ('found crucial', 4.0)], [('based neural network', 9.0), ('size input ﬁxed', 8.5), ('size output', 4.5)], []]
['?', 'In case , given source phrase , RNN Encoder – Decoder need generate list target phrases .', 'In cases , choice target phrase RNN Encoder – Decoder closer actual literal translations .', 'In case , output decoder , starting input , differentiable , use gradient - based algorithm estimate model parameters .', 'Interestingly , many phrase pair scored similarly translation model RNN Encoder – Decoder , many phrase pair scored ra

In [43]:
apprch = {}
apprch_kw = []
approach_stmt = []
for i in sentences:
  apprch[i] = round(cosine(model.encode(i)[0], model.encode(['used approach tasks steps algorithm method procedure technique process style'])[0]), 2)

sorted_x = dict(sorted(apprch.items(), key=lambda kv: kv[1]))
for i, j in sorted_x.items():
  if j > list(set(sorted_x.values()))[-3]:
        rake = Rake()
        keywords = rake.apply(i)
        apprch_kw.append(keywords[:2])
        approach_stmt.append(i)

print(apprch_kw)
print(approach_stmt)



[[('01 0', 0)], [('recurrent weight parameters', 9.0)], [], [('learned distribution', 4.0), ('straightforward sample', 4.0)], []]
['01 0 .', '01 , except recurrent weight parameters .', '01 , model trained validation perplexity improve 10 epochs .', '= f − 1 , − 1 , c , From learned distribution , straightforward sample new sequence iteratively sampling symbol time step .', 'x − 1 input previous hidden state , respectively .']


In [44]:
expt = {}
expt_kw = []
expt_stmt = []
for i in sentences:
  expt[i] = round(cosine(model.encode(i)[0], model.encode(["experimental setup test investigation examination experimentation testing"])[0]), 2)

sorted_x = dict(sorted(expt.items(), key=lambda kv: kv[1]))
for i, j in sorted_x.items():
  if j > list(set(sorted_x.values()))[-3]:
        rake = Rake()
        keywords = rake.apply(i)
        expt_kw.append(keywords[:2])
        expt_stmt.append(i)

print(expt_kw)
print(expt_stmt)



[[('01 0', 0)], [('recurrent weight parameters', 9.0)], [], [('source phrase', 4.0), ('rnn encoder', 4.0)], [('cases', 1.0)], [('output decoder', 4.0), ('starting input', 4.0)], [('interestingly', 1.0), ('decoder', 1.0)], [], [('case', 1.0)], [('importantly', 1.0)], [('rescore translation hypothesis', 9.0), ('neural network', 4.0)], [('task', 1.0)], [('shortlisted word', 4.0), ('paper', 1.0)], [], [('unit gating units', 9.0), ('preliminary experiments', 4.0)], [('based neural network', 9.0), ('size input ﬁxed', 8.5)], []]
['01 0 .', '01 , except recurrent weight parameters .', '01 , model trained validation perplexity improve 10 epochs .', 'In case , given source phrase , RNN Encoder – Decoder need generate list target phrases .', 'In cases , choice target phrase RNN Encoder – Decoder closer actual literal translations .', 'In case , output decoder , starting input , differentiable , use gradient - based algorithm estimate model parameters .', 'Interestingly , many phrase pair scored s

In [45]:
res = {}
res_kw = []
result_stmt = []
for i in sentences:
  res[i] = round(cosine(model.encode(i)[0], model.encode(["result output consequence outcome conclusion product solution decision opinion findings answer solution"])[0]), 2),

sorted_x = dict(sorted(res.items(), key=lambda kv: kv[1]))
for i, j in sorted_x.items():
  if j > list(set(sorted_x.values()))[-3]:
        rake = Rake()
        keywords = rake.apply(i)
        res_kw.append(keywords[:2])
        result_stmt.append(i)

print(res_kw)
print(result_stmt)



[[('source phrase', 4.0), ('rnn encoder', 4.0)], [('cases', 1.0)], [('output decoder', 4.0), ('starting input', 4.0)], [('interestingly', 1.0), ('decoder', 1.0)], [], [('case', 1.0)], [('importantly', 1.0)], [('rescore translation hypothesis', 9.0), ('neural network', 4.0)], [('task', 1.0)], [('shortlisted word', 4.0), ('paper', 1.0)], [], [('unit gating units', 9.0), ('preliminary experiments', 4.0)], [('based neural network', 9.0), ('size input ﬁxed', 8.5)], [], [('01 0', 0)], [('recurrent weight parameters', 9.0)], []]
['In case , given source phrase , RNN Encoder – Decoder need generate list target phrases .', 'In cases , choice target phrase RNN Encoder – Decoder closer actual literal translations .', 'In case , output decoder , starting input , differentiable , use gradient - based algorithm estimate model parameters .', 'Interestingly , many phrase pair scored similarly translation model RNN Encoder – Decoder , many phrase pair scored radically different .', 'In , feedforward ne

In [46]:
tex = "".join(research_stmt) + "".join(approach_stmt) + "".join(expt_stmt) + "".join(result_stmt)
print(tex)

?In case , given source phrase , RNN Encoder – Decoder need generate list target phrases .In cases , choice target phrase RNN Encoder – Decoder closer actual literal translations .In case , output decoder , starting input , differentiable , use gradient - based algorithm estimate model parameters .Interestingly , many phrase pair scored similarly translation model RNN Encoder – Decoder , many phrase pair scored radically different .In , feedforward neural network trained learn mapping bag - of - words representation input phrase output phrase .In case , output timestep conditional distribution p .Importantly , generated phrase overlap completely target phrase phrase table .In many cases , neural network used rescore translation hypothesis ).Instead , one focus relevant subset data given task .It possible address issue backing existing model contain non - shortlisted word ) In paper , however , opt introducing word penalty instead , counteracts word probability overestimation .In ﬁeld s

In [53]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
summarizer = pipeline("summarization")
summary_text = summarizer(tex, max_length=int(len(tex.split(" "))*0.8), min_length=int(len(tex.split(" "))*0.4), do_sample=False)[0]['summary_text']
print(summary_text)

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 (https://huggingface.co/sshleifer/distilbart-cnn-12-6)


 Many phrase pair scored similarly translation model RNN Encoder – Decoder – Decoder . In paper, however, opt introducing word penalty instead, counteracts word probability overestimation . In ﬁeld statistical machine machine machine translation , deep neural network begun show promising results . In study, opt introduces word penalty, instead of word penalty to counteract word probability overrearability . In research paper, opt to introduce word penalty for translation model . In report, many phrase pairs scored similar translation model, many phrases scored radically different . It is possible that the existing model contains non - shortlisted word (non - shortlisted word) In paper , however , opt to replace word penalty with word word for translation .


In [52]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
abstract = "In this paper, we propose a novel neural network model called RNN Encoder–Decoder that consists of two recurrent neural networks (RNN). One RNN encodes a sequence of symbols into a fixedlength vector representation, and the other decodes the representation into another sequence of symbols. The encoder and decoder of the proposed model are jointly trained to maximize the conditional probability of a target sequence given a source sequence. The performance of a statistical machine translation system is empirically found to improve by using the conditional probabilities of phrase pairs computed by the RNN Encoder–Decoder as an additional feature in the existing log-linear model. Qualitatively, we show that the proposed model learns a semantically and syntactically meaningful representation of linguistic phrases." 
scores = scorer.score(abstract, summary_text)
print(scores)


{'rouge1': Score(precision=0.23529411764705882, recall=0.39097744360902253, fmeasure=0.29378531073446323), 'rouge2': Score(precision=0.05454545454545454, recall=0.09090909090909091, fmeasure=0.06818181818181819), 'rougeL': Score(precision=0.12217194570135746, recall=0.20300751879699247, fmeasure=0.15254237288135591)}
