In [1]:
# Pre-filtered input for the graph: one (position, word, POS tag) tuple per kept token.
# Only adjective (JJ, JJS) and noun (NN, NNS) tags occur here, and the positions have
# gaps wherever tokens were dropped (presumably word offsets in the source text -- TODO confirm).
# NOTE(review): 'relies' and 'differs' carry the NNS tag -- likely tagger errors; verify upstream.
tokens = [(3, 'new', 'JJ'), (5, 'ambitious', 'JJ'), (6, 'framework', 'NN'), (8, 'abstractive', 'JJ'), (9, 'summarization', 'NN'), (16, 'content', 'NN'), (19, 'summary', 'JJ'), (22, 'sentences', 'NNS'), (27, 'abstract', 'JJ'), (28, 'representation', 'NN'), (31, 
'source', 'NN'), (32, 'documents', 'NNS'), (35, 'abstract', 'JJ'), (36, 'representation', 'NN'), (37, 'relies', 'NNS'), (40, 'concept', 'NN'), (42, 'information', 'NN'), (43, 'items', 'NNS'), (45, 'init', 'NN'), (53, 'smallest', 'JJS'), (54, 'element', 'NN'), (56, 'coherent', 'JJ'), (57, 'information', 'NN'), (60, 'text', 'NN'), (63, 
'sentence', 'NN'), (66, 'framework', 'NN'), (67, 'differs', 'NNS'), (69, 'previous', 'JJ'), (70, 'abstractive', 'JJ'), (71, 'summarization', 'NN'), (72, 'models', 'NNS'), (76, 'semantic', 'JJ'), (77, 'analysis', 'NN'), (80, 'text', 'NN'), (85, 'first', 'JJ'), (86, 'attempt', 'NN'), (91, 'system', 'NN'), (94, 'framework', 'NN'), (98, 'evaluation', 'NN'), (99, 'results', 'NNS'), (103, 'tac', 'JJ'), (108, 'related', 'JJ'), (109, 'work', 'NN'), (118, 'automatic', 'JJ'), (119, 'summarization', 'NN'), (120, 'domain', 'NN')]

In [2]:
from collections import Counter

In [3]:
# Part-of-speech tag families, one list per coarse word class.
# The tag strings look like Penn Treebank tags -- TODO confirm against the
# tagger that produced the token tuples.
NOUN_GROUP = [
    'NN', 'NNS',
    'NNP', 'NNPS',
]
PRONOUN_GROUP = [
    'PRP', 'PRP$',
    'WP', 'WP$',
]
VERB_GROUP = [
    'VB', 'VBD', 'VBG',
    'VBN', 'VBP', 'VBZ',
]
ADJECTIVE_GROUP = [
    'JJ', 'JJR', 'JJS',
]
ADVERB_GROUP = [
    'RB', 'RBR', 'RBS',
    'WRB',
]
# 'IN' is deliberately listed under both prepositions and conjunctions.
PREPOSITION_GROUP = [
    'IN',
]
CONJUNCTION_GROUP = [
    'CC', 'IN',
]
INTERJECTION_GROUP = [
    'UH',
]

In [4]:
class LexicalGraph(object):
    """Undirected word co-occurrence graph scored with a TextRank-style iteration.

    Attributes:
        total_cnt: number of tokens passed to the constructor.
        unique_cnt: number of distinct words among those tokens.
        T: ``unique_cnt // 3``; callers use it as the number of top words to keep.
        conversion: maps node index -> word once construction has finished.
        V: maps node index -> current score (every node starts at 1).
        E: maps node index -> list of neighbouring node indices.
        jump_factor: damping factor used by ``score_of`` (0.85).
        threshold: score change below which a node is flagged as settled (0.0001).
    """

    def __init__(self, filtered_tokens, N):
        """Build the graph from ``filtered_tokens`` using a window of size ``N``.

        Args:
            filtered_tokens: indexable sequence of tuples whose element 0 is a
                numeric position and element 1 is the word; further elements
                are ignored here.
            N: window size. A token is linked to each of the next ``N`` tokens
                in the sequence whose position is at most ``N`` greater than
                its own.
        """
        self.total_cnt = len(filtered_tokens)
        # dict.fromkeys de-duplicates while keeping first-seen order, so node
        # indices follow the order in which words first appear.
        unique_words = list(dict.fromkeys(t[1] for t in filtered_tokens))
        self.unique_cnt = len(unique_words)
        self.T = self.unique_cnt // 3
        word_to_idx = {word: idx for idx, word in enumerate(unique_words)}
        self.V = {idx: 1 for idx in range(self.unique_cnt)}
        self.E = {idx: [] for idx in range(self.unique_cnt)}
        self.jump_factor = 0.85
        self.threshold = 0.0001

        for i, token in enumerate(filtered_tokens):
            idx = word_to_idx[token[1]]
            last = min(i + N, self.total_cnt - 1)
            for k in range(i + 1, last + 1):
                next_token = filtered_tokens[k]
                # Positions only grow further apart from here on, so the first
                # token outside the window ends the scan.
                if token[0] + N < next_token[0]:
                    break
                next_idx = word_to_idx[next_token[1]]
                # A word repeated inside its own window must not become its own
                # neighbour: that used to append the node to its own adjacency
                # list twice, inflating its degree and feeding its score back
                # into itself.
                if next_idx == idx:
                    continue
                # Edges are always stored in both directions, so checking one
                # side is enough to avoid duplicates.
                if next_idx not in self.E[idx]:
                    self.E[idx].append(next_idx)
                    self.E[next_idx].append(idx)

        # Public mapping is index -> word, used to translate ranked nodes back.
        self.conversion = {idx: word for word, idx in word_to_idx.items()}

    def score_of(self, word):
        """Return the updated score of one node from its neighbours' scores.

        Args:
            word: node index (a key of ``V`` / ``E``), not the word string.

        Returns:
            ``(1 - jump_factor) + jump_factor * sum(V[n] / degree(n))`` over the
            node's neighbours ``n``. A node without neighbours scores
            ``1 - jump_factor``.
        """
        neighbor_list = self.E[word]
        # Plain left-to-right accumulation; kept as an explicit loop so the
        # floating-point result does not depend on how sum() adds floats.
        temp = 0
        for neighbor in neighbor_list:
            # Every neighbour has at least the edge back to this node, so its
            # degree is never zero.
            temp += self.V[neighbor] / len(self.E[neighbor])
        return (1 - self.jump_factor) + self.jump_factor * temp

    def calculate_textrank(self):
        """Update the scores in ``V`` in place until every node has been flagged.

        Nodes are visited round-robin and each update immediately uses the
        latest scores of the neighbours. A node is flagged the first time one
        of its updates changes its score by less than ``threshold``; the flag
        is never cleared afterwards, and the stop condition is re-checked after
        every single node update, so the loop may end part-way through a sweep.

        Returns:
            The number of times the sweep wrapped around to node 0
            (0 for an empty graph).
        """
        flags = [False for i in range(self.unique_cnt)]
        i = 0
        iter_cnt = 0
        while not all(flags):
            prev_score = self.V[i]
            curr_score = self.score_of(i)
            self.V[i] = curr_score
            if abs(prev_score - curr_score) < self.threshold:
                flags[i] = True
            i = (i + 1) % self.unique_cnt
            if i == 0:
                iter_cnt += 1
        return iter_cnt

In [5]:
# Build the co-occurrence graph; 2 is the window size N (look-ahead count and max position gap).
graph = LexicalGraph(tokens, 2)

In [6]:
# Token count, number of distinct words, and number of top-ranked words to keep (unique_cnt // 3).
print(graph.total_cnt, graph.unique_cnt, graph.T)

46 37 12


In [7]:
# Updates the scores in graph.V in place; the return value counts how many times
# the round-robin sweep wrapped back to the first node.
iter_cnt = graph.calculate_textrank()
iter_cnt

19

In [8]:
# Rank nodes from highest to lowest score; each entry is a (node index, score) pair.
rev_sorted_scores = sorted(graph.V.items(), key=lambda pair: pair[1], reverse=True)

[(4, 1.7234395460349212), (14, 1.6397522169176293), (3, 1.4023515584247277), (2, 1.195749513261997), (18, 1.175871169409576), (15, 1.1617901548644278), (19, 1.1143417084590648), (23, 1.0786637438486126), (8, 1.0), (9, 1.0), (10, 1.0), (11, 1.0), (12, 1.0), (25, 1.0), (26, 1.0), (27, 1.0), (28, 1.0), (30, 1.0), (31, 1.0), (33, 1.0), (34, 1.0), (1, 0.9649109431072667), (22, 0.7944581516003275), (35, 0.7704598904905071), (36, 0.7704468348656691), (24, 0.7410190294184734), (17, 0.6497407581380162), (16, 0.6437608158173818), (13, 0.6145922418437773), (0, 0.5601112541042375), (5, 0.15000000000000002), (6, 0.15000000000000002), (7, 0.15000000000000002), (20, 0.15000000000000002), (21, 0.15000000000000002), (29, 0.15000000000000002), (32, 0.15000000000000002)]


In [9]:
import math
# Keep the graph.T best-scoring nodes and translate their indices back to words.
# The two lists stay aligned: potential_keywords_score[k] is the score of
# potential_keywords[k].
top_ranked = rev_sorted_scores[:graph.T]
potential_keywords = [graph.conversion[node_idx] for node_idx, score in top_ranked]
potential_keywords_score = [score for node_idx, score in top_ranked]
print(potential_keywords, potential_keywords_score)

['summarization', 'information', 'abstractive', 'framework', 'element', 'items', 'coherent', 'previous', 'abstract', 'representation', 'source', 'documents'] [1.7234395460349212, 1.6397522169176293, 1.4023515584247277, 1.195749513261997, 1.175871169409576, 1.1617901548644278, 1.1143417084590648, 1.0786637438486126, 1.0, 1.0, 1.0, 1.0]


In [10]:
# Top-12 single-word candidates recorded per window size N (variable n_<N>, N = 2..10).
# n_2 is identical to the list printed above for LexicalGraph(tokens, 2); the other
# lists were presumably pasted from re-runs with a different N -- TODO confirm.
n_2 = ['summarization', 'information', 'abstractive', 'framework', 'element', 'items', 'coherent', 'previous', 'abstract', 'representation', 'source', 'documents']
n_3 = ['framework', 'information', 'summarization', 'abstractive', 'summary', 'previous', 'text', 'abstract', 'representation', 'relies', 'element', 'coherent']
n_4 = ['framework', 'summarization', 'information', 'summary', 'abstractive', 'representation', 'text', 'differs', 'previous', 'abstract', 'concept', 'coherent']
n_5 = ['framework', 'summarization', 'information', 'abstract', 'abstractive', 'text', 'concept', 'models', 'differs', 'representation', 'relies', 'previous']
n_6 = ['framework', 'summarization', 'information', 'text', 'abstractive', 'representation', 'relies', 'models', 'abstract', 'tac', 'concept', 'differs']
n_7 = ['summarization', 'framework', 'text', 'information', 'abstractive', 'representation', 'abstract', 'relies', 'models', 'sentence', 'differs', 'tac']
n_8 = ['summarization', 'framework', 'text', 'information', 'abstractive', 'representation', 'abstract', 'relies', 'models', 'sentence', 'differs', 'tac']
n_9 = ['framework', 'summarization', 'text', 'representation', 'information', 'abstract', 'abstractive', 'sentence', 'previous', 'models', 'relies', 'concept']
n_10 = ['framework', 'summarization', 'text', 'information', 'abstractive', 'abstract', 'representation', 'sentence', 'semantic', 'differs', 'analysis', 'previous'] 

In [11]:
def combine_multi_word_keyword(potential_keywords, filtered_tokens):
    """Join selected words that sit at consecutive positions into phrases.

    Args:
        potential_keywords: collection of words that were selected as keywords.
        filtered_tokens: sequence of tuples whose element 0 is the position and
            element 1 is the word.

    Returns:
        List of distinct phrases in order of first appearance. A phrase is a
        maximal run of selected tokens whose positions increase by exactly 1,
        joined with single spaces. Each newly added phrase is also printed.
    """
    hits = [tok for tok in filtered_tokens if tok[1] in potential_keywords]
    total = len(hits)
    keywords = []
    start = 0
    while start < total:
        last_pos = hits[start][0]
        parts = [hits[start][1]]
        end = start + 1
        # Grow the run for as long as the next hit directly follows the last one.
        while end < total and hits[end][0] == last_pos + 1:
            parts.append(hits[end][1])
            last_pos = hits[end][0]
            end += 1
        start = end
        phrase = ' '.join(parts)
        if phrase not in keywords:
            print (phrase)
            keywords.append(phrase)
    return keywords

In [12]:
# Merge top-ranked words that are adjacent in the text into multi-word keyphrases
# (the function also prints each phrase the first time it is added).
final_keywords = combine_multi_word_keyword(potential_keywords, tokens)
final_keywords

framework
abstractive summarization
abstract representation
source documents
information items
element
coherent information
previous abstractive summarization
summarization


['framework',
 'abstractive summarization',
 'abstract representation',
 'source documents',
 'information items',
 'element',
 'coherent information',
 'previous abstractive summarization',
 'summarization']

# fixed abstract, N; variation in tags
KW: deep learning; DL; natural language processing; nlp; deep generative modeling; summarize; model(s); different application domain;

All: deep learning methods possess many processing; have; results; domains; deep learning; natural; processing; nlp; presents; brief; deep; work considers most; many deep learning models; text; models; forward; past; explored; different
P = 9 / 10; R = 9 / 19; F = 18 / 29

N: learning; representation; data; domains; learning model designs; language processing; learning models; generation; models; generation models
P = 8 / 9; R = 4 / 5; F = 16 / 19

NV: learning; processing layers; have; domains; designs; architectures have; processing; survey presents; work considers; learning models; have been; models; have been explored; nlp; survey
P = 5 / 6; R = 1 / 3; F = 10 / 21

NJ: deep learning; many; representation; results; domains; designs; natural; processing; brief description; deep; many deep learning models; text; models; understanding; past
P = 7 / 8; R = 7 / 15; F = 14 / 23

# fixed abstract, tag; variation in N
KW: deep learning; DL; natural language processing; nlp; deep generative modeling; summarize; model(s); different application domain;

2: deep learning; many; representation; results; domains; designs; natural; processing; brief description; deep; many deep learning models; text; models; understanding; past
P = 7 / 8; R = 7 / 15; F = 14 / 23

3: deep learning; many; stratified; domains; deep learning model designs; language; nlp; brief description; deep; papers; many deep learning models; generation; models; past; generation models
P = 9 / 10; R = 9 / 14; F = 3 / 4

5: deep learning; many; layers; domains; architectures; nlp; brief; deep; modeling; work; many deep learning models; generation; text; models; present; future; text generation models
P = 9 / 10; R = 9 / 17; F = 2 / 3

10: deep learning; many; stratified representation; several domains; designs; architectures; context; nlp; survey; deep; many deep learning models; generation; text; models; text generation models; domains
P = 10 / 11; R = 10 / 16; F = 20 / 27


In [2]:
# Scratch check: `in` between two strings is a substring test, so only
# 'This is a test' matches and a single 1 is printed.
a = ['This is a test', 'hi', 'This is fun', 'hello']
b = 'test'

for t in a:
    if b in t:
        print(1)

1
