In [1]:
# Sample input for the keyword-extraction cells below: a list of
# (position, word, tag) triples.
#   * position: integer index of the token; the graph builder and the
#     multi-word merger compare these values to decide whether two tokens
#     are close/adjacent.
#   * word: lower-cased token text, used as the graph node.
#   * tag: looks like a Penn Treebank POS tag; it is not read by any code
#     in this part of the notebook.
# NOTE(review): the gaps in the position numbering suggest that other tokens
# were removed by an earlier filtering step that is not shown here -- confirm.
tokens = [(1, 'deep', 'JJ'), (2, 'learning', 'NN'), (3, 'methods', 'NNS'), (5, 'many', 'JJ'), (7, 'layers', 'NNS'), (11, 'stratified', 'JJ'), (12, 'representation', 'NN'), (14, 'data', 'NNS'), (18, 'state-of-art', 'JJ'), (19, 'results', 'NNS'), (21, 'several', 'JJ'), (22, 'domains', 'NNS'), (26, 'deep', 'JJ'), (27, 'learning', 'NN'), (28, 'model', 'NN'), (29, 'designs', 'NNS'), (31, 'architectures', 'NNS'), (36, 'context', 'NN'), (38, 'natural', 'JJ'), (39, 'language', 'NN'), (40, 'processing', 'NN'), (42, 'nlp', 'JJ'), (46, 'survey', 'NN'), (49, 'brief', 'JJ'), (50, 'description', 'NN'), (53, 'advances', 'NNS'), (59, 'area', 'NN'), (61, 'deep', 'JJ'), (62, 'generative', 'JJ'), (63, 'modeling', 'NN'), (66, 'work', 'NN'), (68, 'most', 'JJS'), (71, 'papers', 'NNS'), (74, 'onwards', 'NNS'), (78, 'paper', 'NN'), (82, 'many', 'JJ'), (83, 'deep', 'JJ'), (84, 'learning', 'NN'), (85, 'models', 'NNS'), (92, 'generation', 'NN'), (94, 'text', 'NN'), (100, 'various', 'JJ'), (101, 'models', 'NNS'), (107, 'detailed', 'JJ'), (108, 'understanding', 'NN'), (110, 'past', 'JJ'), (112, 'present', 'JJ'), (115, 'future', 'NN'), (117, 'text', 'JJ'), 
(118, 'generation', 'NN'), (119, 'models', 'NNS'), (121, 'deep', 'JJ'), (122, 'learning', 'NN'), (126, 'dl', 'JJ'), (127, 'approaches', 'NNS'), (135, 'different', 'JJ'), (136, 'application', 'NN'), (137, 'domains', 'NNS'), (139, 'nlp', 'NN'), (144, 'survey', 'NN')]

In [2]:
from collections import Counter

In [3]:
# Part-of-speech tags (Penn Treebank-style) grouped by coarse word class.
NOUN_GROUP = [
    'NN',    # noun, singular or mass
    'NNS',   # noun, plural
    'NNP',   # proper noun, singular
    'NNPS',  # proper noun, plural
]
PRONOUN_GROUP = [
    'PRP',   # personal pronoun
    'PRP$',  # possessive pronoun
    'WP',    # wh-pronoun
    'WP$',   # possessive wh-pronoun
]
VERB_GROUP = [
    'VB',    # base form
    'VBD',   # past tense
    'VBG',   # gerund / present participle
    'VBN',   # past participle
    'VBP',   # non-3rd person singular present
    'VBZ',   # 3rd person singular present
]
ADJECTIVE_GROUP = [
    'JJ',    # adjective
    'JJR',   # comparative
    'JJS',   # superlative
]
ADVERB_GROUP = [
    'RB',    # adverb
    'RBR',   # comparative
    'RBS',   # superlative
    'WRB',   # wh-adverb
]
# 'IN' is listed under both prepositions and conjunctions: the tag covers
# prepositions as well as subordinating conjunctions.
PREPOSITION_GROUP = [
    'IN',
]
CONJUNCTION_GROUP = [
    'CC',    # coordinating conjunction
    'IN',
]
INTERJECTION_GROUP = [
    'UH',
]

In [5]:
class LexicalGraph(object):
    """Undirected word co-occurrence graph scored with a TextRank-style iteration.

    Each distinct word becomes a node identified by an integer id (assigned in
    order of first appearance). Two words are linked when they occur close to
    each other in the token list.

    Attributes:
        total_cnt: number of tokens passed to the constructor.
        unique_cnt: number of distinct words (graph nodes).
        T: ``unique_cnt // 3``; the notebook uses it as the number of
            top-ranked words to keep.
        conversion: node id -> word (lookup for translating results back).
        V: node id -> current score, initialised to 1 for every node.
        E: node id -> list of neighbouring node ids (no duplicates).
        jump_factor: damping factor used in ``score_of`` (0.85).
        threshold: per-node convergence tolerance used in
            ``calculate_textrank`` (0.0001).
    """

    def __init__(self, filtered_tokens, N):
        """Build the co-occurrence graph.

        Args:
            filtered_tokens: sequence of tuples whose first item is the
                token's integer position and whose second item is the word.
                Any further items are ignored.
            N: window size. A token is linked to at most the next ``N`` tokens
                of ``filtered_tokens``, and only while their position is no
                more than ``N`` greater than its own.
        """
        self.total_cnt = len(filtered_tokens)
        words = [t[1] for t in filtered_tokens]
        # Counter is a dict, so on Python 3.7+ ids follow first occurrence.
        unique_words = list(Counter(words))
        self.unique_cnt = len(unique_words)
        self.T = self.unique_cnt // 3
        word_to_id = {word: i for i, word in enumerate(unique_words)}
        self.V = {i: 1 for i in range(self.unique_cnt)}
        self.E = {i: [] for i in range(self.unique_cnt)}
        self.jump_factor = 0.85
        self.threshold = 0.0001

        for i, token in enumerate(filtered_tokens):
            idx = word_to_id[token[1]]
            # Look at the next N tokens at most (an empty range when N <= 0).
            for k in range(i + 1, min(i + 1 + N, self.total_cnt)):
                next_token = filtered_tokens[k]
                if token[0] + N < next_token[0]:
                    # Positions only grow from here, so the rest of the
                    # window is out of reach as well.
                    break
                next_idx = word_to_id[next_token[1]]
                if next_idx == idx:
                    # The same word repeated inside the window. Linking a
                    # node to itself used to append it to its own adjacency
                    # list twice, which skewed its degree; skip self-pairs.
                    continue
                # Edges are stored symmetrically, so checking one side is
                # enough to avoid duplicates.
                if next_idx not in self.E[idx]:
                    self.E[idx].append(next_idx)
                    self.E[next_idx].append(idx)

        # Public lookup goes from node id back to the word.
        self.conversion = {i: word for word, i in word_to_id.items()}

    def score_of(self, word):
        """Return the updated score for one node from its neighbours' scores.

        Args:
            word: integer node id (a key of ``V``/``E``), not the word string.

        Returns:
            ``(1 - jump_factor) + jump_factor * sum(V[n] / degree(n))`` over
            all neighbours ``n`` of the node. A node without neighbours gets
            ``1 - jump_factor``.
        """
        # Every neighbour has at least one edge (the one back to this node),
        # so the division is safe.
        neighbour_share = sum(
            self.V[neighbor] / len(self.E[neighbor])
            for neighbor in self.E[word]
        )
        return (1 - self.jump_factor) + self.jump_factor * neighbour_share

    def calculate_textrank(self):
        """Update the scores in ``V`` in place until every node has converged.

        Nodes are visited round-robin and each new score is written back
        immediately, so later nodes in the same sweep already see it.

        NOTE(review): a node's "converged" flag is never cleared once set, so
        the loop stops as soon as every node has, at some point, changed by
        less than ``threshold`` -- not necessarily all in the same sweep.

        Returns:
            Number of complete sweeps over all nodes (0 for an empty graph).
        """
        flags = [False] * self.unique_cnt
        i = 0
        iter_cnt = 0
        while not all(flags):
            prev_score = self.V[i]
            curr_score = self.score_of(i)
            self.V[i] = curr_score
            if abs(prev_score - curr_score) < self.threshold:
                flags[i] = True
            i = (i + 1) % self.unique_cnt
            if i == 0:
                # Wrapped around: one full pass over the nodes is done.
                iter_cnt += 1
        return iter_cnt

In [6]:
# Build the co-occurrence graph; N=2 links each token to at most the next two
# tokens, provided they lie within two positions of it.
graph = LexicalGraph(filtered_tokens=tokens, N=2)

In [7]:
# Token count, distinct-word count, and number of keywords to keep.
print(f"{graph.total_cnt} {graph.unique_cnt} {graph.T}")

60 45 15


In [8]:
# Run the score iteration and show how many full sweeps it needed.
# calculate_textrank() overwrites graph.V in place, so re-running this cell
# continues from the current scores and will report a different sweep count;
# rebuild `graph` first to reproduce the number below.
iter_cnt = graph.calculate_textrank()
iter_cnt

26

In [9]:
# (node id, score) pairs, best score first; ties keep their node-id order
# because sorted() is stable.
rev_sorted_scores = sorted(
    graph.V.items(),
    key=lambda node_score: node_score[1],
    reverse=True,
)

In [10]:
import math

# Keep the graph.T best-scored nodes and translate their ids back to words.
potential_keywords = []
potential_keywords_score = []
cur_score = math.inf  # stays at +inf only when graph.T is 0
for node_id, cur_score in rev_sorted_scores[:graph.T]:
    word = graph.conversion[node_id]
    potential_keywords.append(word)
    potential_keywords_score.append(cur_score)
print(potential_keywords, potential_keywords_score)

['deep', 'learning', 'models', 'domains', 'representation', 'natural', 'many', 'understanding', 'past', 'processing', 'text', 'results', 'designs', 'brief', 'description'] [2.4950098391377393, 1.8376872138634366, 1.7177995436033875, 1.6775278267469873, 1.4594681225663324, 1.3428682881571992, 1.3097628441072358, 1.2982581840332954, 1.2982562985335644, 1.2792770549162067, 1.1857486012418885, 1.075582623255793, 1.0741440967831228, 1.0, 1.0]


In [11]:
# Hard-coded keyword lists; none of them is referenced by the other cells
# visible in this part of the notebook.
# NOTE(review): the n_<k> names suggest these are keyword results recorded for
# window sizes N = 2..10 on a different (summarization-related) text -- confirm
# the provenance, or remove them if they are leftovers.
n_2 = ['summarization', 'information', 'abstractive', 'framework', 'element', 'items', 'coherent', 'previous', 'abstract', 'representation', 'source', 'documents']
n_3 = ['framework', 'information', 'summarization', 'abstractive', 'summary', 'previous', 'text', 'abstract', 'representation', 'relies', 'element', 'coherent']
n_4 = ['framework', 'summarization', 'information', 'summary', 'abstractive', 'representation', 'text', 'differs', 'previous', 'abstract', 'concept', 'coherent']
n_5 = ['framework', 'summarization', 'information', 'abstract', 'abstractive', 'text', 'concept', 'models', 'differs', 'representation', 'relies', 'previous']
n_6 = ['framework', 'summarization', 'information', 'text', 'abstractive', 'representation', 'relies', 'models', 'abstract', 'tac', 'concept', 'differs']
n_7 = ['summarization', 'framework', 'text', 'information', 'abstractive', 'representation', 'abstract', 'relies', 'models', 'sentence', 'differs', 'tac']
n_8 = ['summarization', 'framework', 'text', 'information', 'abstractive', 'representation', 'abstract', 'relies', 'models', 'sentence', 'differs', 'tac']
n_9 = ['framework', 'summarization', 'text', 'representation', 'information', 'abstract', 'abstractive', 'sentence', 'previous', 'models', 'relies', 'concept']
n_10 = ['framework', 'summarization', 'text', 'information', 'abstractive', 'abstract', 'representation', 'sentence', 'semantic', 'differs', 'analysis', 'previous'] 

In [12]:
def combine_multi_word_keyword(potential_keywords, filtered_tokens, verbose=True):
    """Merge keywords found at consecutive positions into multi-word phrases.

    Tokens whose word is a keyword are scanned in the order given; every
    maximal run whose positions increase by exactly 1 is joined with spaces
    into one phrase. A keyword with no adjacent keyword yields a one-word
    phrase.

    Args:
        potential_keywords: iterable of single-word keywords (hashable).
        filtered_tokens: iterable of tuples whose first item is the token's
            integer position and whose second item is the word. Runs are
            detected between neighbouring entries, so the tokens are expected
            in increasing position order.
        verbose: when True (the default, which keeps the previous behaviour)
            each phrase is printed the first time it is found.

    Returns:
        List of distinct phrases in order of first appearance.
    """
    # Set lookups keep both membership tests O(1) instead of scanning lists.
    keyword_set = set(potential_keywords)
    relation = [t for t in filtered_tokens if t[1] in keyword_set]

    keywords = []
    seen = set()
    i = 0
    while i < len(relation):
        last_pos = relation[i][0]
        parts = [relation[i][1]]
        j = i + 1
        # Extend the run while the next keyword token is directly adjacent.
        while j < len(relation) and relation[j][0] == last_pos + 1:
            parts.append(relation[j][1])
            last_pos = relation[j][0]
            j += 1
        i = j  # resume after the run that was just consumed

        keyword = ' '.join(parts)
        if keyword not in seen:
            seen.add(keyword)
            if verbose:
                print(keyword)
            keywords.append(keyword)
    return keywords

In [13]:
# Collapse adjacent single-word keywords into phrases. The function prints
# each new phrase as it is found, so the phrases appear twice below: once from
# those prints and once as the displayed return value.
final_keywords = combine_multi_word_keyword(potential_keywords, tokens)
final_keywords

deep learning
many
representation
results
domains
designs
natural
processing
brief description
deep
many deep learning models
text
models
understanding
past


['deep learning',
 'many',
 'representation',
 'results',
 'domains',
 'designs',
 'natural',
 'processing',
 'brief description',
 'deep',
 'many deep learning models',
 'text',
 'models',
 'understanding',
 'past']