## TextRank Implementation

In [408]:
from nltk import word_tokenize
from nltk import pos_tag
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 
import stanza

import string

In [409]:
# Lookup tables describing the most recently analysed text. extract_nn_jj()
# rebinds all three on every call:
#   word_id_reverse : integer node id -> word
#   word_id         : word -> integer node id
#   word_pos        : word -> list of token positions where the word occurs
word_id_reverse, word_id, word_pos = {}, {}, {}

# English stanza pipeline used for tokenisation and part-of-speech tagging.
nlp = stanza.Pipeline(processors='tokenize,mwt,pos', lang='en')

INFO:stanza:Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| pos       | ewt     |

INFO:stanza:Use device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


In [414]:
# Shared WordNet lemmatizer; extract_nn_jj() uses it to reduce tokens to their lemma.
lemmatizer = WordNetLemmatizer()

def extract_nn_jj (text, window=2):
    """Build a co-occurrence graph over the noun/adjective tokens of ``text``.

    The text is lower-cased and tagged with the module-level stanza pipeline
    ``nlp``. Tokens whose ``xpos`` tag starts with ``'NN'`` or ``'JJ'`` are
    kept as candidates and reduced to their lemma with the module-level
    ``lemmatizer``. Two distinct candidate words are linked when some
    occurrence of one lies within ``window`` token positions of some
    occurrence of the other. Positions count every token of the text, not
    only the candidates.

    Side effects:
        Rebinds the module-level ``word_id`` (word -> node id),
        ``word_id_reverse`` (node id -> word) and ``word_pos``
        (word -> list of token positions) for the analysed text.

    Args:
        text: Raw input text.
        window: Maximum distance, in token positions, between two
            occurrences for their words to be linked. Must be an integer;
            defaults to 2.

    Returns:
        An adjacency list: ``adj[i]`` holds the node ids linked to node
        ``i``, in ascending id order.
    """
    global word_id_reverse , word_id, word_pos
    word_id_reverse = {}
    word_id = {}
    word_pos = {}

    doc = nlp(text.lower())

    # position -> candidate word found at that position (candidates only)
    word_at = {}
    position = 0
    for sentence in doc.sentences:
        for token in sentence.words:
            if token.xpos[:2] in ('NN', 'JJ'):
                lemma = lemmatizer.lemmatize(token.text)
                if lemma not in word_id:
                    # Node ids are handed out in order of first appearance.
                    node = len(word_id)
                    word_id[lemma] = node
                    word_id_reverse[node] = lemma
                word_pos.setdefault(lemma, []).append(position)
                word_at[position] = lemma
            # Every token advances the position, whether it is kept or not.
            position += 1

    adj = [[] for _ in range(len(word_id))]

    for root, positions in word_pos.items():
        linked = set()
        for pos in positions:
            # Only positions inside the window can hold a neighbour, so look
            # those up directly instead of comparing every pair of positions.
            for other_pos in range(pos - window, pos + window + 1):
                other = word_at.get(other_pos)
                if other is not None and other != root:
                    linked.add(word_id[other])
        # Ascending ids == order of first appearance of the neighbours.
        adj[word_id[root]] = sorted(linked)

    return adj
            
# Read the abstract inside a context manager so the file handle is closed
# as soon as the text has been loaded, then build its co-occurrence graph.
with open('abstract1.txt') as abstract_file:
    connectivity_graph = extract_nn_jj(abstract_file.read())

In [415]:
def text_rank_algorithm (adj, d=0.85, max_iter=100, tol=0.0001):
    """Score the nodes of a graph with the iterative TextRank update.

    Every sweep recomputes each node's score as
    ``(1 - d) + d * sum(score[n] / len(adj[n]) for n in adj[node])``.
    Scores are updated in place, so nodes later in a sweep already see the
    new scores of earlier nodes. Iteration stops after ``max_iter`` sweeps or
    as soon as the summed absolute score change of a sweep drops below
    ``tol``.

    The update divides by ``len(adj[n])`` for every listed neighbour ``n``,
    so each node that appears in some neighbour list must itself have at
    least one neighbour (always true for a symmetric adjacency list);
    otherwise ``ZeroDivisionError`` is raised.

    Args:
        adj: Adjacency list; ``adj[i]`` is the list of node ids linked to
            node ``i``.
        d: Damping factor. Defaults to 0.85.
        max_iter: Maximum number of sweeps. Defaults to 100.
        tol: Convergence threshold on the summed absolute score change of
            one sweep. Defaults to 0.0001.

    Returns:
        A list with one score per node, indexed like ``adj``.
    """
    # Neighbour counts never change between sweeps, so compute them once.
    degree = [len(neighbors) for neighbors in adj]
    score = [1.0] * len(adj)

    for _ in range(max_iter):
        score_change = 0

        for node, neighbors in enumerate(adj):
            incoming = 0
            for neighbor in neighbors:
                incoming += (1 / degree[neighbor]) * score[neighbor]
            updated = (1 - d) + incoming * d

            score_change += abs(score[node] - updated)
            score[node] = updated

        if score_change < tol:
            break
    return score
    

# Score every node of the graph, then pair each score with its node id and
# order the pairs from highest to lowest score.
scores = text_rank_algorithm(connectivity_graph)
scores = sorted(zip(scores, range(len(scores))), reverse=True)

In [416]:
# Print each score next to the word its node id stands for, best first.
for rank_value, node in scores:
    print(rank_value, word_id_reverse[node])

1.731997092734356 linear
1.586675611522248 set
1.459461820074 bound
1.3771093780147163 system
1.315907419875756 equation
1.2094089498864729 minimal
1.090769226788456 inequations
0.9864185075099613 strict
0.9429702642537734 criterion
0.9409471201765731 compatibility
0.9248652663028935 number
0.8908913666129417 diophantine
0.8867813719728289 algorithm
0.8802381072097206 natural
0.8695479522229529 construction
0.8640521357199399 solution
0.8298344365829784 generating
0.7702716588670587 upper
0.77027127353145 component
0.6135769213850938 nonstrict
0.540180990437503 type
0.5180493822060507 constraint


### Debugging

In [403]:
def show_neighbors (word):
    """Print every word linked to ``word`` in ``connectivity_graph``.

    The graph is keyed by the lower-cased lemmas produced in
    extract_nn_jj(), so a surface form such as a plural is not itself a key.
    The word is looked up as given first; if that fails, its lower-cased
    lemma (via the module-level ``lemmatizer``) is tried.

    Args:
        word: Word to inspect.

    Raises:
        KeyError: If neither the word nor its lemma is a node of the
            current graph.
    """
    key = word if word in word_id else lemmatizer.lemmatize(word.lower())
    if key not in word_id:
        raise KeyError(f"{word!r} is not a node of the current graph")
    for neighbor_id in connectivity_graph[word_id[key]]:
        print(word_id_reverse[neighbor_id])

show_neighbors('solutions')

set
algorithms
sets
