# Implementation of TextRank
(Based on: https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf)

The input text is given below

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive/


In [2]:
!pip install hazm

Collecting hazm
[?25l  Downloading https://files.pythonhosted.org/packages/22/13/5a7074bc11d20dbbb46239349ac3f85f7edc148b4cf68e9b8c2f8263830c/hazm-0.7.0-py3-none-any.whl (316kB)
[K     |████████████████████████████████| 317kB 2.8MB/s 
[?25hCollecting libwapiti>=0.2.1; platform_system != "Windows"
[?25l  Downloading https://files.pythonhosted.org/packages/bc/0f/1c9b49bb49821b5856a64ea6fac8d96a619b9f291d1f06999ea98a32c89c/libwapiti-0.2.1.tar.gz (233kB)
[K     |████████████████████████████████| 235kB 8.8MB/s 
[?25hCollecting nltk==3.3
[?25l  Downloading https://files.pythonhosted.org/packages/50/09/3b1755d528ad9156ee7243d52aa5cd2b809ef053a0f31b53d92853dd653a/nltk-3.3.0.zip (1.4MB)
[K     |████████████████████████████████| 1.4MB 13.2MB/s 
Building wheels for collected packages: libwapiti, nltk
  Building wheel for libwapiti (setup.py) ... [?25l[?25hdone
  Created wheel for libwapiti: filename=libwapiti-0.2.1-cp36-cp36m-linux_x86_64.whl size=154544 sha256=282ba85018eadccba8294d363

In [None]:
# Path
PosTaggerModelPath = '/content/drive/My Drive/textrankstuff/resources/POSTagger.model'
StopWordsPath = '/content/drive/My Drive/textrankstuff/resources/STOPWORDS.txt'


In [None]:

from gensim.summarization.syntactic_unit import SyntacticUnit
from gensim.parsing.preprocessing import preprocess_documents
from gensim.utils import tokenize
from six.moves import xrange
import re
import logging
from hazm import *

logger = logging.getLogger('summa.preprocessing.cleaner')

try:
    #from pattern.en import tag
    from hazm import POSTagger
    tagger = POSTagger(model=PosTaggerModelPath)
    logger.info("'pattern' package found; tag filters are available for Persian")
    HAS_PATTERN = True
except ImportError:
    #logger.info("'pattern' package not found; tag filters are not available for English")
    logger.info("'pattern' package not found; tag filters are not available for Persian")
    HAS_PATTERN = False


SEPARATOR = r'@'
RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE)  # backup (\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)
AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)\s(\w)', re.UNICODE)
AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)\s(\w)', re.UNICODE)
AB_ACRONYM_LETTERS = re.compile(r'([a-zA-Z])\.([a-zA-Z])\.', re.UNICODE)
UNDO_AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)' + SEPARATOR + r'(\w)', re.UNICODE)
UNDO_AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)' + SEPARATOR + r'(\w)', re.UNICODE)

def split_sentences(text):
    return (sent_tokenize(text))

def replace_abbreviations(text):
    return replace_with_separator(text, SEPARATOR, [AB_SENIOR, AB_ACRONYM])


def undo_replacement(sentence):
    return replace_with_separator(sentence, r" ", [UNDO_AB_SENIOR, UNDO_AB_ACRONYM])


def replace_with_separator(text, separator, regexs):
    replacement = r"\1" + separator + r"\2"
    result = text
    for regex in regexs:
        result = regex.sub(replacement, result)
    return result


#def get_sentences(text):
#    for match in RE_SENTENCE.finditer(text):
#        yield match.group()

def get_sentences(text):
    te = sent_tokenize(text)
    for each in te:
        yield (each)


def merge_syntactic_units(original_units, filtered_units, tags=None):
    units = []
    for i in xrange(len(original_units)):
        if filtered_units[i] == '':
            continue

        text = original_units[i]
        token = filtered_units[i]

        if tags :
            try:
                tag = tags[i][1]
            except:
                tag = None
        else:
            tag = None

        #tag = tags[i][1] if tags else None
        
        sentence = SyntacticUnit(text, token, tag)
        sentence.index = i

        units.append(sentence)

    return units


def join_words(words, separator=" "):
    return separator.join(words)


def clean_text_by_sentences(text):
    """ Tokenizes a given text into sentences, applying filters and lemmatizing them.
    Returns a SyntacticUnit list. """
    print("1")
    original_sentences = split_sentences(text)
    print("2")
    filtered_sentences = [join_words(sentence) for sentence in preprocess_documents(original_sentences)]
    print("3")
    tags = clean_text_by_word(text)
    print("4")
    return merge_syntactic_units(original_sentences, filtered_sentences, tags)


def clean_text_by_word(text, deacc=True):
    """ Tokenizes a given text into words, applying filters and lemmatizing them.
    Returns a dict of word -> syntacticUnit. """
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=deacc))
    filtered_words = [join_words(word_list, "") for word_list in preprocess_documents(original_words)]
    if HAS_PATTERN:
        tags = tagger.tag(original_words) # tag needs the context of the words in the text
    else:
        tags = None
    units = merge_syntactic_units(original_words, filtered_words, tags)
    return dict((unit.text,unit) for unit in units)


def tokenize_by_word(text):
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    return tokenize(text_without_acronyms, to_lower=True, deacc=True)



In [None]:
import sys
sys.path.append('/content/drive/My Drive')

In [None]:
b = '''
داوری فرایندی است که به موجب آن اختلاف طرفین نسبت به حقوق و تکالیف قانونی خود از طریق انتخاب یک یا چند نفر داور به جای دادگاه حل وفصل گردیده و رای لازم الاجرا صادر می گردد. در فصل اول کلیاتی راجع به مفهوم و مقایسه‌ی داوری با مفاهیم مشابه و اقسام داوری،داوری در مسائل خاص، مبانی حقوقی داوری در حقوق ایران بیان شده است و در فصل دوم  به بررسی ماهیت و شرایط نصب داور و ممنوعیت‌های مرتبط با نصب داور پرداخته‌‌ایم و درفصل سوم به  بررسی آثار نصب داور پرداخته شده است.النهایه به این نتیجه رسیده‌ایم که ماهیت نصب داور بانظریه مختلط تطبیق بیشتری دارد  و در خصوص شرایط نصب داور می‌توان به موارد زیر اشاره کرد: داور‌پذیری، استقلال و بی‌طرفی داور اشاره کرد و در خصوص ممنوعیت نصب داور می‌توان به ممنوعیت‌‌های مرتبط با داور و ممنوعیت‌‌های مرتبط با موضوع اختلاف و همچنین ممنوعیت‌های مرتبط با طرفین اختلاف اشاره کرد و در خصوص آثار نصب داور می‌توان گفت که هم برای طرفین اختلاف و هم برای دادگاه‌ها و هم برای داور تعهداتی را ایجاد می‌کند و از جمله‌ی آن‌ها می‌توان به لازم التباع بودن رأی داور توسط طرفین و اعمال قاعده‌ای امر قضاوت شده اشاره کرد و برای دادگاه‌ها مهم‌ترین اثرآن این است از دادگاه‌ها سلب صلاحیت می‌کند و برای داور الزام به رسیدگی و صدور رأی را ایجاد می‌کند

 '''

import math
import numpy as np
from hazm import *
# from textcleaner import clean_text_by_sentences

In [None]:

def kwextractor(text):
    
	tagger = POSTagger(model=PosTaggerModelPath)
	tagged = tagger.tag(word_tokenize(text))
	lemmatizer = Lemmatizer()
	adjective_tags = ['JJ','JJR','JJS']
	lemmatized_text_0 = []
	for word in tagged:
	    if word[1] in adjective_tags:
	        lemmatized_text_0.append(str(lemmatizer.lemmatize(word[0],pos="a")))
	    else:
	        lemmatized_text_0.append(str(lemmatizer.lemmatize(word[0])))
            
# 	print(lemmatized_text_0)
    
	lemmatized_text = []
	for word in lemmatized_text_0:
		if "#" in word:
			sharpIdx = word.find("#")
			if sharpIdx != 0:
				new = word[:sharpIdx]
			else:
				new = word[sharpIdx+1:]
		else:
			lemmatized_text.append(word)
        
# 	print()
# 	print("-----------------------------------------")
# 	print()
# 	print(lemmatized_text)
    
	POS_tag = tagger.tag(lemmatized_text)
	stopwords = []
	stopword_file = open(StopWordsPath , "r")
	lots_of_stopwords = []
	for line in stopword_file.readlines():
	    lots_of_stopwords.append(str(line.strip()))

	stopwords_plus = []
	stopwords_plus = stopwords + lots_of_stopwords
	stopwords_plus = set(stopwords_plus)
# 	print(stopwords_plus)
    
	processed_text = []
	for word in lemmatized_text:
	    if word not in stopwords_plus:
# 	        print(word)
	        processed_text.append(word)
	vocabulary = list(set(processed_text))
	vocab_len = len(vocabulary)
	weighted_edge = np.zeros((vocab_len,vocab_len),dtype=np.float32)
	score = np.zeros((vocab_len),dtype=np.float32)
	window_size = 3
	covered_coocurrences = []
	for i in range(0,vocab_len):
	    score[i]=1
	    for j in range(0,vocab_len):
	        if j==i:
	            weighted_edge[i][j]=0
	        else:
	            for window_start in range(0,(len(processed_text)-window_size)):
	                
	                window_end = window_start+window_size
	                
	                window = processed_text[window_start:window_end]
	                
	                if (vocabulary[i] in window) and (vocabulary[j] in window):
	                    
	                    index_of_i = window_start + window.index(vocabulary[i])
	                    index_of_j = window_start + window.index(vocabulary[j])
	                    if [index_of_i,index_of_j] not in covered_coocurrences:
	                        weighted_edge[i][j]+=1/math.fabs(index_of_i-index_of_j)
	                        covered_coocurrences.append([index_of_i,index_of_j])

	inout = np.zeros((vocab_len),dtype=np.float32)

	for i in range(0,vocab_len):
	    for j in range(0,vocab_len):
	        inout[i]+=weighted_edge[i][j]
	MAX_ITERATIONS = 50
	d=0.85
	threshold = 0.0001 #convergence threshold

	for iter in range(0,MAX_ITERATIONS):
	    prev_score = np.copy(score)
	    
	    for i in range(0,vocab_len):
	        
	        summation = 0
	        for j in range(0,vocab_len):
	            if weighted_edge[i][j] != 0:
	                summation += (weighted_edge[i][j]/inout[j])*score[j]
	                
	        score[i] = (1-d) + d*(summation)
	    
	    if np.sum(np.fabs(prev_score-score)) <= threshold: #convergence condition
	        break

	phrases = []

	phrase = " "
	for word in lemmatized_text:
	    
	    if word in stopwords_plus:
	        if phrase!= " ":
	            phrases.append(str(phrase).strip().split())
	        phrase = " "
	    elif word not in stopwords_plus:
	        phrase+=str(word)
	        phrase+=" "

	unique_phrases = []

	for phrase in phrases:
	    if phrase not in unique_phrases:
	        unique_phrases.append(phrase)

	for word in vocabulary:
	    for phrase in unique_phrases:
	        if (word in phrase) and ([word] in unique_phrases) and (len(phrase)>1):
	            unique_phrases.remove([word])     
  
	phrase_scores = []
	keywords = []
	for phrase in unique_phrases:
	    phrase_score=0
	    keyword = ''
	    for word in phrase:
	        keyword += str(word)
	        keyword += " "
	        phrase_score+=score[vocabulary.index(word)]
	    phrase_scores.append(phrase_score)
	    keywords.append(keyword.strip())

	i=0
	words_with_scores = []
	for keyword in keywords:
	    tup = (keyword,phrase_scores[i])
	    words_with_scores.append(tup)
	    i+=1


	sortedKeywords = sorted(words_with_scores, key=lambda x: x[1], reverse=True)
	sortedKeywords = [x[0] for x in sortedKeywords]
  # print("*" * 100)
	print(sortedKeywords)
	# sorted_index = np.argsort(phrase_scores)
	# sorted_keywords = []
	# for idx in range(len())
	# print(len(sorted_index))
	keywords_num = 10
	final_keywords = []
	for word in sortedKeywords:
		if "#" in word:
			sharpIdx = word.find("#")
			if sharpIdx != 0:
				new = word[:sharpIdx]
			else:
				new = word[sharpIdx+1:]
			if new not in stopwords_plus: 
				final_keywords.append(new)
		else:
			final_keywords.append(word)
	# print(len(final_keywords))
	return final_keywords  

# ALL TOGETHER

In [None]:
extracted_keywordss = kwextractor(b)
print("==================PREDICTED=================")
for wo in extracted_keywordss:
    print(wo)

real = """
حل اختلاف,داور,ان‍ت‍ص‍اب‌,صلاحیت حرفه‌ای,بی‌طرفی,استقلال قضایی,اصول حقوقی
"""

print("==================REAL====================")
# real = real.split(",")
# for w in real:
#     print(w)
# print(real)

['خصوص ممنوعیت نصب داور می\u200cتوان', 'خصوص شرایط نصب داور می\u200cتوان', 'خصوص آثار نصب داور می\u200cتوان', 'ماهیت نصب داور بانظریه مختلط تطبیق', 'بررسی آثار نصب داور', 'شرایط نصب داور', 'نصب داور پرداخته', 'بی\u200cطرف داور اشاره', 'مبانی حقوق داور', 'رأی داور', 'مقایسه داور', 'داور الزام', 'نفر داور', 'داور تعهدات', 'اقسام داور', 'داور فرایند', 'طرفین اختلاف اشاره', 'اختلاف طرفین نسبت', 'دادگاه حل وفصل گردیده', 'طرفین اختلاف', 'دادگاه سلب صلاحیت', 'دادگاه مهم اثرآن', 'ممنوعیت مرتبط', 'حقوق ایران بیان', 'لازم الاجرا صادر', 'ممنوعیت\u200c\u200cهای مرتبط', 'اعمال قاعده امر قضاوت', 'موضوع اختلاف', 'بررسی ماهیت', 'لازم التباع', 'صدور رأی', 'کلیات راجع', 'تکالیف قانون', 'مفاهیم مشابه', 'فصل', 'مسائل خاص', 'مفهوم', 'رسیدگی', 'انتخاب', 'ایجاد', 'جمله', 'استقلال', 'داور\u200cپذیری', 'درفصل', 'نتیجه', 'موجب', 'النهایه']
خصوص ممنوعیت نصب داور می‌توان
خصوص شرایط نصب داور می‌توان
خصوص آثار نصب داور می‌توان
ماهیت نصب داور بانظریه مختلط تطبیق
بررسی آثار نصب داور
شرایط نصب داور
نصب داور پرداخته
بی

In [None]:
import json
json_file_name = "data1.json"
result_file = "text_rank.json"
result_list = []
result_dict = {}
# print("111")

# with open(json_file_name) as json_file, open(result_file, 'w') as resfile:
#     data = json.load(json_file)
#     x = data['articles'][16]['extract']
#     kwextractor(x)






with open(json_file_name) as json_file, open(result_file, 'w') as resfile:
    data = json.load(json_file)
    for ii,article in enumerate(data['articles'][:50]):
        result_dict['#'] = article['#']
        result_dict['extract'] = article['extract']
        result_dict['FirstLevelSubject'] = article['firstlevelsubject']
        result_dict['ID'] = article['id']
        print(f"{ii} : {result_dict['ID']}")
        result_dict['Keywords'] = article['keywords']
        result_dict['TextRank-Keywords'] = " , ".join(kwextractor(article['extract']))
        result_dict['MAJOR'] = article['major']
        result_dict['SUBJECT'] = article['subject']
        result_dict['TITLE'] = article['title']
        result_list.append(result_dict)
    result_list = {'articles': result_list}
    j = json.dumps(result_list)
    resfile.write(json.dumps(result_list, indent=4, sort_keys=True, ensure_ascii=False))
    

# ---------------------
# STEP BY STEP
# ---------------------

### POS Tagging For Lemmatization

NLTK is again used for <b>POS tagging</b> the input text so that the words can be lemmatized based on their POS tags.

Description of POS tags: 


http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

In [None]:
#nltk.download('averaged_perceptron_tagger')
  

tagger = POSTagger(model='resources/POSTagger.model')
tagged = tagger.tag(word_tokenize(text))

print ("Tokenized Text with POS tags: \n")

### Lemmatization

The tokenized text (mainly the nouns and adjectives) is normalized by <b>lemmatization</b>.
In lemmatization different grammatical counterparts of a word will be replaced by single
basic lemma. For example, 'glasses' may be replaced by 'glass'. 

Details about lemmatization: 
    
https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html

In [None]:
#nltk.download('wordnet')

# wordnet_lemmatizer = WordNetLemmatizer()
lemmatizer = Lemmatizer()

adjective_tags = ['JJ','JJR','JJS']

lemmatized_text = []

for word in tagged:
    if word[1] in adjective_tags:
        lemmatized_text.append(str(lemmatizer.lemmatize(word[0],pos="a")))
    else:
        lemmatized_text.append(str(lemmatizer.lemmatize(word[0]))) #default POS = noun
        
print ("Text tokens after lemmatization of adjectives and nouns: \n")
print (lemmatized_text)

### POS tagging for Filtering

The <b>lemmatized text</b> is <b>POS tagged</b> here. The tags will be used for filtering later on.

In [None]:
POS_tag = tagger.tag(lemmatized_text)

print ("Lemmatized text with POS tags: \n")
print (POS_tag)

## POS Based Filtering

Any word from the lemmatized text, which isn't a noun, adjective, or gerund (or a 'foreign word'), is here
considered as a <b>stopword</b> (non-content). This is based on the assumption that usually keywords are noun,
adjectives or gerunds. 

Punctuations are added to the stopword list too.

In [None]:
# stopwords = []

# wanted_POS = ['NN','NNS','NNP','NNPS','JJ','JJR','JJS','VBG','FW'] 

# for word in POS_tag:
#     if word[1] not in wanted_POS:
#         stopwords.append(word[0])

# punctuations = list(str(string.punctuation))

# stopwords = stopwords + punctuations

### Complete stopword generation

Even if we remove the aforementioned stopwords, still some extremely common nouns, adjectives or gerunds may
remain which are very bad candidates for being keywords (or part of it). 

An external file constituting a long list of stopwords is loaded and all the words are added with the previous
stopwords to create the final list 'stopwords-plus' which is then converted into a set. 

(Source of stopwords data: https://www.ranks.nl/stopwords)

Stopwords-plus constitute the sum total of all stopwords and potential phrase-delimiters. 

(The contents of this set will be later used to partition the lemmatized text into n-gram phrases. But, for now, I will simply remove the stopwords, and work with a 'bag-of-words' approach. I will be developing the graph using unigram texts as vertices)

In [None]:
stopwords = []
stopword_file = open("resources/STOPWORDS.txt", "r")
#Source = https://www.ranks.nl/stopwords

lots_of_stopwords = []

for line in stopword_file.readlines():
    lots_of_stopwords.append(str(line.strip()))

stopwords_plus = []
stopwords_plus = stopwords + lots_of_stopwords
stopwords_plus = set(stopwords_plus)
stopwords_plus
#Stopwords_plus contain total set of all stopwords

### Removing Stopwords 

Removing stopwords from lemmatized_text. 
Processeced_text condtains the result.

In [None]:
processed_text = []
for word in lemmatized_text:
#     print(word)
    if word not in stopwords_plus:
        processed_text.append(word)

for w in processed_text:
    print(w)
# print (processed_text)

## Vocabulary Creation

Vocabulary will only contain unique words from processed_text.

In [None]:
vocabulary = list(set(processed_text))
for w in vocabulary:
    print(w)

### Building Graph

TextRank is a graph based model, and thus it requires us to build a graph. Each words in the vocabulary will serve as a vertex for graph. The words will be represented in the vertices by their index in vocabulary list.  

The weighted_edge matrix contains the information of edge connections among all vertices.
I am building wieghted undirected edges.

weighted_edge[i][j] contains the weight of the connecting edge between the word vertex represented by vocabulary index i and the word vertex represented by vocabulary j.

If weighted_edge[i][j] is zero, it means no edge connection is present between the words represented by index i and j.

There is a connection between the words (and thus between i and j which represents them) if the words co-occur within a window of a specified 'window_size' in the processed_text.

The value of the weighted_edge[i][j] is increased by (1/(distance between positions of words currently represented by i and j)) for every connection discovered between the same words in different locations of the text. 

The covered_coocurrences list (which is contain the list of pairs of absolute positions in processed_text of the words whose coocurrence at that location is already checked) is managed so that the same two words located in the same positions in processed_text are not repetitively counted while sliding the window one text unit at a time.

The score of all vertices are intialized to one. 

Self-connections are not considered, so weighted_edge[i][i] will be zero.

In [None]:
import numpy as np
import math
vocab_len = len(vocabulary)

weighted_edge = np.zeros((vocab_len,vocab_len),dtype=np.float32)

score = np.zeros((vocab_len),dtype=np.float32)
window_size = 3
covered_coocurrences = []

for i in range(0,vocab_len):
    score[i]=1
    for j in range(0,vocab_len):
        if j==i:
            weighted_edge[i][j]=0
        else:
            for window_start in range(0,(len(processed_text)-window_size)):
                
                window_end = window_start+window_size
                
                window = processed_text[window_start:window_end]
                
                if (vocabulary[i] in window) and (vocabulary[j] in window):
                    
                    index_of_i = window_start + window.index(vocabulary[i])
                    index_of_j = window_start + window.index(vocabulary[j])
                    
                    # index_of_x is the absolute position of the xth term in the window 
                    # (counting from 0) 
                    # in the processed_text
                      
                    if [index_of_i,index_of_j] not in covered_coocurrences:
                        weighted_edge[i][j]+=1/math.fabs(index_of_i-index_of_j)
                        covered_coocurrences.append([index_of_i,index_of_j])


### Calculating weighted summation of connections of a vertex

inout[i] will contain the sum of all the undirected connections\edges associated withe the vertex represented by i.

In [None]:
inout = np.zeros((vocab_len),dtype=np.float32)

for i in range(0,vocab_len):
    for j in range(0,vocab_len):
        inout[i]+=weighted_edge[i][j]

### Scoring Vertices

The formula used for scoring a vertex represented by i is:

score[i] = (1-d) + d x [ Summation(j) ( (weighted_edge[i][j]/inout[j]) x score[j] ) ] where j belongs to the list of vertieces that has a connection with i. 

d is the damping factor.

The score is iteratively updated until convergence. 

In [None]:
MAX_ITERATIONS = 50
d=0.85
threshold = 0.0001 #convergence threshold

for iter in range(0,MAX_ITERATIONS):
    prev_score = np.copy(score)
    
    for i in range(0,vocab_len):
        
        summation = 0
        for j in range(0,vocab_len):
            if weighted_edge[i][j] != 0:
                summation += (weighted_edge[i][j]/inout[j])*score[j]
                
        score[i] = (1-d) + d*(summation)
    
    if np.sum(np.fabs(prev_score-score)) <= threshold: #convergence condition
        print("Converging at iteration "+str(iter)+"....")
        break


In [None]:
for i in range(0,vocab_len):
    print("Score of "+vocabulary[i]+": "+str(score[i]))

### Phrase Partiotioning

Paritioning lemmatized_text into phrases using the stopwords in it as delimeters.
The phrases are also candidates for keyphrases to be extracted. 

In [None]:
phrases = []

phrase = " "
for word in lemmatized_text:
    
    if word in stopwords_plus:
        if phrase!= " ":
            phrases.append(str(phrase).strip().split())
        phrase = " "
    elif word not in stopwords_plus:
        phrase+=str(word)
        phrase+=" "

print("Partitioned Phrases (Candidate Keyphrases): \n")
print(phrases)

### Create a list of unique phrases.

Repeating phrases\keyphrase candidates has no purpose here, anymore. 

In [None]:
unique_phrases = []

for phrase in phrases:
    if phrase not in unique_phrases:
        unique_phrases.append(phrase)

print("Unique Phrases (Candidate Keyphrases): \n")
print(unique_phrases)

### Thinning the list of candidate-keyphrases.

Removing single word keyphrases-candidates that are present multi-word alternatives. 

In [None]:
for word in vocabulary:
    #print word
    for phrase in unique_phrases:
        if (word in phrase) and ([word] in unique_phrases) and (len(phrase)>1):
            #if len(phrase)>1 then the current phrase is multi-worded.
            #if the word in vocabulary is present in unique_phrases as a single-word-phrase
            # and at the same time present as a word within a multi-worded phrase,
            # then I will remove the single-word-phrase from the list.
            unique_phrases.remove([word])
            
print("Thinned Unique Phrases (Candidate Keyphrases): \n")
print(unique_phrases)    

### Scoring Keyphrases

Scoring the phrases (candidate keyphrases) and building up a list of keyphrases\keywords
by listing untokenized versions of tokenized phrases\candidate-keyphrases.
Phrases are scored by adding the score of their members (words\text-units that were ranked by the graph algorithm)


In [None]:
phrase_scores = []
keywords = []
for phrase in unique_phrases:
    phrase_score=0
    keyword = ''
    for word in phrase:
        keyword += str(word)
        keyword += " "
        phrase_score+=score[vocabulary.index(word)]
    phrase_scores.append(phrase_score)
    keywords.append(keyword.strip())

i=0
words_with_scores = []
for keyword in keywords:
#     print(words_with_scores)
    tup = (keyword,phrase_scores[i])
    words_with_scores.append(tup)
#     print ("Keyword: '"+str(keyword)+"', Score: "+str(phrase_scores[i]))
    i+=1
 
# print(len(words_with_scores))
# print(word_with_scores)
for i,ws in enumerate(words_with_scores):
    print(f"{i}",ws)

### Ranking Keyphrases

Ranking keyphrases based on their calculated scores. Displaying top keywords_num no. of keyphrases.

In [None]:
sorted_index = np.flip(np.argsort(phrase_scores),0)

print(phrase_scores)

keywords_num = 10

print("Keywords:\n")

for w in keywords:
    print(w)
# for i in range(0,keywords_num):
#     try:
#         print(str(i),str(keywords[sorted_index[i]])+", ", end=' ')
#     except:
#         continue

# Input:

Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types.

# Extracted Keywords:

* minimal supporting set,  
* minimal generating set,  
* minimal set,  
* linear diophantine equation,  
* nonstrict inequations,  
* strict inequations,  
* system,  
* linear constraint,  
* solution,  
* upper bound, 
