# Libraries

In [532]:
import spacy
import time
import pytextrank
import networkx as nx
import math
import operator
import nltk # Used for stopwords and punctuation removal
# Fetch only the NLTK resources this notebook uses. A bare nltk.download()
# opens the interactive downloader, which stalls "Restart & Run All".
# NOTE(review): recent NLTK releases may additionally need the "punkt_tab"
# resource for word_tokenize -- confirm against the installed version.
nltk.download('punkt', quiet=True)      # tokenizer models used by word_tokenize
nltk.download('stopwords', quiet=True)  # word lists behind stopwords.words('english')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

In [533]:
!python3 -m spacy download en_core_web_sm

2021-05-05 17:49:28.446475: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [748]:
# Load the `en_core_web_sm` spaCy pipeline installed by the download cell above;
# `nlp` is the callable used later to turn the input text into a parsed document.
nlp=spacy.load('en_core_web_sm')

# Text Rank Algorithm

In [987]:
tic=time.time() # Start the wall-clock timer; the timing cell at the end reads `tic`
#Input paragraph
text="""Lion was roaring under the night of full moon bloom."""
punctuations="?:!.,;"

# Build both lookup sets once, up front. Calling stopwords.words('english')
# inside the loop rebuilt the list (and scanned it linearly) for every token.
stop_words = set(stopwords.words('english'))
# Exact single-character matching; the previous `word in punctuations` was a
# substring test against the string above.
punctuation_marks = set(punctuations)

# tokenize everything
sentence_words = nltk.word_tokenize(text)

# Keep tokens that are neither a punctuation mark nor a stopword.
# Stopword matching is case-sensitive, exactly as before.
kept_words = [
    word for word in sentence_words
    if word not in punctuation_marks and word not in stop_words
]

# Join with single spaces. The earlier string concatenation left a stray
# leading space at the start of the text handed to spaCy.
text = ' '.join(kept_words)

In [988]:
# Here we will add the text required on which NLP will take place
# text = '''Once upon a time, a lion is roaring.'''

In [989]:
#nlp.add_pipe("textrank") # We add a pipeline at the start of every run

In [990]:
# Run the already-loaded spaCy pipeline over `text`. This does not reload or
# re-initialise the model; it only produces a new parsed document. The cells
# below read tokens, sentences, noun chunks and entities from `doc`.
doc=nlp(text)
#doc._.phrases 

**Direct method: reading the ranked phrases from the pipeline**

In [991]:
# for phrase in doc._.phrases:
#     print(phrase.text,phrase.rank)
#     print(phrase.rank, phrase.count)
    #print(phrase.chunks)

**Alternative method: building the ranking step by step**

In [992]:

# from icecream import ic # Importing the icecream library as an alternate for printing

# for sent in doc.sents:
#     ic(sent.start, sent.end) # This will give the start and stop of sentences 
# for chunk in doc.noun_chunks:
#     ic(chunk.text) #Here we will get all the possible words from the para

In [993]:
#*
# Edge bookkeeping for the in-memory co-occurrence graph.
def increment_edge (graph, node0, node1):
    """Record one more co-occurrence between ``node0`` and ``node1``.

    A missing edge is created with ``weight=1.0``; an existing edge has its
    ``"weight"`` attribute increased by ``1.0``. ``graph`` is mutated in place
    and nothing is returned.
    """
    if not graph.has_edge(node0, node1):
        # First time this pair is seen: start the edge at weight 1.
        graph.add_edge(node0, node1, weight=1.0)
        return

    graph[node0][node1]["weight"] += 1.0

In [994]:
# Parts of speech that are allowed to become graph nodes.
POS_KEPT = ["ADJ", "NOUN", "PROPN", "VERB"]

def link_sentence (doc, sent, lemma_graph, seen_lemma, window=3):
    """Add one sentence's kept tokens to the lemma graph.

    Every token of ``sent`` whose ``pos_`` is in ``POS_KEPT`` is mapped to a
    node identified by its ``(lemma_, pos_)`` pair. The node is then linked,
    through ``increment_edge``, to each previously kept token of the same
    sentence that lies at most ``window`` token positions before it.

    Args:
        doc: Indexable token container; ``doc[i]`` must expose ``i``,
            ``lemma_`` and ``pos_``.
        sent: Object with integer ``start`` / ``end`` offsets into ``doc``.
        lemma_graph: Graph supporting ``in`` and ``add_node`` plus whatever
            ``increment_edge`` requires. Mutated in place.
        seen_lemma: Dict mapping ``(lemma, pos)`` to the set of token indices
            where that pair occurred. Mutated in place. A key's insertion
            position in this dict is used as its node id.
        window: Largest token distance that still produces an edge. The
            default of 3 reproduces the previously hard-coded value.

    Returns:
        None.
    """
    visited_tokens = []  # token indices of the kept tokens seen so far
    visited_nodes = []   # node ids, parallel to visited_tokens

    for i in range(sent.start, sent.end):
        token = doc[i]

        if token.pos_ not in POS_KEPT:
            continue

        key = (token.lemma_, token.pos_)
        seen_lemma.setdefault(key, set()).add(token.i)

        # The node id is the key's insertion position in seen_lemma;
        # collect_phrases resolves ids the same way, so keep this in sync.
        node_id = list(seen_lemma.keys()).index(key)

        if node_id not in lemma_graph:
            lemma_graph.add_node(node_id)

        # Walk backwards through the kept tokens. They are stored in
        # increasing token order, so the first one outside the window ends
        # the scan.
        for prev_token in range(len(visited_tokens) - 1, -1, -1):
            if (token.i - visited_tokens[prev_token]) <= window:
                increment_edge(lemma_graph, node_id, visited_nodes[prev_token])
            else:
                break

        visited_tokens.append(token.i)
        visited_nodes.append(node_id)

In [995]:
# Start from an empty lemma registry and an empty undirected graph, then feed
# the parsed document through link_sentence one sentence at a time.
seen_lemma = dict()
lemma_graph = nx.Graph()

for sentence in doc.sents:
    link_sentence(doc, sentence, lemma_graph, seen_lemma)

In [996]:
# ic(seen_lemma)

In [997]:
# Map each node id to a printable label. A node id is the position of its
# (lemma, pos) key in seen_lemma, and the label is the lower-cased lemma.
keys = list(seen_lemma)
labels = {node_id: lemma_key[0].lower() for node_id, lemma_key in enumerate(keys)}

# labels # Here we check the labels

In [998]:
# import matplotlib.pyplot as plt

# fig = plt.figure(figsize=(9, 9))
# pos = nx.spring_layout(lemma_graph)

# nx.draw(lemma_graph, pos=pos, with_labels=False, font_weight="bold")
# nx.draw_networkx_labels(lemma_graph, pos, labels);

In [999]:
# Score every node of the lemma graph with PageRank. The later cells use
# `ranks` as a mapping from node id to score (`ranks[node_id]`, `ranks.items()`).
# NOTE(review): presumably the "weight" edge attribute set by increment_edge is
# what networkx uses here by default -- confirm against the installed version.
ranks = nx.pagerank(lemma_graph)
# ranks

In [1000]:
# for node_id, rank in sorted(ranks.items(), key=lambda x: x[1], reverse=True):
#     ic(node_id, rank, labels[node_id])

In [1001]:
#*
def collect_phrases (chunk, phrases, counts):
    """Score one candidate phrase and file it under its lemma-level key.

    Besides its arguments, this function reads three module-level names
    defined in earlier cells: ``doc`` (the parsed text), ``seen_lemma``
    (``(lemma, pos)`` keys in node-id order) and ``ranks`` (node id -> score).

    Args:
        chunk: Object with integer ``start`` / ``end`` offsets into ``doc``
            and a ``text`` attribute.
        phrases: Dict mapping a compound key (sorted tuple of the chunk's
            ranked ``(lemma, pos)`` pairs) to a set of
            ``(phrase_text, phrase_rank)`` tuples. Mutated in place.
        counts: Dict mapping the same compound key to the number of chunks
            collected under it. Mutated in place.

    Returns:
        None.
    """
    chunk_len = chunk.end - chunk.start

    # Node ids are insertion positions in seen_lemma. Resolve them once per
    # call rather than rebuilding list(seen_lemma.keys()) for every token.
    node_ids = {key: position for position, key in enumerate(seen_lemma)}

    # NOTE(review): the comment below calls this an RMS, but the ranks are
    # summed unsquared. Kept exactly as written -- confirm the intent.
    rank_sum = 0.0
    non_lemma = 0
    compound_key = set()

    for i in range(chunk.start, chunk.end):
        token = doc[i]
        key = (token.lemma_, token.pos_)
        node_id = node_ids.get(key)

        if node_id is None:
            # Token never became a graph node (its key is not in seen_lemma).
            non_lemma += 1
            continue

        rank_sum += ranks[node_id]
        compound_key.add(key)

    # although the noun chunking is greedy, we discount the ranks using a
    # point estimate based on the number of non-lemma tokens within a phrase
    non_lemma_discount = chunk_len / (chunk_len + (2.0 * non_lemma) + 1.0)

    # use root mean square (RMS) to normalize the contributions of all the tokens
    phrase_rank = math.sqrt(rank_sum / (chunk_len + non_lemma))
    phrase_rank *= non_lemma_discount

    # remove spurious punctuation
    phrase = chunk.text.lower().replace("'", "")

    # create a unique key for the phrase based on its lemma components
    compound_key = tuple(sorted(compound_key))

    phrases.setdefault(compound_key, set()).add((phrase, phrase_rank))
    counts[compound_key] = counts.get(compound_key, 0) + 1

In [1002]:
# Fresh accumulators, then score every noun chunk of the document.
# collect_phrases fills both dictionaries in place.
phrases, counts = {}, {}

for noun_chunk in doc.noun_chunks:
    collect_phrases(noun_chunk, phrases, counts)

In [1003]:
# Score the document's entities with the same routine used for noun chunks,
# accumulating into the same `phrases` / `counts` dictionaries.
for ent in doc.ents:
    collect_phrases(ent, phrases, counts)

In [1004]:
#*
# For every lemma-level key keep only its highest-ranked surface form, stored
# as phrase text -> (rank, number of chunks collected under that key).
min_phrases = {}

for compound_key, rank_tuples in phrases.items():
    by_rank_desc = sorted(rank_tuples, key=operator.itemgetter(1), reverse=True)
    phrase, rank = by_rank_desc[0]
    min_phrases[phrase] = (rank, counts[compound_key])

In [1005]:
# for phrase, (rank, count) in sorted(min_phrases.items(), key=lambda x: x[1][0], reverse=True):
#     ic(phrase, count, rank)

In [1006]:
# Print every lemma label with its PageRank score, highest score first.
ranked_nodes = sorted(ranks.items(), key=operator.itemgetter(1), reverse=True)
for node_id, rank in ranked_nodes:
    print(labels[node_id], rank)
# print(labels)

night 0.20388201771934134
full 0.2038820177193413
moon 0.16643117534112753
roar 0.1664311753411275
lion 0.12968680693953114
bloom 0.1296868069395311


In [1007]:
# Stop the timer started alongside `tic` and report the elapsed seconds.
toc = time.time()
elapsed_seconds = toc - tic
print(elapsed_seconds, "s", sep="")

1.77260422706604s


**Second alternative**

In [1008]:
# from gensim.summarization import keywords
# print(keywords(text,words=10,split=True,ratio=0.1))

# Stopword and Punctuation Removal

In [853]:
#Input paragraph
text="""Aman is a very good boy. That is one of the reasons why i am writing the whole paragraph.
 and this is how i will behave!"""
punctuations="?:!.,;"

# tokenize everything
from nltk.tokenize import sent_tokenize, word_tokenize
# Tokenise `text`, the paragraph defined above. The previous code tokenised
# `para`, a name this notebook never defines (NameError on a fresh kernel).
sentence_words = nltk.word_tokenize(text)

# Remove punctuation by building a new list. The earlier loop called
# list.remove() on the list it was iterating over, which skips the element
# following every removal.
punctuation_marks = set(punctuations)
sentence_words = [word for word in sentence_words if word not in punctuation_marks]

# To see the stopword list, uncomment the last line below
from nltk.corpus import stopwords
# stopwords.words('english')

# Stopword removal, without stemming. The stopword set is built once, not
# once per token. Matching is case-sensitive, so a capitalised token such as
# "That" is kept.
stop_words = set(stopwords.words('english'))
final = [word for word in sentence_words if word not in stop_words]

text=' '.join(final)
print(text)

Aman good boy That one reasons writing whole paragraph behave
