# Text Rank Algorithm

In [None]:
pip install gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import numpy as np
import pandas as pd
import nltk
import re
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from scipy import spatial
import networkx as nx

In [None]:
text='''Santiago is a Shepherd who has a recurring dream which is supposedly prophetic. Inspired on learning this, he undertakes a journey to Egypt to discover the meaning of life and fulfill his destiny. During the course of his travels, he learns of his true purpose and meets many characters, including an “Alchemist”, that teach him valuable lessons about achieving his dreams. Santiago sets his sights on obtaining a certain kind of “treasure” for which he travels to Egypt. The key message is, “when you want something, all the universe conspires in helping you to achieve it.” Towards the final arc, Santiago gets robbed by bandits who end up revealing that the “treasure” he was looking for is buried in the place where his journey began. The end.'''

In [None]:
import nltk
nltk.download('punkt')
  

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
sentences=sent_tokenize(text)

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
sentences_clean=[re.sub(r'[^\w\s]','',sentence.lower()) for sentence in sentences]
stop_words = stopwords.words('english')
sentence_tokens=[[words for words in sentence.split(' ') if words not in stop_words] for sentence in sentences_clean]

In [None]:
w2v = Word2Vec(sentence_tokens, vector_size=1, min_count=1, epochs=1000)
sentence_embeddings = [[w2v.wv[word][0] for word in words] for words in sentence_tokens]
max_len = max([len(tokens) for tokens in sentence_tokens])
sentence_embeddings = [np.pad(embedding, (0, max_len - len(embedding)), 'constant') for embedding in sentence_embeddings]


In [None]:
similarity_matrix = np.zeros([len(sentence_tokens), len(sentence_tokens)])
for i,row_embedding in enumerate(sentence_embeddings):
    for j,column_embedding in enumerate(sentence_embeddings):
        similarity_matrix[i][j]=1-spatial.distance.cosine(row_embedding,column_embedding)

In [None]:
nx_graph = nx.from_numpy_array(similarity_matrix)
scores = nx.pagerank(nx_graph)
print(nx_graph)

Graph with 6 nodes and 21 edges


In [None]:
top_sentence={sentence:scores[index] for index,sentence in enumerate(sentences)}
top=dict(sorted(top_sentence.items(), key=lambda x: x[1], reverse=True)[:4])

In [None]:
for sent in sentences:
    if sent in top.keys():
        print(sent)

Santiago is a Shepherd who has a recurring dream which is supposedly prophetic.
Inspired on learning this, he undertakes a journey to Egypt to discover the meaning of life and fulfill his destiny.
During the course of his travels, he learns of his true purpose and meets many characters, including an “Alchemist”, that teach him valuable lessons about achieving his dreams.
Santiago sets his sights on obtaining a certain kind of “treasure” for which he travels to Egypt.


In [2]:
!pip install git+https://github.com/tagucci/pythonrouge.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/tagucci/pythonrouge.git
  Cloning https://github.com/tagucci/pythonrouge.git to /tmp/pip-req-build-3isl7d5q
  Running command git clone --filter=blob:none --quiet https://github.com/tagucci/pythonrouge.git /tmp/pip-req-build-3isl7d5q
  Resolved https://github.com/tagucci/pythonrouge.git to commit 0f1603dbc089ecb0fb40fdd3a28576e9f3d36769
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pythonrouge
  Building wheel for pythonrouge (setup.py) ... [?25l[?25hdone
  Created wheel for pythonrouge: filename=pythonrouge-0.2-py3-none-any.whl size=285409 sha256=fa9a0ad6bc6ca17b969b5ddddf5e5216e3e51f665470839ae8ee1d54ca6d699a
  Stored in directory: /tmp/pip-ephem-wheel-cache-lehtbx_3/wheels/c1/18/09/c2759ebb9b275e5c59db45dfa35a77dfdbdbde1a733e4427e1
Successfully built pythonrouge
Installing collected packages: pythonrouge

In [3]:
!apt-get install -y cpanminus

!cpanm --force XML::Parser

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  libalgorithm-c3-perl libany-uri-escape-perl libb-hooks-endofscope-perl
  libb-hooks-op-check-perl libclass-c3-perl libclass-c3-xs-perl
  libclass-method-modifiers-perl libclass-xsaccessor-perl libcpan-changes-perl
  libcpan-distnameinfo-perl libcpan-meta-check-perl libdata-optlist-perl
  libdata-perl-perl libdata-section-perl libdevel-callchecker-perl
  libdynaloader-functions-perl libencode-locale-perl libexporter-tiny-perl
  libfile-pushd-perl libfile-slurp-perl libgetopt-long-descriptive-perl
  libhtml-parser-perl libhtml-tagset-perl libhttp-date-perl
  libhttp-message-perl libidn11 libimport-into-perl libio-html-perl
  libio-stringy-perl liblist-moreutils-perl liblocal-lib-perl
  liblwp-mediatypes-perl libmodule-build-perl libmodule-cpanfile-perl
  libmodule-implementation-perl libmodule-runtime-perl
  libmodule-signature-perl libmoo

In [4]:
#Model Evaluation for Extractive summarization Text Rank algorithm

from pythonrouge.pythonrouge import Pythonrouge

# system summary(prediction) & reference summary

summary = [[["Santiago is a Shepherd who has a recurring dream which is supposedly prophetic.Inspired on learning this, he undertakes a journey to Egypt to discover the meaning of life and fulfill his destiny.During the course of his travels, he learns of his true purpose and meets many characters, including an “Alchemist”, that teach him valuable lessons about achieving his dreams.Santiago sets his sights on obtaining a certain kind of “treasure” for which he travels to Egypt."]]]
reference = [[["Santiago is a Shepherd who has a recurring dream which is supposedly prophetic.he undertakes a journey to Egypt to discover the meaning of life and fulfill his destiny.During the course of his travels, he learns of his true purpose and meets many characters, including an “Alchemist”, .Santiago sets his sights on obtaining a certain kind of “treasure” for which he travels to Egypt.The key message is, “when you want something, all the universe conspires in helping you to achieve it.”Towards the final arc, Santiago gets robbed by bandits who end up revealing that the “treasure” he was looking for is buried in the place where his journey began."]]]
 # initialize setting of ROUGE to evaluate ROUGE-1, 2, W and SU4

rouge = Pythonrouge(summary_file_exist=False,summary=summary, reference=reference,
                    n_gram=2, ROUGE_SU4=True, ROUGE_L=True,ROUGE_W=True,ROUGE_W_Weight=1.2,
                    recall_only=False, stemming=True, stopwords=True,
                    word_level=True, length_limit=True, length=50,
                    use_cf=False, cf=95, scoring_formula='average',
                    resampling=True, samples=1000, favor=True, p=0.5)

score = rouge.calc_score()

print(score)

{'ROUGE-1-R': 0.88, 'ROUGE-1-P': 0.91667, 'ROUGE-1-F': 0.89796, 'ROUGE-2-R': 0.83333, 'ROUGE-2-P': 0.86957, 'ROUGE-2-F': 0.85106, 'ROUGE-L-R': 0.88, 'ROUGE-L-P': 0.91667, 'ROUGE-L-F': 0.89796, 'ROUGE-W-1.2-R': 0.46227, 'ROUGE-W-1.2-P': 0.91667, 'ROUGE-W-1.2-F': 0.6146, 'ROUGE-SU4-R': 0.79851, 'ROUGE-SU4-P': 0.83594, 'ROUGE-SU4-F': 0.8168}


#Tennis Articles 

In [None]:
from google.colab import files
uploaded = files.upload()

Saving tennis_articles.csv to tennis_articles.csv


In [None]:
#Importing file
import pandas as pd
import io
 
data = pd.read_csv(io.BytesIO(uploaded['tennis_articles.csv']), encoding='ISO-8859-1')
print(data)

   article_id                                      article_title  \
0           1  I do not have friends in tennis, says Maria Sh...   
1           2  Federer defeats Medvedev to advance to 14th Sw...   
2           3  Tennis: Roger Federer ignored deadline set by ...   
3           4  Nishikori to face off against Anderson in Vien...   
4           5  Roger Federer has made this huge change to ten...   
5           6  Rafael Nadal: World No 1 ARRIVES for Paris Mas...   
6           7  TENNIS.COM PODCAST: POINT DEFENSE, RANKING DRO...   
7           8  Tennis journalists heartbreaking insight on T...   

                                        article_text  \
0  Maria Sharapova has basically no friends as te...   
1  BASEL, Switzerland (AP)  Roger Federer advanc...   
2  Roger Federer has revealed that organisers of ...   
3  Kei Nishikori will try to end his long losing ...   
4  Federer, 37, first broke through on tour over ...   
5  Nadal has not played tennis since he was force..

In [None]:
data.head(10)

Unnamed: 0,article_id,article_title,article_text,source
0,1,"I do not have friends in tennis, says Maria Sh...",Maria Sharapova has basically no friends as te...,https://www.tennisworldusa.org/tennis/news/Mar...
1,2,Federer defeats Medvedev to advance to 14th Sw...,"BASEL, Switzerland (AP)  Roger Federer advanc...",http://www.tennis.com/pro-game/2018/10/copil-s...
2,3,Tennis: Roger Federer ignored deadline set by ...,Roger Federer has revealed that organisers of ...,https://scroll.in/field/899938/tennis-roger-fe...
3,4,Nishikori to face off against Anderson in Vien...,Kei Nishikori will try to end his long losing ...,http://www.tennis.com/pro-game/2018/10/nishiko...
4,5,Roger Federer has made this huge change to ten...,"Federer, 37, first broke through on tour over ...",https://www.express.co.uk/sport/tennis/1036101...
5,6,Rafael Nadal: World No 1 ARRIVES for Paris Mas...,Nadal has not played tennis since he was force...,https://www.express.co.uk/sport/tennis/1037119...
6,7,"TENNIS.COM PODCAST: POINT DEFENSE, RANKING DRO...","Tennis giveth, and tennis taketh away. The end...",http://www.tennis.com/pro-game/2018/10/tennisc...
7,8,Tennis journalists heartbreaking insight on T...,I PLAYED golf last week with Todd Reid. He pic...,https://www.foxsports.com.au/tennis/tennis-jou...


In [None]:
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt') # one time execution
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
data['article_text'][0]

"Maria Sharapova has basically no friends as tennis players on the WTA Tour. The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much. I think everyone knows this is my job here. When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net. So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match. I'm a pretty competitive girl. I say my hellos, but I'm not sending any players flowers as well. Uhm, I'm not really friendly or close to many players. I have not a lot of friends away from the courts.' When she said she is not really close to a lot of players, is that something strategic that she is doing? Is it different on the men's tour than the women's tour? 'No, not at all. I think just because you're in the same

In [None]:
from nltk.tokenize import sent_tokenize
sentences = [] 
for s in data['article_text']: 
  sentences.append(sent_tokenize(s))# flatten list
sentences = [y for x in sentences for y in x]


In [None]:
sentences[:5]

['Maria Sharapova has basically no friends as tennis players on the WTA Tour.',
 "The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much.",
 'I think everyone knows this is my job here.',
 "When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.",
 "So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match."]

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip 
!unzip glove*.zip

--2023-04-21 09:04:48--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-04-21 09:04:49--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-04-21 09:04:49--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
# Extract word vectors 
word_embeddings = {} 
f = open('glove.6B.100d.txt', encoding='utf-8') 
for line in f: 
    values = line.split() 
    word = values[0] 
    coefs = np.asarray(values[1:], dtype='float32')   
    word_embeddings[word] = coefs 
f.close()

In [None]:
# remove punctuations, numbers and special characters 
clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ") # make alphabets lowercase 
clean_sentences = [s.lower() for s in clean_sentences]


  clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ") # make alphabets lowercase


In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
from nltk.corpus import stopwords 
stop_words = stopwords.words('english')


In [None]:
# function to remove stopwords 
def remove_stopwords(sen):     
    sen_new = " ".join([i for i in sen if i not in stop_words])          
    return sen_new# remove stopwords from the sentences 
clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]

In [None]:
#Vector Representation of Sentences
# Extract word vectors 
word_embeddings = {} 
f = open('glove.6B.100d.txt', encoding='utf-8') 
for line in f: 
    values = line.split() 
    word = values[0] 
    coefs = np.asarray(values[1:], dtype='float32')    
    word_embeddings[word] = coefs 
f.close()

In [None]:
sentence_vectors = [] 
for i in clean_sentences: 
  if len(i) != 0: 
    v = sum([word_embeddings.get(w, np.zeros((100,))) for w in  
        i.split()])/(len(i.split())+0.001) 
  else: 
    v = np.zeros((100,)) 
  sentence_vectors.append(v)

In [None]:
# similarity matrix 
sim_mat = np.zeros([len(sentences), len(sentences)])


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
for i in range(len(sentences)): 
  for j in range(len(sentences)): 
    if i != j: 
      sim_mat[i][j] = cosine_similarity (sentence_vectors[i].reshape(1,100),   
                  sentence_vectors[j].reshape(1,100))[0,0]


In [None]:
import networkx as nx 
nx_graph = nx.from_numpy_array(sim_mat) 
scores = nx.pagerank(nx_graph)

In [None]:
ranked_sentences = sorted(((scores[i],s) for i,s in 
                           enumerate(sentences)), reverse=True)# Extract top 10 sentences as the summary 
for i in range(10): 
  print(ranked_sentences[i][1])

I was on a nice trajectorythen, Reid recalled.If I hadnt got sick, I think I could have started pushing towards the second week at the slams and then who knows. Duringa comeback attempt some five years later, Reid added Bernard Tomic and 2018 US Open Federer slayer John Millman to his list of career scalps.
Major players feel that a big event in late November combined with one in January before the Australian Open will mean too much tennis and too little rest.
So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.
Speaking at the Swiss Indoors tournament where he will play in Sundays final against Romanian qualifier Marius Copil, the world number three said that given the impossibly short time frame to make a decision, he opted out of any commitment.
Currently in ninth place, Nishikori with a win could move to within 125 points of the cut for the eight-man event in London next month.
Exhau