#### Download GloVe from below:
#### http://nlp.stanford.edu/data/glove.6B.zip 

In [29]:
import pandas as pd
import numpy as np

import spacy
import string
import networkx as nx
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity
nlp = spacy.load('en')

In [30]:
data = pd.read_csv('D:/DataScienceCollection/GloVe/Data/tennis_articles_v4.csv')

In [31]:
data['article_text']

0    Maria Sharapova has basically no friends as te...
1    BASEL, Switzerland (AP), Roger Federer advance...
2    Roger Federer has revealed that organisers of ...
3    Kei Nishikori will try to end his long losing ...
4    Federer, 37, first broke through on tour over ...
5    Nadal has not played tennis since he was force...
6    Tennis giveth, and tennis taketh away. The end...
7    Federer won the Swiss Indoors last week by bea...
Name: article_text, dtype: object

#### For now, let's work on Maria Sharapova's Text Summary. i.e, index 0

In [32]:
row = data['article_text'][0]
doc = nlp(row)
sentences=[sent.string.strip() for sent in doc.sents]
print(sentences)


['Maria Sharapova has basically no friends as tennis players on the WTA Tour.', "The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much.", 'I think everyone knows this is my job here.', "When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.", "So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.", "I'm a pretty competitive girl.", "I say my hellos, but I'm not sending any players flowers as well.", "Uhm, I'm not really friendly or close to many players.", "I have not a lot of friends away from the courts.'", 'When she said she is not really close to a lot of players, is that something strategic that she is doing?', "Is it different on the men's tour than the women's tour? '", 'No, not at all.', "I

#### Now, we have the list of sentences, let's make this above list of sentences as list of 'cleaner sentences'. When said 'cleaner' it means removal of STOP WORDs, removal of Punctuations, etc.

In [33]:
stopwords = list(STOP_WORDS)
punctuations = string.punctuation

data = []
dataAfterStopwordsFilter =[]
dataAfterPunctuationsFilter =[]
dataAfterLowerCase =[]
refinedList = []

for sentence in sentences:
    docx = nlp(sentence)
    for token in docx:
        data.append(token.text)
    for token in data:    
        if token != stopwords:
            dataAfterStopwordsFilter.append(token)
    for token in dataAfterStopwordsFilter:
        if token not in punctuations:
            dataAfterPunctuationsFilter.append(token)
    for token in dataAfterPunctuationsFilter:
        dataAfterLowerCase.append(token.lower().strip())    
    #Appending Clean Sentences in a list.
    refinedList.append(dataAfterLowerCase)          
    data = []
    dataAfterStopwordsFilter =[]
    dataAfterPunctuationsFilter =[]
    dataAfterLowerCase =[]

In [34]:
refinedList

['maria',
 'sharapova',
 'have',
 'basically',
 'no',
 'friend',
 'as',
 'tennis',
 'player',
 'on',
 'the',
 'wta',
 'tour',
 'the',
 'russian',
 'player',
 'have',
 'no',
 'problem',
 'in',
 'openly',
 'speak',
 'about',
 'and',
 'in',
 'a',
 'recent',
 'interview',
 'say',
 'do',
 'not',
 'really',
 'hide',
 'any',
 'feeling',
 'too',
 'much',
 'think',
 'everyone',
 'know',
 'this',
 'be',
 'job',
 'here',
 'when',
 'be',
 'on',
 'the',
 'court',
 'or',
 'when',
 'be',
 'on',
 'the',
 'court',
 'playing',
 'be',
 'a',
 'competitor',
 'and',
 'want',
 'to',
 'beat',
 'every',
 'single',
 'person',
 'whether',
 'be',
 'in',
 'the',
 'locker',
 'room',
 'or',
 'across',
 'the',
 'net',
 'so',
 'be',
 'not',
 'the',
 'one',
 'to',
 'strike',
 'up',
 'a',
 'conversation',
 'about',
 'the',
 'weather',
 'and',
 'know',
 'that',
 'in',
 'the',
 'next',
 'few',
 'minute',
 'have',
 'to',
 'go',
 'and',
 'try',
 'to',
 'win',
 'a',
 'tennis',
 'match',
 'be',
 'a',
 'pretty',
 'competitive'

In [None]:
listOfCleanSentences = []

for sentence in refinedList:
    listOfCleanSentences.append(' '.join(sentence))
    
listOfCleanSentences  

#### Introducing GloVe

In [45]:
# Extract word vectors 
word_embeddings = {} 
f = open('D:/DataScienceCollection/GloVe/glove.6B.100d.txt', encoding='utf-8') 
for line in f: 
    values = line.split() 
    word = values[0] 
    coefs = np.asarray(values[1:], dtype='float32')   
    word_embeddings[word] = coefs 
f.close()

In [46]:
len(word_embeddings)

400000

In [47]:
sentence_vectors = []
for i in cleanSentences:
    if len(i) != 0:
        v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split())+0.001)
    else:
        v = np.zeros((100,))
    sentence_vectors.append(v)

In [48]:
len(sentence_vectors)

274

#### The next step is to find similarities among the sentences. We will use cosine similarity to find similarity between a pair of sentences. Let's create an empty similarity matrix for this task and populate it with cosine similarities of the sentences.

In [49]:
# similarity matrix
sim_mat = np.zeros([len(sentences), len(sentences)])

In [50]:
for i in range(len(sentences)):
    for j in range(len(sentences)):
        if i != j:
            sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,100), sentence_vectors[j].reshape(1,100))[0,0]

In [51]:
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

In [52]:
ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)

In [53]:
# Specify number of sentences to form the summary
sn = 10

# Generate summary
for i in range(sn):
    print(ranked_sentences[i][1])

I say my hellos, but I'm not sending any players flowers as well.
Is it different on the men's tour than the women's tour? '
, you're a tennis player,
So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.
I think everyone just thinks because we're tennis players we should be the greatest of friends.
I think everyone knows this is my job here.
I have friends that have completely different jobs and interests, and I've met them in very different parts of my life.
When she said she is not really close to a lot of players, is that something strategic that she is doing?
There are so many other things that we're interested in, that we do.'
I think every person has different interests.
