#### Download GloVe from below:
#### http://nlp.stanford.edu/data/glove.6B.zip 

In [236]:
import pandas as pd
import numpy as np

import spacy
import string
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
nlp = spacy.load('en')

In [237]:
data = pd.read_csv('D:/DataScienceCollection/GloVe/Data/tennis_articles_v4.csv')

In [238]:
data['article_text']

0    Maria Sharapova has basically no friends as te...
1    BASEL, Switzerland (AP), Roger Federer advance...
2    Roger Federer has revealed that organisers of ...
3    Kei Nishikori will try to end his long losing ...
4    Federer, 37, first broke through on tour over ...
5    Nadal has not played tennis since he was force...
6    Tennis giveth, and tennis taketh away. The end...
7    Federer won the Swiss Indoors last week by bea...
Name: article_text, dtype: object

#### For now, let's work on Maria Sharapova's Text Summary

In [239]:
row = data['article_text'][0]
doc = nlp(row)
sentences=[sent.string.strip() for sent in doc.sents]
print(sentences)


['Maria Sharapova has basically no friends as tennis players on the WTA Tour.', "The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much.", 'I think everyone knows this is my job here.', "When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.", "So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.", "I'm a pretty competitive girl.", "I say my hellos, but I'm not sending any players flowers as well.", "Uhm, I'm not really friendly or close to many players.", "I have not a lot of friends away from the courts.'", 'When she said she is not really close to a lot of players, is that something strategic that she is doing?', "Is it different on the men's tour than the women's tour? '", 'No, not at all.', "I

In [240]:
dataAfterLemmaFilter = []
dataAfterPronounFilter = []
dataAfterStopwordsFilter =[]
dataAfterPunctuationsFilter =[]
dataAfterNounFilter =[]
cleanSentences =[]

for sent in sentences:
    docx = nlp(sent)
    for token in docx:
        dataAfterLemmaFilter.append(token.lemma_)
    for token in dataAfterLemmaFilter:
        if token != "-PRON-":
            dataAfterPronounFilter.append(token.lower().strip())    
    for token in dataAfterPronounFilter:
        if token != stopwords:
            dataAfterStopwordsFilter.append(token)
    for token in dataAfterStopwordsFilter:
        if token not in punctuations:
            dataAfterPunctuationsFilter.append(token)
    cleanSentences.append(dataAfterPunctuationsFilter)
    dataAfterLemmaFilter = []
    dataAfterPronounFilter = []
    dataAfterStopwordsFilter =[]
    dataAfterPunctuationsFilter =[]
    dataAfterNounFilter =[]
  

In [241]:
cleanSentences


[['maria',
  'sharapova',
  'have',
  'basically',
  'no',
  'friend',
  'as',
  'tennis',
  'player',
  'on',
  'the',
  'wta',
  'tour'],
 ['the',
  'russian',
  'player',
  'have',
  'no',
  'problem',
  'in',
  'openly',
  'speak',
  'about',
  'and',
  'in',
  'a',
  'recent',
  'interview',
  'say',
  'do',
  'not',
  'really',
  'hide',
  'any',
  'feeling',
  'too',
  'much'],
 ['think', 'everyone', 'know', 'this', 'be', 'job', 'here'],
 ['when',
  'be',
  'on',
  'the',
  'court',
  'or',
  'when',
  'be',
  'on',
  'the',
  'court',
  'playing',
  'be',
  'a',
  'competitor',
  'and',
  'want',
  'to',
  'beat',
  'every',
  'single',
  'person',
  'whether',
  'be',
  'in',
  'the',
  'locker',
  'room',
  'or',
  'across',
  'the',
  'net'],
 ['so',
  'be',
  'not',
  'the',
  'one',
  'to',
  'strike',
  'up',
  'a',
  'conversation',
  'about',
  'the',
  'weather',
  'and',
  'know',
  'that',
  'in',
  'the',
  'next',
  'few',
  'minute',
  'have',
  'to',
  'go',
  'a

#### Introducing GloVe

In [223]:
# Extract word vectors 
word_embeddings = {} 
f = open('D:/DataScienceCollection/GloVe/glove.6B.100d.txt', encoding='utf-8') 
for line in f: 
    values = line.split() 
    word = values[0] 
    coefs = np.asarray(values[1:], dtype='float32')   
    word_embeddings[word] = coefs 
f.close()

In [224]:
len(word_embeddings)

400000

#### Converting List of Sentences into DataFrame.

In [225]:
sentencesAsDataFrameSeries = pd.DataFrame(sentences)

In [226]:
sentencesAsDataFrameSeries.columns = ['sentences']

In [227]:
sentencesAsDataFrameSeries

Unnamed: 0,sentences
0,Maria Sharapova has basically no friends as te...
1,The Russian player has no problems in openly s...
2,I think everyone knows this is my job here.
3,When I'm on the courts or when I'm on the cour...
4,So I'm not the one to strike up a conversation...
5,I'm a pretty competitive girl.
6,"I say my hellos, but I'm not sending any playe..."
7,"Uhm, I'm not really friendly or close to many ..."
8,I have not a lot of friends away from the cour...
9,When she said she is not really close to a lot...


#### Cleaning & Basic Computation on each row of Dataframe.

In [228]:
dataAfterLemmaFilter = []
dataAfterPronounFilter = []
dataAfterStopwordsFilter =[]
dataAfterPunctuationsFilter =[]
dataAfterNounFilter =[]

In [229]:
stopwords = list(STOP_WORDS)
punctuations = string.punctuation

for index,row in sentencesAsDataFrameSeries.iterrows():
    docx = nlp(row['sentences'])
    for token in docx:
        dataAfterLemmaFilter.append(token.lemma_)
    for token in dataAfterLemmaFilter:
        if token != "-PRON-":
            dataAfterPronounFilter.append(token.lower().strip())    
    for token in dataAfterPronounFilter:
        if token != stopwords:
            dataAfterStopwordsFilter.append(token)
    for token in dataAfterStopwordsFilter:
        if token not in punctuations:
            dataAfterPunctuationsFilter.append(token)
    sentencesAsDataFrameSeries.at[index, "sentences"] = dataAfterPunctuationsFilter
    dataAfterLemmaFilter = []
    dataAfterPronounFilter = []
    dataAfterStopwordsFilter =[]
    dataAfterPunctuationsFilter =[]
    dataAfterNounFilter =[]


In [234]:
cleanSentencesAsListInDataFrame=sentencesAsDataFrameSeries

In [235]:
cleanSentencesAsListInDataFrame

Unnamed: 0,sentences
0,"[maria, sharapova, have, basically, no, friend..."
1,"[the, russian, player, have, no, problem, in, ..."
2,"[think, everyone, know, this, be, job, here]"
3,"[when, be, on, the, court, or, when, be, on, t..."
4,"[so, be, not, the, one, to, strike, up, a, con..."
5,"[be, a, pretty, competitive, girl]"
6,"[say, hello, but, be, not, send, any, player, ..."
7,"[uhm, be, not, really, friendly, or, close, to..."
8,"[have, not, a, lot, of, friend, away, from, th..."
9,"[when, say, be, not, really, close, to, a, lot..."


In [None]:
for

## Similarity Matrix Preparation

In [214]:
sentence_vectors = []
for i in cleanSentences:
    if len(i) != 0:
        v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split())+0.001)
    else:
        v = np.zeros((100,))
    sentence_vectors.append(v)

In [215]:
sim_mat = np.zeros([len(sentences), len(sentences)])

In [216]:
from sklearn.metrics.pairwise import cosine_similarity

In [218]:
for i in range(len(sentences)):
    for j in range(len(sentences)):
        if i != j:
            sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,100), sentence_vectors[j].reshape(1,100))[0,0]

IndexError: list index out of range

In [None]:
import networkx as nx

nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

In [None]:
ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)

In [None]:
# Extract top 10 sentences as the summary
for i in range(10):
    print(ranked_sentences[i][1])