In [65]:
import re
import spacy
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx


In [38]:
nlp = spacy.load('en_core_web_sm')
stop_words = stopwords.words('english')

In [3]:
data_df = pd.read_csv("data/tennis_articles_v4.csv")
data_df.head()

Unnamed: 0,article_id,article_text,source
0,1,Maria Sharapova has basically no friends as te...,https://www.tennisworldusa.org/tennis/news/Mar...
1,2,"BASEL, Switzerland (AP), Roger Federer advance...",http://www.tennis.com/pro-game/2018/10/copil-s...
2,3,Roger Federer has revealed that organisers of ...,https://scroll.in/field/899938/tennis-roger-fe...
3,4,Kei Nishikori will try to end his long losing ...,http://www.tennis.com/pro-game/2018/10/nishiko...
4,5,"Federer, 37, first broke through on tour over ...",https://www.express.co.uk/sport/tennis/1036101...


In [4]:
data_df.shape

(8, 3)

In [75]:
# split the the text in the articles into sentences
sentences = []
sentences_ = []
for para in data_df['article_text']:
    document = nlp(para)
    sentences.append([sent for sent in document.sents])
    sentences_.append([sent for sent in document.sents])


In [25]:
sentences

[[Maria Sharapova has basically no friends as tennis players on the WTA Tour.,
  The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much.,
  I think everyone knows this is my job here.,
  When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.,
  So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.,
  I'm a pretty competitive girl.,
  I say my hellos, but I'm not sending any players flowers as well.,
  Uhm, I'm not really friendly or close to many players.,
  I have not a lot of friends away from the courts.',
  When she said she is not really close to a lot of players, is that something strategic that she is doing?,
  Is it different on the men's tour than the women's tour? ',
  No, not at all.,
  I

In [26]:
# flatten the list
sentences = [str(y) for x in sentences for y in x]

In [40]:
def remove_punct(string):
    res = re.sub(r'[^a-zA-Z]', ' ', string)
    res = " ".join([i for i in res.split() if len(i)>0])
    return res


# function to remove stopwords
def remove_stopwords(sen):
    sen_new = " ".join([i for i in sen if i not in stop_words])
    return sen_new

In [69]:
# Remove punctuation and make alphabets lowercase
clean_sentences = [remove_punct(s.lower()) for s in sentences]
clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]
# clean_sentences = [i for i in clean_sentences if len(i.split()) > 5]
clean_sentences

['maria sharapova basically friends tennis players wta tour',
 'russian player problems openly speaking recent interview said really hide feelings much',
 'think everyone knows job',
 'courts court playing competitor want beat every single person whether locker room across net',
 'one strike conversation weather know next minutes go try win tennis match',
 'pretty competitive girl',
 'say hellos sending players flowers well',
 'uhm really friendly close many players',
 'lot friends away courts',
 'said really close lot players something strategic',
 'different men tour women tour',
 '',
 'think sport mean friends everyone categorized tennis player going get along tennis players',
 'think every person different interests',
 'friends completely different jobs interests met different parts life',
 'think everyone thinks tennis players greatest friends',
 'ultimately tennis small part',
 'many things interested',
 'basel switzerland ap roger federer advanced th swiss indoors final career b

In [46]:
# !wget http://nlp.stanford.edu/data/glove.6B.zip

In [53]:
# Extract word vectors
word_embeddings = {}
f = open('model/glove.6B.50d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

In [70]:
# Vectorization of sentences
sentence_vectors = []
for i in clean_sentences:
    if len(i) != 0:
        v = sum([word_embeddings.get(w, np.zeros((50,))) for w in i.split()])/(len(i.split())+0.001)
    else:
        v = np.zeros((50,))
    sentence_vectors.append(v)

In [71]:
len(sentence_vectors)

121

In [72]:
# similarity matrix
sim_mat = np.zeros([len(sentence_vectors), len(sentence_vectors)])
sim_mat.shape

(121, 121)

In [73]:
for i in range(len(clean_sentences)):
    for j in range(len(clean_sentences)):
        if i != j:
            sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,50), sentence_vectors[j].reshape(1,50))[0,0]


In [76]:
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences_)), reverse=True)
ranked_sentences

[(0.00905397859227416,
  [Federer, 37, first broke through on tour over two decades ago and he has since gone on to enjoy a glittering career.,
   The 20-time Grand Slam winner is chasing his 99th ATP title at the Swiss Indoors this week and he faces Jan-Lennard Struff in the second round on Thursday (6pm BST).,
   Davenport enjoyed most of her success in the late 1990s and her third and final major tournament win came at the 2000 Australian Open.,
   But she claims the mentality of professional tennis players slowly began to change after the new millennium.,
   "It seems pretty friendly right now," said Davenport.,
   "I think there is a really nice environment and a great atmosphere, especially between some of the veteran players helping some of the younger players out.,
   "It's a very pleasant atmosphere, I'd have to say, around the locker rooms.,
   "I felt like the best weeks that I had to get to know players when I was playing were the Fed Cup weeks or the Olympic weeks, not nec

In [77]:
# Specify number of sentences to form the summary
sn = 10

# Generate summary
for i in range(sn):
    print(ranked_sentences[i][1])
    

[Federer, 37, first broke through on tour over two decades ago and he has since gone on to enjoy a glittering career., The 20-time Grand Slam winner is chasing his 99th ATP title at the Swiss Indoors this week and he faces Jan-Lennard Struff in the second round on Thursday (6pm BST)., Davenport enjoyed most of her success in the late 1990s and her third and final major tournament win came at the 2000 Australian Open., But she claims the mentality of professional tennis players slowly began to change after the new millennium., "It seems pretty friendly right now," said Davenport., "I think there is a really nice environment and a great atmosphere, especially between some of the veteran players helping some of the younger players out., "It's a very pleasant atmosphere, I'd have to say, around the locker rooms., "I felt like the best weeks that I had to get to know players when I was playing were the Fed Cup weeks or the Olympic weeks, not necessarily during the tournaments. ", And even t

IndexError: list index out of range