In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm

# Allow up to 200 characters per column when pandas displays a frame, so long text cells stay readable.
pd.options.display.max_colwidth = 200

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation / future-behaviour warnings raised by libraries — confirm that is intended.
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print at most the first five file paths found in each directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    preview_paths = [os.path.join(folder, name) for name in files[:5]]
    for path in preview_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import re

import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

# Fetch the NLTK resources this notebook relies on (one-time execution per environment).
nltk.download('punkt')      # sentence tokenizer models used by older NLTK releases
nltk.download('punkt_tab')  # tokenizer tables that newer NLTK releases load for sent_tokenize
nltk.download('stopwords')  # stop-word lists

In [None]:
# Load the articles CSV from the Kaggle input directory.
articles_csv = '/kaggle/input/nlp-specialization-data/tennis_articles_v4.csv'
df = pd.read_csv(articles_csv)

# Report (rows, columns), then show the first three rows as the cell output.
print(df.shape)
df.head(3)

In [None]:
# Split the text of each article into sentences (one list of sentences per article).
sentences = [sent_tokenize(article) for article in df['article_text']]

In [None]:
# Flatten the per-article sentence lists into one flat list, keeping the original order.
flat_sentences = []
for article_sentences in sentences:
    flat_sentences.extend(article_sentences)
sentences = flat_sentences

In [None]:
# Remove punctuation, numbers and special characters: every character that is
# not an ASCII letter becomes a space.
# regex=True is required. Since pandas 2.0, str.replace treats the pattern as a
# literal string by default, which would leave the sentences untouched.
letters_only = pd.Series(sentences).str.replace("[^a-zA-Z]", " ", regex=True)

# make alphabets lowercase
clean_sentences = [s.lower() for s in letters_only]

In [None]:
# English stop-word list taken from the NLTK stopwords corpus.
stop_words = stopwords.words('english')

In [None]:
# function to remove stopwords
def remove_stopwords(sen):
    """Return the tokens of ``sen`` that are not stop words, joined by single spaces.

    ``sen`` is an iterable of tokens. ``stop_words`` is looked up as a
    module-level name at call time.
    """
    kept_tokens = []
    for token in sen:
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

In [None]:
# Drop stop words from every cleaned sentence; each sentence is split on whitespace first.
filtered_sentences = []
for sentence in clean_sentences:
    filtered_sentences.append(remove_stopwords(sentence.split()))
clean_sentences = filtered_sentences

In [None]:
# Show how many cleaned sentences there are, then preview the first 15 as the cell output.
print(len(clean_sentences))
clean_sentences[:15]

In [None]:
from gensim.scripts.glove2word2vec import glove2word2vec
# Path of the GloVe vectors text file that the embedding-loading cell opens.
# presumably the 100-dimensional "6B" set, judging by the file name — confirm
glove_input_file = '/kaggle/input/glove6b/glove.6B.100d.txt'

In [None]:
# Extract word vectors: map each word (first field of a line) to a float32
# array built from the remaining fields.
word_embeddings = {}
# The context manager closes the file even if parsing a line raises.
with open(glove_input_file, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs

In [None]:
# Number of entries loaded into the embedding dictionary.
len(word_embeddings)

In [None]:
# Shape of the vector stored for the word 'the' (a quick check on the embedding size).
word_embeddings['the'].shape

In [None]:
# Build one vector per cleaned sentence: the sum of its word vectors divided by
# (word count + 0.001). Words without an embedding contribute a zero vector,
# and an empty sentence maps to a zero vector.
sentence_vectors = []
for sentence in clean_sentences:
    if len(sentence) == 0:
        sentence_vectors.append(np.zeros((100,)))
        continue
    tokens = sentence.split()
    token_vectors = [word_embeddings.get(token, np.zeros((100,))) for token in tokens]
    sentence_vectors.append(sum(token_vectors) / (len(tokens) + 0.001))

In [None]:
# One vector was appended per cleaned sentence, so this is the sentence count.
len(sentence_vectors)

The next step is to find similarities among the sentences. We will use cosine similarity to find similarity between a pair of sentences. Let's create an empty similarity matrix for this task and populate it with cosine similarities of the sentences.

In [None]:
# similarity matrix: square, one row and one column per sentence, zero-initialised
n_sentences = len(sentences)
sim_mat = np.zeros((n_sentences, n_sentences))
sim_mat.shape

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Populate the similarity matrix with the cosine similarity of every pair of
# sentence vectors. A single vectorised call replaces one scikit-learn call per
# ordered pair and no longer hard-codes the embedding dimension.
n_sentences = len(sentences)
sim_mat = np.zeros((n_sentences, n_sentences))
if n_sentences > 1:
    # Stack the sentence vectors into an (n_sentences, dim) float64 matrix.
    sentence_matrix = np.asarray(sentence_vectors, dtype=np.float64)
    sim_mat = cosine_similarity(sentence_matrix)
    # A sentence is not compared with itself: keep the diagonal at zero.
    np.fill_diagonal(sim_mat, 0.0)

print(sim_mat.shape)
sim_mat[:5,:5]

In [None]:
import networkx as nx

# Build a graph from the similarity matrix (from_numpy_array reads it as an adjacency matrix),
# then compute a PageRank score for every node.
# presumably nodes are numbered 0..n-1 in sentence order — the ranking step indexes scores that way
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

In [None]:
# Pair each sentence with its score, then order the pairs from highest to lowest.
ranked_sentences = [(scores[idx], text) for idx, text in enumerate(sentences)]
ranked_sentences.sort(reverse=True)

In [None]:
# Specify number of sentences to form the summary
sn = 10

# Generate summary: print the text of the top-ranked sentences.
# Slicing caps the loop at the number of ranked sentences, so a corpus with
# fewer than `sn` sentences no longer raises IndexError.
for _score, sentence in ranked_sentences[:sn]:
    print(sentence)

### Further Readings : 
* Text summarization using TextRank in NLP - https://medium.com/data-science-in-your-pocket/text-summarization-using-textrank-in-nlp-4bce52c5b390
* Find the original article here https://www.analyticsvidhya.com/blog/2018/11/introduction-text-summarization-textrank-python/
* Cosine similarity explained - https://medium.com/datadriveninvestor/cosine-similarity-cosine-distance-6571387f9bf8