### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import nltk
import re

## Reading article from Wikipedia

In [None]:
from bs4 import BeautifulSoup
import requests

# Download the requested Wikipedia article and collect the text of all of its
# <p> elements into one string (`content`).
from urllib.parse import quote

subject = input("Enter the wikipedia topic to be summarised")
# Percent-encode the title so characters such as '?', '#' or '%' in the topic
# cannot change the meaning of the URL.
base_url = "https://en.wikipedia.org/wiki/" + quote(subject.strip())
# A timeout prevents the cell from hanging forever on a stalled connection.
page = requests.get(base_url, timeout=30)
# Fail loudly on 404/5xx instead of silently summarising an error page.
page.raise_for_status()

soup = BeautifulSoup(page.content, 'html.parser')
paragraphs = soup.find_all('p')

# join() avoids repeatedly copying the growing string.
content = "".join(paragraph.text for paragraph in paragraphs)

content

## Sentence Tokenization

In [None]:
from nltk.tokenize import word_tokenize,sent_tokenize
# Split the scraped article text into sentences; the result is displayed below.
sentences = sent_tokenize(content)
sentences


In [None]:
# Inspect which container type the tokenizer returned.
type(sentences)

## Importing Word Embeddings

In [None]:
# Load the GloVe vectors into a dict mapping each word to a float32 array.
# Each line of the file is split on whitespace: the first token is the word,
# the remaining tokens are its coefficients.
word_embeddings = {}
# The context manager closes the file even if a line fails to parse.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines rather than failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs

len(word_embeddings)

In [None]:
# Normalise each sentence: every character that is not an ASCII letter becomes
# a space, and the result is lower-cased.
clean_sentences = [
    re.sub("[^a-zA-Z]", " ", text).lower()
    for text in sentences
]
clean_sentences


In [None]:
from nltk.corpus import stopwords
# English stop-word list from NLTK, used later to filter tokens.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be downloaded
# beforehand (nltk.download('stopwords')) — confirm for a fresh environment.
stop_words = stopwords.words('english')
stop_words

## Removing Stop Words

In [None]:
def remove_stopwords(sentence):
    """Return the items of `sentence` not in `stop_words`, joined by spaces.

    `sentence` is iterated item by item, so callers pass a list of tokens
    rather than a raw string.
    """
    kept_tokens = filter(lambda token: token not in stop_words, sentence)
    return " ".join(kept_tokens)
# Split each cleaned sentence on whitespace and drop its stop words; the
# result is again one space-joined string per sentence.
clean_sentences = [remove_stopwords(sent.split()) for sent in clean_sentences]
clean_sentences

## Visualizing Word Frequency

In [None]:
from collections import Counter

# Count how often each word occurs across all cleaned sentences.
# Counter keeps first-seen order, so the resulting dict has the same key order
# as a manual counting loop; empty sentences simply contribute no words.
word_dict = dict(
    Counter(w for sentence in clean_sentences for w in sentence.split())
)
# One row per word, indexed by the word itself, with its count as the column.
word_counts = pd.DataFrame({'words':list(word_dict.keys()), 'counts':list(word_dict.values())})
word_counts = word_counts.set_index('words')

In [None]:
# Keep the rows with the largest 'counts' values.
# NOTE(review): the variable is named top_ten but 20 rows are requested —
# confirm which of the two is intended.
top_ten = word_counts.nlargest(20, ['counts'])

### Horizontal Bar graph

In [None]:
# Horizontal bar chart of the selected word counts; with x=None the frame's
# index is used for the bar labels.
ax = top_ten.plot.barh(x = None, y = 'counts', rot = 0)

### Word Cloud

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Build a word cloud from the word -> count mapping and display it without
# axes.
worldcloud = WordCloud(
    width=900,
    height=500,
    max_words=1000,
    relative_scaling=1,
    normalize_plurals=False,
)
worldcloud = worldcloud.generate_from_frequencies(word_dict)

plt.imshow(worldcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
         

## Calculating Sentence vectors

In [None]:
# Represent every cleaned sentence as the (approximate) mean of the GloVe
# vectors of its words. Words missing from `word_embeddings` contribute a
# zero vector; sentences with no words become an all-zero vector.
sentence_vectors = []
for sentence in clean_sentences:
    # Tokenise once and branch on the tokens themselves, so a whitespace-only
    # sentence also gets a proper 100-d zero vector instead of a scalar.
    words = sentence.split()
    if words:
        # The 0.001 in the denominator only rescales the vector slightly;
        # cosine similarity is unaffected by that scaling.
        v = sum(word_embeddings.get(w, np.zeros((100,))) for w in words) / (len(words) + 0.001)
    else:
        v = np.zeros((100,))
    sentence_vectors.append(v)
sentence_vectors[:10]

## Generate similarity matrix from cosine similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Number of sentences extracted from the article.
len(sentences)

In [None]:
# Pairwise cosine similarity between all sentence vectors, with the diagonal
# (a sentence compared with itself) forced to 0.
# A single vectorised call replaces one cosine_similarity call per pair.
if sentence_vectors:
    # Stack the 100-d vectors into an (n_sentences, 100) float64 matrix.
    sim_mat = cosine_similarity(np.vstack(sentence_vectors).astype(np.float64))
    np.fill_diagonal(sim_mat, 0.0)
else:
    # No sentences: keep an empty square matrix instead of failing in vstack.
    sim_mat = np.zeros([0, 0])
sentences

In [None]:
# Display the sentence-to-sentence similarity matrix.
sim_mat

## Implementing the PageRank algorithm

In [None]:
import networkx as nx

# Interpret the similarity matrix as the adjacency matrix of a graph whose
# nodes are the sentence indices, then rank the nodes with PageRank.
nx_graph = nx.from_numpy_array(sim_mat)
# `scores` is later looked up by sentence index (scores[i]).
scores = nx.pagerank(nx_graph)
scores

In [None]:
# Pair every sentence with its PageRank score and sort from highest to lowest
# score (ties fall back to comparing the sentence text).
ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse = True)

# Preview the two best sentences; slicing keeps this safe when the article
# produced fewer than two sentences.
for _, top_sentence in ranked_sentences[:2]:
    print(top_sentence)

## Extracting summary


In [None]:
# Build the summary from the (up to) ten highest-ranked sentences.
# Slicing avoids an IndexError when the article has fewer than ten sentences.
# The pattern removes bracketed citation markers such as "[12]" or "[*]"; the
# brackets are escaped so they are matched literally.
cited_stripped = [
    re.sub(r'\[[0-9*]+\]', '', ranked_sentence)
    for _, ranked_sentence in ranked_sentences[:10]
]
# Newlines are removed once, after joining, instead of on every iteration.
# NOTE(review): sentences are concatenated without any separator — confirm
# whether a space between sentences is wanted.
summary = "".join(cited_stripped).replace("\n", "")

summary