## What is nltk ?

### -> stands for Natural Language Toolkit
### -> Programs for symbolic and statistical natural language processing (NLP)
### -> Lexical Analysis : Word and Text tokenizer
### -> Part-of-speech tagger
### -> Tree model and text chunker for capturing

In [116]:
import nltk

##### Cosine_distance function is used to measure the similarity between two sentences using vectors 

In [117]:
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vinit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [118]:
import numpy as np

In [119]:
import networkx as nx

#### read_article function is used to read the lines of a file and create sentences out of them

In [120]:
def read_article(file_name):
    """Read a text file and split it into tokenised sentences.

    Blank lines are dropped, newline characters are removed, and the
    remaining lines are concatenated with no separator. The resulting text
    is split into sentences on ". " and each sentence is split into words
    on single spaces.

    Args:
        file_name: Path of the text file to read.

    Returns:
        A list of sentences, each sentence being a list of word strings.
    """
    # The context manager closes the file even if reading raises.
    with open(file_name,'r') as file:
        lines = [line.replace('\n','') for line in file if line != '\n']
    text = ''.join(lines)
    # NOTE(review): str.replace matches its first argument literally, not as
    # a regular expression, so this only fires when the text contains that
    # exact character sequence -- confirm what clean-up was intended.
    return [
        sentence.replace("^[a-zA-Z0-9!@#$&()-'+,/\"]"," ").split(" ")
        for sentence in text.split('. ')
    ]
        



#### sentence_similarity function is used to check similarity between two sentences using cosine distance

In [121]:
def sentence_similarity(sent1,sent2,stopwords=None):
    """Return the cosine similarity of two tokenised sentences.

    Both sentences are lower-cased and turned into word-count vectors over
    their combined vocabulary; words found in ``stopwords`` are not counted.

    Args:
        sent1: First sentence, as a list of word strings.
        sent2: Second sentence, as a list of word strings.
        stopwords: Optional collection of (lower-case) words to ignore.

    Returns:
        ``1 - cosine_distance(vector1, vector2)``, or ``0.0`` when either
        sentence has no countable words (empty, or stop words only).
    """
    # A set gives O(1) membership tests for the stop-word filter.
    stop_set = set(stopwords) if stopwords is not None else set()
    words1 = [w.lower() for w in sent1]
    words2 = [w.lower() for w in sent2]

    # Map each distinct word to its vector position once, instead of
    # scanning a list with .index() for every word.
    position = {word: i for i, word in enumerate(set(words1 + words2))}

    vector1 = [0] * len(position)
    vector2 = [0] * len(position)
    for w in words1:
        if w not in stop_set:
            vector1[position[w]] += 1
    for w in words2:
        if w not in stop_set:
            vector2[position[w]] += 1

    # An all-zero vector has zero norm, so the cosine is undefined (division
    # by zero); treat such sentences as sharing nothing.
    if not any(vector1) or not any(vector2):
        return 0.0
    return 1-cosine_distance(vector1,vector2)

#### generate similarity matrix will create the similarity matrix of sentences

In [122]:
def gen_sim_matrix(sentences,stop_words):
    """Build the pairwise sentence-similarity matrix.

    Args:
        sentences: Sequence of tokenised sentences (lists of words).
        stop_words: Words passed through to ``sentence_similarity``.

    Returns:
        A square NumPy array where entry ``[i][j]`` holds the similarity of
        sentence ``i`` and sentence ``j``; the diagonal is left at zero.
    """
    count = len(sentences)
    matrix = np.zeros((count, count))
    for row, first in enumerate(sentences):
        for col, second in enumerate(sentences):
            # A sentence is never compared with itself.
            if row != col:
                matrix[row][col] = sentence_similarity(first, second, stop_words)
    return matrix

#### Below function is used to generate the summary of given text

In [123]:
def generate_summary(file_name,top_n=5):
    """Print an extractive summary of the article stored in ``file_name``.

    The article is split into sentences, a pairwise similarity matrix is
    built, the matrix is turned into a graph and scored with
    ``nx.pagerank``, and the highest-scoring sentences are printed.

    Args:
        file_name: Path of the text file to summarise.
        top_n: Maximum number of sentences to include. If the article has
            fewer sentences, all of them are used.
    """
    stop_words=stopwords.words('english')
    sentences = read_article(file_name)
    sentence_similarity_matrix=gen_sim_matrix(sentences,stop_words)
    sentence_similarity_graph=nx.from_numpy_array(sentence_similarity_matrix)
    scores = nx.pagerank(sentence_similarity_graph)
    # Highest score first; scores are looked up by sentence index.
    ranked_sentence=sorted(((scores[i],s)for i,s in enumerate(sentences)),reverse=True)
    # Slicing clamps to the number of available sentences, so a short
    # article no longer raises IndexError; max() keeps a negative top_n
    # selecting nothing, as range(top_n) did.
    summarize_text=[" ".join(words) for _, words in ranked_sentence[:max(top_n, 0)]]
    print("Summary \n",". ".join(summarize_text))