# Text summarisation using nltk and cosine similarity

### Import important libraries

In [143]:
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
from nltk.tokenize import sent_tokenize

### Create a function to read input file

In [144]:
def read_csfile(cs_file):
    file = open(cs_file, "r")
    filedata = file.readlines()
    article = filedata[0].split(". ")
    sentences = []
    
    for sentence in article:
        print(sentence)
        sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
    sentences.pop() 
    

    return sentences

### Create a function to analyse the sentence similarity

##### We first convert the sentences into vectors and then find cosine distance between them

In [145]:
def sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []
 
    sent1 = [w.lower() for w in sent1]  
    sent2 = [w.lower() for w in sent2]
 
    all_words = list(set(sent1 + sent2)) #this line will give each distinct letter in the sentence
 
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
 
    # build the vector for the first sentence
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
 
    # build the vector for the second sentence
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
 
    return 1 - cosine_distance(vector1, vector2)

### Build a matrix of sentence similarity 

In [146]:
def build_similarity_matrix(sentences, stop_words):
    # Create an empty similarity matrix
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
 
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2: #ignore if both are same sentences
                continue 
            similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)

    return similarity_matrix

### Finally call all the above funtions in single function for call summary

In [166]:
def generate_call_summary(cs_file, top_n=2):
    stop_words = stopwords.words('english')
    summarize_text = []

    # Step 1 - Read text anc split it
    sentences =  read_csfile(cs_file)

    # Step 2 - Generate Similary Martix across sentences
    sentence_similarity_martix = build_similarity_matrix(sentences, stop_words)

    # Step 3 - Rank sentences in similarity martix
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Step 4 - Sort the rank and pick top sentences
    ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)    
    print("Indexes of top ranked_sentence order are ", ranked_sentence)    

    for i in range(top_n):
         summarize_text.append(" ".join(ranked_sentence[i][1]))

    # Step 5 - Offcourse, output the summarize texr
    print("Summarize Text: \n", ". ".join(summarize_text))



In [167]:
generate_call_summary( "newsarticle.txt", top_n=2)

In an article in Cell, National Institutes of Health-funded researchers described how they used advanced genetic engineering techniques to transform a bacterial protein into a new research tool that may help monitor serotonin transmission with greater fidelity than current methods.Preclinical experiments, primarily in mice, showed that the sensor could detect subtle, real-time changes in brain serotonin levels during sleep, fear, and social interactions, as well as test the effectiveness of new psychoactive drugs.The study was funded, in part, by the NIH''s Brain Research through Advancing Innovative Neurotechnologies (BRAIN) Initiative which aims to revolutionize our understanding of the brain under healthy and disease conditions.The study was led by researchers in the lab of Lin Tian, PhD, principal investigator at the University of California Davis School of Medicine
Current methods can only detect broad changes in serotonin signaling
In this study, the researchers transformed a nut