In [1]:
"""
Created on Mon Apr 30 18:46:02 2019

@author: Bhumika_Patoliya
"""

import nltk
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx


In [8]:
def read_article(file_name):
    """Read a text file and split it into tokenized sentence fragments.

    Each line of the file is split on the literal separator ``". "`` and
    every resulting fragment is split on single spaces into a token list.

    Parameters
    ----------
    file_name : str
        Path of the text file to read.

    Returns
    -------
    list of list of str
        One token list per fragment, in file order, with the final fragment
        removed. An empty file yields ``[]``.

    Notes
    -----
    * The last fragment is always discarded, whether or not it is empty.
    * Tokens are not cleaned: punctuation and trailing newlines stay
      attached, and consecutive spaces produce empty-string tokens.
    """
    # The context manager closes the handle even if reading raises.
    with open(file_name, "r") as file:
        filedata = file.readlines()

    # extend() grows one list in place instead of rebuilding it per line.
    article = []
    for line in filedata:
        article.extend(line.split(". "))

    # NOTE(review): str.replace matches a literal substring, not a regular
    # expression, so this only affects text containing the exact characters
    # "[^a-zA-Z]". Kept as-is so existing output does not change.
    sentences = [
        fragment.replace("[^a-zA-Z]", " ").split(" ") for fragment in article
    ]

    # Guarded so an empty file returns [] instead of raising IndexError.
    if sentences:
        sentences.pop()

    return sentences


In [9]:
def sentence_similarity(sent1, sent2, stopwords=None):
    """Return the cosine similarity of two tokenized sentences.

    Both sentences are lower-cased, turned into word-count vectors over
    their combined vocabulary, and compared by cosine similarity. Words
    found in ``stopwords`` are left out of the counts.

    Parameters
    ----------
    sent1, sent2 : list of str
        Token lists to compare.
    stopwords : iterable of str, optional
        Words to ignore, compared against the lower-cased tokens.
        Defaults to no stop words.

    Returns
    -------
    float
        Cosine similarity of the two count vectors, or ``0.0`` when either
        sentence has no countable words.

    Notes
    -----
    Tokens are compared verbatim after lower-casing, so empty-string
    tokens are counted like any other word.
    """
    # The parameter name shadows any module-level ``stopwords`` inside this
    # function; it is kept because callers may pass it by keyword.
    excluded = set(stopwords) if stopwords is not None else set()

    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]

    # Assign each distinct word a vector position once, so the counting
    # loops do a dict lookup instead of a linear list.index() scan.
    index_of = {word: pos for pos, word in enumerate(set(sent1 + sent2))}

    vector1 = np.zeros(len(index_of))
    vector2 = np.zeros(len(index_of))

    # build the vector for the first sentence
    for w in sent1:
        if w not in excluded:
            vector1[index_of[w]] += 1

    # build the vector for the second sentence
    for w in sent2:
        if w not in excluded:
            vector2[index_of[w]] += 1

    norm_product = np.sqrt(np.dot(vector1, vector1)) * np.sqrt(np.dot(vector2, vector2))

    # An all-zero vector has no direction; dividing by its zero norm would
    # give NaN, so such a pair is treated as having no similarity.
    if norm_product == 0:
        return 0.0

    return float(np.dot(vector1, vector2) / norm_product)
 


In [10]:
def build_similarity_matrix(sentences, stop_words):
    """Build the pairwise similarity matrix for a list of tokenized sentences.

    Parameters
    ----------
    sentences : list of list of str
        Token lists, one per sentence.
    stop_words : list of str
        Stop words forwarded to ``sentence_similarity``.

    Returns
    -------
    numpy.ndarray
        Square array of shape ``(len(sentences), len(sentences))`` where
        entry ``[i][j]`` is the similarity of sentence ``i`` to sentence
        ``j``. The diagonal is left at zero.
    """
    count = len(sentences)
    matrix = np.zeros((count, count))

    for row, left in enumerate(sentences):
        for col, right in enumerate(sentences):
            # A sentence is never compared with itself.
            if row != col:
                matrix[row][col] = sentence_similarity(left, right, stop_words)

    return matrix


In [39]:
def generate_summary(file_name, top_n=5):
    """Print an extractive summary of a text file.

    The file is split into sentences, a pairwise similarity matrix is
    built, the sentences are ranked with PageRank over that matrix, and the
    highest-ranked sentences are joined into the summary. Both the
    similarity matrix and the summary are printed; nothing is returned.

    Parameters
    ----------
    file_name : str
        Path of the text file to summarize.
    top_n : int, optional
        Maximum number of sentences in the summary (default 5). If the file
        has fewer sentences, all of them are used.
    """
    # Requires the NLTK stop-word corpus; fetch it once with
    # nltk.download("stopwords") if it is not installed.
    stop_words = stopwords.words('english')

    # Step 1 - Read the text and split it into tokenized sentences
    sentences = read_article(file_name)

    # Step 2 - Generate the similarity matrix across sentences
    sentence_similarity_matrix = build_similarity_matrix(sentences, stop_words)
    print("Similarity Matrix:\n", sentence_similarity_matrix)

    # Step 3 - Rank sentences using the similarity matrix as a weighted graph
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Step 4 - Sort by rank (highest first) and pick the top sentences
    ranked_sentence = sorted(
        ((scores[i], s) for i, s in enumerate(sentences)), reverse=True
    )

    # Clamp so that asking for more sentences than exist does not raise
    # IndexError; a negative request selects nothing.
    selected = max(0, min(top_n, len(ranked_sentence)))
    summarize_text = [" ".join(tokens) for _, tokens in ranked_sentence[:selected]]

    # Step 5 - Output the summarized text
    print("Summary Text: \n", ". ".join([x.strip() for x in summarize_text]) + ".")


In [38]:
if __name__ == "__main__":
    # Summarize text_pos.txt using its two highest-ranked sentences.
    generate_summary("text_pos.txt", top_n=2)

Similarity Matrix:
 [[0.         0.988399   0.99016367 ... 0.96159993 0.97317527 0.98222029]
 [0.988399   0.         0.98727843 ... 0.9545366  0.96844804 0.9793582 ]
 [0.99016367 0.98727843 0.         ... 0.95624081 0.97017708 0.98110673]
 ...
 [0.96159993 0.9545366  0.95624081 ... 0.         0.9380024  0.94856957]
 [0.97317527 0.96844804 0.97017708 ... 0.9380024  0.         0.96239405]
 [0.98222029 0.9793582  0.98110673 ... 0.94856957 0.96239405 0.        ]]


Summary Text: 
I  did  not  feel  rushed  one  bit  which  is  nice  because  I  have  visited  an  urgent  care  before  and  I  was  in  and  out  so  quickly  and  left  in  pain  exactly  as  I  was  before  with  no  explanation  for  why.I  definitely  recommend  this  office  for  anyone  in  need  of  health  services  and  wonderful  patient  services. Definitely  one  of  the  best  urgent  care  experiences  I've  had.

In [40]:
if __name__ == "__main__":
    # Summarize text_neg.txt using its two highest-ranked sentences.
    generate_summary("text_neg.txt", top_n=2)

Similarity Matrix:
 [[0.         0.91520863 0.93665413 ... 0.94803414 0.         0.96088998]
 [0.91520863 0.         0.90971765 ... 0.92077039 0.23570226 0.93325653]
 [0.93665413 0.90971765 0.         ... 0.94234621 0.         0.95905548]
 ...
 [0.94803414 0.92077039 0.94234621 ... 0.         0.         0.96672935]
 [0.         0.23570226 0.         ... 0.         0.         0.        ]
 [0.96088998 0.93325653 0.95905548 ... 0.96672935 0.         0.        ]]


Summary Text: 
I  don't  do  that. About  a  month  later  I  receive  a  bill  in  the  mail  stating  that  I  never  paid  the  copay,  and  also  charging  me  an  additional  $35  for  a  referral  Dr  Lee  had  given  to  me,  which  I  did  not  even  ask  for! I  called  Ara  and  the  women  i  spoke  to  (who  was  very  unprofessional  btw)  said  there  is  nothing  they  can  do  and  they  will  not  remove  that  charge  from  the  invoice.

In [41]:
if __name__ == "__main__":
    # Summarize text_neu.txt using its two highest-ranked sentences.
    generate_summary("text_neu.txt", top_n=2)

Similarity Matrix:
 [[0.         0.95923051 0.96723882 ... 0.96776208 0.95694875 0.97207427]
 [0.95923051 0.         0.95694267 ... 0.95746035 0.94676214 0.96172665]
 [0.96723882 0.95694267 0.         ... 0.97349933 0.9626219  0.97783709]
 ...
 [0.96776208 0.95746035 0.97349933 ... 0.         0.96314266 0.98040434]
 [0.95694875 0.94676214 0.9626219  ... 0.96314266 0.         0.96743427]
 [0.97207427 0.96172665 0.97783709 ... 0.98040434 0.96743427 0.        ]]


Summary Text: 
Finally,  the  doctor  came  in,  he  didn't  seem  he  had  a  clue  and  didn't  bother  to  read  any  of  the  details  in  my  chart  (which  doesn't  actually  have  much  on  there,  it  would  have  take  him  maybe  5  mins  to  look  over...)  He  rushed  me  out  in  a  mere  2  minutes,  did  not  answer  any  questions  even  though  I  wanted  to  get  some  information  on  getting  pregnant  (kind  of  important  for  someone  that  wants  to  become  a  1st-time  mommy)  I  left  with  nothing  bu