# Automatic Summarization of Medical Articles Using Sum Basic
### Author: Rama Thamman

__Solution__: In this notebook we'll use Sum Basic for automatic summarization of medical articles. NIH's (National Institutes of Health) PubMed repository consists of links to hundreds of thousands of medical articles. We will use articles relevant to various types of cancer. We will use the abstract of each article as the "ground truth". We will apply the Sum Basic algorithm to only the body of the PubMed article, without the abstract, to generate an extractive summary. We will use a Java-based implementation of the ROUGE software to evaluate the precision, recall and F1 score of the extractive summary with respect to the ground truth. 

__Step 1: Import required modules__

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
import operator
import nltk
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import stopwords
import re
from pyparsing import ZeroOrMore, Regex
import csv
import urllib2
from bs4 import BeautifulSoup
from nltk.stem.porter import *
import copy

__Step 2: Generate a list of documents__

In [None]:
# NCBI E-utilities efetch URLs (db=pmc), one per article to summarize.
# The trailing comment on each entry is the article's regular PMC page.
urls = [
    'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=1994795',  # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1994795/
    'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=314300',   # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC314300/
    'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=4383356',  # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4383356/
    'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=4596899',  # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4596899/
    'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=4303126',  # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4303126/
    'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=4637461',  # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4637461/
    'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=4690355',  # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4690355/
    'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=3505152',  # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3505152/
    'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=3976810',  # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3976810/
    'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=4061037',  # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4061037/
]

__Step 3: Extract abstracts and document body__

In [None]:
documents = []
abstracts = []
texts = []
print 'Preprocessing documents. This may take a few minutes ...'
for i, url in enumerate(urls):
    print 'Preprocessing document %d ...' % (i+1)
    # Download the document
    my_url = urllib2.urlopen(url)
    raw_doc = BeautifulSoup(my_url.read(), 'xml')
    documents.append(raw_doc)

    # Extract the cleaned abstract
    raw_abstract = raw_doc.abstract
    my_abstract = re.sub(r'<\/?\w+>', r' ', str(raw_abstract)) # remove xml tags
    abstracts.append(my_abstract)

    # Extract the cleaned text
    text = raw_doc.find_all('sec') 
    text = re.sub(r'\\n', r' ', str(text)) # remove newline characters
    text = re.sub(r'<[^>]+>', r' ', str(text)) # remove xml tags
    text = re.sub(r'\[[^\[^\]]+\]', r' ', str(text)) # remove references
    text = re.sub(r'\[', r' ', str(text)) # remove any remaining [
    text = re.sub(r'\]', r' ', str(text)) # remove any remaining ]
    text = re.sub(r'[\s]{2,}', r' ', str(text)) # remove more than a single blank space
    text = re.sub(r'\.\s+,\s+\S', r' ', str(text)) # remove , after a period

    text = text.decode('utf-8')
    texts.append(text)

print 'All documents preprocessed successfully.'
print 'We have %d documents with %d abstracts and %d texts.' % (len(documents), len(abstracts), len(texts))
assert len(documents) == len(abstracts)
assert len(documents) == len(texts)

__Step 4: Split the documents into sentences__

In [None]:
# Split every document into sentences, keeping only the first occurrence of
# each distinct sentence and preserving the original order.
punkttokenizer = PunktSentenceTokenizer()
text_sentences = []

for text in texts:
    seen = set()
    sentences = []
    for sentence in punkttokenizer.tokenize(text):
        if sentence not in seen:
            seen.add(sentence)
            sentences.append(sentence)
    text_sentences.append(sentences)



__Step 5: Configure stop words__

In [None]:
# Stop words: NLTK's English list extended with a few extra tokens
# (citation markers, "et al.", figure references).
words_to_ignore = set(stopwords.words('english'))
words_to_ignore.update([
    '[pubmed]',
    '[pmc free article]',
    '[cross ref]',
    'et',
    'al.',
    'figure',
    'fig',
    'fig.',
])

# Substrings checked by canonicalize_word(): a word that contains any of
# these is replaced by the empty string.
words_to_ignore_contains = ['doi', 'dg']
  


__Step 6: Extract content words__

In [None]:
# Switches read by get_content_words(): both filters are off by default.
remove_stop_words = False
stem_words = False
# Stemmer applied to each word when stem_words is enabled.
stemmer = PorterStemmer()
# Filled below with one list of content words per document.
doc_content_words = []

def get_content_words(sentences):
    """Return the content words found in *sentences*, in reading order.

    Each sentence is tokenized with a pyparsing grammar that keeps
    ``[...]`` groups and ``"..."`` groups together as single tokens and
    otherwise takes runs of non-space characters. Every token is
    lower-cased, optionally dropped as a stop word (``remove_stop_words``),
    optionally stemmed (``stem_words``), and finally passed through
    ``canonicalize_word``; tokens that canonicalize to ``''`` are skipped.

    sentences -- iterable of sentence strings
    returns   -- flat list of processed words across all sentences
    """
    tokenizer = ZeroOrMore(Regex(r'\[[^]]*\]') | Regex(r'"[^"]*"') | Regex(r'[^ ]+'))| Regex(r'"(^")*"')
    collected = []
    for sentence in sentences:
        for token in tokenizer.parseString(sentence):
            token = token.lower()
            # Stop words are matched on the lower-cased, un-stemmed token.
            if remove_stop_words == True and token in words_to_ignore:
                continue
            if stem_words == True:
                token = stemmer.stem(token)

            token = canonicalize_word(token)
            if token != '':
                collected.append(token)
    return collected

def canonicalize_word(word):
    """Normalize a single token; return '' when the token should be dropped.

    Steps, in order:
      1. A token made only of non-word characters and/or underscores is
         dropped.
      2. Every run of digits is replaced by the placeholder "dg".
      3. If the token had no letters before step 2 (a pure number such as
         "1,000"), thousands separators are removed ("dg,dg" -> "dgdg").
      4. A token containing any substring listed in the module-level
         ``words_to_ignore_contains`` is dropped.

    word    -- token string
    returns -- the normalized token, or '' if it was dropped
    """
    # check for words with just special characters
    if re.match(r'^[_\W]+$', word):
        return ""

    # Decide whether the token is purely numeric BEFORE substituting digits:
    # the placeholder "dg" is itself alphabetic, so testing for letters
    # afterwards could never succeed and the separator was never removed.
    has_letters = any(c.isalpha() for c in word)

    # replace numbers with dg
    word = re.sub(r"\d+", "dg", word)
    if word.startswith("dg") and not has_letters:
        word = word.replace(",", "")  # remove thousands separator

    # check for ignore words
    for fragment in words_to_ignore_contains:
        if fragment in word:
            return ""
    return word

# get content words for all documents
print 'Extracting content words for each document. This may take a few minutes ...'
for ii in range(len(text_sentences)):
    sentences = text_sentences[ii]
    c_words = get_content_words(sentences)
    doc_content_words.append(c_words)
    print "Word count for #",ii+1," - ",len(c_words)
    #print c_words

assert len(text_sentences), len(doc_content_words)
print 'All documents processed successfully.'


__Step 7: Compute word probability__

In [None]:
# Compute word probability
docs_word_probability = []
docs_word_frequency=[]
print 'Calculating word probability. This may take a few minutes ...'
for ii in range(len(text_sentences)):    
    content_words_freq = {}
    content_words_tf = {}
    words = doc_content_words[ii]
    for w in words:
        content_words_freq[w] = content_words_freq.get(w, 0) + 1  
    content_words_count = len(doc_content_words[ii])
    for (k, v) in content_words_freq.items():
        content_words_tf[k] = v/float(content_words_count)
    docs_word_probability.append(content_words_tf)
    docs_word_frequency.append(content_words_freq)
    
    #top5 = sorted(content_words_tf, key=content_words_tf.get, reverse=True)[:5]
    #for top_word in top5:
    #    print top_word," score:"+str(docs_word_probability[ii][top_word])," count:", docs_word_frequency[ii][top_word]
    #print "#" * 30
assert len(text_sentences), len(docs_word_probability)
assert len(text_sentences), len(docs_word_frequency)
print 'Done.'


__Step 8: Utility methods__

In [None]:
def print_best_stats(sentence, word_probability, words):
    score = float('-inf')
    best_word = ""
    for w in words:
        if word_probability[w] > score:
            score = word_probability[w]
            best_word = w
    print "best sentence: ",best_sentence
    print "best sentence words: ", words      
    print best_word ," ",score
    print "#" * 30
    
# Utility methods
def read_file(file_name):
    """Return the entire contents of the file at *file_name*.

    The file is opened in text read mode and is closed even if reading
    raises.

    file_name -- path of the file to read
    returns   -- the file contents as a single string
    """
    with open(file_name, "r") as handle:
        return handle.read()

def write_to_file(score, file_name='output.txt'):
    """Write *score* to *file_name* as a three-column CSV file.

    Each row is ``key, first, rest`` where ``first`` is the part of the
    value before its first comma and ``rest`` is everything after it.
    Splitting only on the first comma keeps the second column intact when
    it contains commas itself; a value without any comma gets an empty
    third column instead of raising IndexError.

    score     -- dict mapping a key to a "first,rest" string
    file_name -- output path (default 'output.txt'); overwritten if present
    """
    # 'wb' is the mode the Python 2 csv module expects for its output file.
    with open(file_name, 'wb') as csv_file:
        writer = csv.writer(csv_file)
        for key, value in score.items():
            first, _, rest = value.partition(',')
            writer.writerow([key, first, rest])

__Step 8: Apply Sum Basic__

In [None]:
# Compute sentence ratings
num_sentence_summaries=10
min_content_words=5
doc_summary = []
verbose= False
write_log_to_file = False

# Iterate for each doucment
print 'Appying Sum Basic model. This may take a few minutes ...'
for ii in range(len(text_sentences)):
    print 'Processing doucment', ii+1
    sentences = copy.deepcopy(text_sentences[ii])
    content_words = doc_content_words[ii]
    word_probability = docs_word_probability[ii]
    ratings = {}
    
    # Iterate for number summaries required
    for jj in range(num_sentence_summaries):
        max_value = float("-inf")
        best_sentence_index = 0
        log = {}
        # Iterate for each sentence
        for kk, sentence in enumerate(sentences):
            words = get_content_words([sentence])
            word_freq_avg = 0
            if len(words) >= min_content_words:
                word_freq_sum = sum([word_probability[w] for w in words])
                word_freq_avg = word_freq_sum / len(words)
            if word_freq_avg > max_value:
                max_value = word_freq_avg
                best_sentence_index = kk
            if write_log_to_file:
                content_words_str = " ".join(list(words))
                log[sentence.encode('utf-8')]= str(word_freq_avg) + " , " + content_words_str

        if write_log_to_file:
            write_to_file(log, "log"+str(ii)+"-"+str(jj)+".csv")

        best_sentence = sentences.pop(best_sentence_index)
        ratings[best_sentence] = -len(ratings)
        
        words = get_content_words([best_sentence])
      
        if verbose:
            print_best_stats(best_sentence, word_probability, words)
            
        # update probability
        for w in words:
            word_probability[w] *= word_probability[w]
    doc_summary.append(ratings)
print 'All documents processed successfully.'



__Step 9: Write summaries and ground truth to file__

In [None]:
print 'Saving summaries from Sum Basic model. This may take a few minutes ...'
for ii in range(len(text_sentences)):
    #print 'Writing extractive summary for document %d ...' % (ii+1)
    sentence_ranking = doc_summary[ii]
    sorted_sentence_ranking = sorted(sentence_ranking.items(), key=operator.itemgetter(1), reverse=True)
    sorted_sentence_ranking_list = list(sorted_sentence_ranking) 
    out_file = '.\\rouge\\system\\article%d_system1.txt' % (ii+1)
    with open(out_file, 'w') as f:
        for jj in range (len(sorted_sentence_ranking_list)):
            f.write(sorted_sentence_ranking_list[jj][0])
    
for ii, abstract in enumerate(abstracts):    
    #print 'Writing ground truth for document %d ...' % (ii+1)
    out_file = 'rouge\\reference\\article%d_reference1.txt' % (ii+1)
    with open(out_file, 'w') as f:
        f.write(abstract.strip())
print 'All documents processed successfully.'

__Step 10: Calculate F Score using Rouge__

In [None]:
# Run the ROUGE jar from inside the rouge directory.
%cd rouge
!java -jar rouge2.0_0.2.jar
# Print results.csv (presumably written by the ROUGE run above - confirm).
# NOTE(review): `type` is the Windows shell command for dumping a file.
!type results.csv
# Go back up to the parent directory.
%cd ..