# Text Summarization

In [1]:
import re
import string
import numpy as np
import matplotlib.pyplot as plt
from nltk import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus.reader.wordnet import NOUN, VERB, ADJ, ADV
from nltk.corpus import brown, stopwords
from nltk.cluster.util import cosine_distance
from operator import itemgetter
%matplotlib

Using matplotlib backend: Qt5Agg


In [2]:
sentences = brown.sents('ca01')

In [3]:
len(sentences)

98

In [4]:
[' '.join(sent) for sent in sentences]

["The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place .",
 "The jury further said in term-end presentments that the City Executive Committee , which had over-all charge of the election , `` deserves the praise and thanks of the City of Atlanta '' for the manner in which the election was conducted .",
 "The September-October term jury had been charged by Fulton Superior Court Judge Durwood Pye to investigate reports of possible `` irregularities '' in the hard-fought primary which was won by Mayor-nominate Ivan Allen Jr. .",
 "`` Only a relative handful of such reports was received '' , the jury said , `` considering the widespread interest in the election , the number of voters and the size of this city '' .",
 "The jury said it did find that many of Georgia's registration and election laws `` are outmoded or inadequate and often ambiguous '' .",
 "It recommended that Fulton legisla

In [5]:
class TextCleaner():
    
    def __init__(self):
        self.stop_words = set(stopwords.words("english"))
        self.punctuations = set(string.punctuation)
        self.pos_tags = {
                NOUN: ['NN', 'NNS', 'NNP', 'NNPS', 'PRP', 'PRP$', 'WP', 'WP$'],
                VERB: ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
                ADJ: ['JJ', 'JJR', 'JJS'],
                ADV: ['RB', 'RBR', 'RBS', 'WRB']
        }


    def _remove_stop_words(self, words):
        return [w for w in words if w not in self.stop_words]
     
    
    def _remove_regex(self):
        self.input_sent = " ".join([w.lower() for w in self.input_sent])
        self.input_sent = re.sub(r"i'm", "i am", self.input_sent)
        self.input_sent = re.sub(r"he's", "he is", self.input_sent)
        self.input_sent = re.sub(r"she's", "she is", self.input_sent)
        self.input_sent = re.sub(r"that's", "that is", self.input_sent)
        self.input_sent = re.sub(r"what's", "what is", self.input_sent)
        self.input_sent = re.sub(r"where's", "where is", self.input_sent)
        self.input_sent = re.sub(r"\'ll", " will", self.input_sent)
        self.input_sent = re.sub(r"\'ve", " have", self.input_sent)
        self.input_sent = re.sub(r"\'re", " are", self.input_sent)
        self.input_sent = re.sub(r"\'d", " would", self.input_sent)
        self.input_sent = re.sub(r"won't", "will not", self.input_sent)
        self.input_sent = re.sub(r"can't", "cannot", self.input_sent)
        self.input_sent = re.sub(r"don't", "do not", self.input_sent)
        patterns = re.finditer("#[\w]*", self.input_sent)
        for pattern in patterns:
            self.input_sent = re.sub(pattern.group().strip(), "", self.input_sent)
        self.input_sent = "".join(ch for ch in self.input_sent if ch not in self.punctuations)
    
    
    def _tokenize(self):
        return word_tokenize(self.input_sent)
    
    
    def _process_content_for_pos(self, words):
        tagged_words = pos_tag(words)
        pos_words = []
        for word in tagged_words:
            flag = False
            for key, value in self.pos_tags.items():
                if word[1] in value:
                    pos_words.append((word[0], key))
                    flag = True
                    break
            if not flag:
                pos_words.append((word[0], NOUN))
        return pos_words
       
                 
    def _remove_noise(self):
        self._remove_regex()
        words = self._tokenize()
        noise_free_words = self._remove_stop_words(words)
        return noise_free_words
    
    
    def _normalize_text(self, words):
        lem = WordNetLemmatizer()
        pos_words = self._process_content_for_pos(words)
        normalized_words = [lem.lemmatize(w, pos=p) for w, p in pos_words]
        return normalized_words
    
    
    def clean_up(self, input_sent):
        self.input_sent = input_sent
        cleaned_words = self._remove_noise()
        cleaned_words = self._normalize_text(cleaned_words)
        return cleaned_words

## PageRank Algorithm

In [6]:
def pagerank(M, eps=1.0e-8, d=0.85):
    N = M.shape[1]
    v = np.random.rand(N, 1)
    v = v / np.linalg.norm(v, 1)
    last_v = np.ones((N, 1), dtype=np.float32) * np.inf
    M_hat = (d * M) + (((1 - d) / N) * np.ones((N, N), dtype=np.float32))
    
    while np.linalg.norm(v - last_v, 2) > eps:
        last_v = v
        v = np.matmul(M_hat, v)
    return v

### Function to calculate cosine similarity among sentences

In [7]:
def sentence_similarity(sent1, sent2):
    text_cleaner = TextCleaner()
    
    sent1 = text_cleaner.clean_up(sent1)
    sent2 = text_cleaner.clean_up(sent2)
    
    all_words = list(set(sent1 + sent2))
    
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    
    for w in sent1:
        vector1[all_words.index(w)] += 1
    
    for w in sent2:
        vector2[all_words.index(w)] += 1
    
    return 1 - cosine_distance(vector1, vector2)

### Similarity Adjacency Matrix for PageRank

In [11]:
def build_similarity_matrix(sentences):
    S = np.zeros((len(sentences), len(sentences)))
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i == j:
                continue
            else:
                S[i][j] = sentence_similarity(sentences[i], sentences[j])
    
    for i in range(len(S)):
        S[i] /= S[i].sum()
    return S

In [12]:
S = build_similarity_matrix(sentences)

In [13]:
S

array([[0.        , 0.03809933, 0.03196713, ..., 0.01191345, 0.01459093,
        0.        ],
       [0.04793482, 0.        , 0.01714968, ..., 0.01278262, 0.01565545,
        0.        ],
       [0.07569435, 0.03227618, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.02576081, 0.02196889, 0.        , ..., 0.        , 0.08413444,
        0.        ],
       [0.03354197, 0.02860469, 0.        , ..., 0.08944524, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [14]:
sentence_ranks = pagerank(S)

**Sentence Ranked according to their importance**

In [15]:
sentence_ranks

array([[0.01994852],
       [0.00999149],
       [0.00146002],
       [0.01123239],
       [0.01199025],
       [0.01242033],
       [0.01287994],
       [0.02045695],
       [0.004438  ],
       [0.0156535 ],
       [0.0034714 ],
       [0.0034006 ],
       [0.00560372],
       [0.00459101],
       [0.00961182],
       [0.01775453],
       [0.01078142],
       [0.0202485 ],
       [0.01011226],
       [0.00745932],
       [0.01356579],
       [0.00430531],
       [0.01579955],
       [0.01215066],
       [0.00972804],
       [0.01934347],
       [0.00581934],
       [0.01106451],
       [0.01095853],
       [0.00344289],
       [0.00228767],
       [0.0006165 ],
       [0.01527088],
       [0.00383506],
       [0.0139778 ],
       [0.00015244],
       [0.01010377],
       [0.01387291],
       [0.01411138],
       [0.0191344 ],
       [0.01856596],
       [0.00619747],
       [0.01827581],
       [0.00045634],
       [0.01984514],
       [0.00799657],
       [0.0009061 ],
       [0.008

In [16]:
ranked_sentence_indexes = [item[0] for item in sorted(enumerate(sentence_ranks), key=lambda item: -item[1])]

In [17]:
ranked_sentence_indexes

[7,
 17,
 0,
 59,
 44,
 60,
 25,
 39,
 71,
 40,
 42,
 15,
 66,
 61,
 83,
 62,
 82,
 67,
 78,
 52,
 22,
 9,
 32,
 53,
 76,
 90,
 38,
 34,
 37,
 51,
 20,
 6,
 5,
 50,
 23,
 94,
 4,
 69,
 49,
 56,
 3,
 27,
 77,
 28,
 16,
 79,
 92,
 18,
 36,
 55,
 1,
 64,
 54,
 24,
 14,
 95,
 48,
 72,
 93,
 73,
 89,
 47,
 91,
 63,
 45,
 57,
 87,
 58,
 19,
 97,
 41,
 26,
 70,
 12,
 81,
 13,
 65,
 8,
 21,
 68,
 33,
 10,
 29,
 84,
 11,
 96,
 30,
 86,
 88,
 2,
 80,
 85,
 46,
 31,
 43,
 74,
 75,
 35]

In [45]:
plt.bar([item[0] for item in sorted(enumerate(sentence_ranks))], sentence_ranks.T[0])
plt.xlabel("Sentence No.")
plt.ylabel("Importance")
plt.show()

In [46]:
plt.plot([item[0] for item in sorted(enumerate(sentence_ranks))], sentence_ranks)
plt.xlabel("Sentence No.")
plt.ylabel("Importance")
plt.show()

In [47]:
SUMMARY_SIZE = 5

In [48]:
selected_sentences = sorted(ranked_sentence_indexes[:SUMMARY_SIZE])

In [49]:
selected_sentences

[0, 33, 58, 90, 93]

In [50]:
summary = itemgetter(*selected_sentences)(sentences)

**Generated Summary**

In [51]:
for sent in summary:
    print(' '.join(sent))

The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place .
They have a son , William Berry Jr. , and a daughter , Mrs. J. M. Cheshire of Griffin .
Before adjournment Monday afternoon , the Senate is expected to approve a study of the number of legislators allotted to rural and urban areas to determine what adjustments should be made .
It was marked by controversy , anonymous midnight phone calls and veiled threats of violence .
Ordinary Williams said he , too , was subjected to anonymous calls soon after he scheduled the election .
