In [1]:
import re
from collections import defaultdict

from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
 
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
 
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.edmundson import EdmundsonSummarizer

In [2]:
# Paths (relative to the notebook's working directory) of the plain-text
# source files, one per book, listed in series order HP 1 .. HP 7.
books = ["data/Rowling, J.K. - HP 1 - Harry Potter and the Sorcerer's Stone.txt",
         "data/Rowling, J.K. - HP 2 - Harry Potter and the Chamber of Secrets.txt",
         "data/Rowling, J.K. - HP 3 - Harry Potter and the Prisoner of Azkaban.txt",
         "data/Rowling, J.K. - HP 4 - Harry Potter and the Goblet of Fire.txt",
         "data/Rowling, J.K. - HP 5 - Harry Potter and the Order of the Phoenix.txt",
         "data/Rowling, J.K. - HP 6 - Harry Potter and the Half-Blood Prince.txt",
         "data/Rowling, J.K. - HP 7 - Harry Potter and the Deathly Hallows.txt"]

In [23]:
# Regex that splits one book's raw text into (chapter heading, title, body)
# triples. Raw strings are used so that regex escapes such as \b, \. and \s
# reach the `re` module untouched instead of being treated as (invalid)
# Python string escapes.
pattern = (r"(C H A P T E R [A-Z -]+)"                 # Group 1 selects the chapter number
           r"\n+([A-Z \n',.-]+)\b(?![A-Z]+(?=\.)\b)"   # Group 2 selects the chapter title but excludes edges of an all-caps word beginning the first sentence of the chapter
           r"(?![a-z']|[A-Z.])"                        # chapter title ends with lowercase letters or a period
           r"(.*?)"                                    # Group 3 selects the chapter contents
           r"(?=C H A P T E R|This book)")             # chapter contents ends with a new chapter or the end of book

# hp maps 'hp<N>' -> {'Chapter <M>': (chapter title, chapter text)}.
hp = defaultdict(dict)
for cnt, book in enumerate(books, start=1):
    title = 'hp' + str(cnt)
    # NOTE(review): the file is decoded with the platform default encoding;
    # pass `encoding=` explicitly once the encoding of the data files is confirmed.
    with open(book, 'r') as f:
        text = f.read().replace('&rsquo;', "'")
    # DOTALL lets group 3 (.*?) run across line breaks.
    chapters = re.findall(pattern, text, re.DOTALL)
    for chap, chapter in enumerate(chapters, start=1):
        chap_title = chapter[1].replace('\n', '')
        # Drop the first three characters of the captured body
        # (presumably leftover separator characters after the title — confirm),
        # then turn the HTML entities into plain punctuation.
        chap_text = (chapter[2][3:].replace('&ldquo;', '"')
                                   .replace('&rdquo;', '"')
                                   .replace('&mdash;', '—'))
        # Remove "&bull; <number> &bull;" markers that are followed by the chapter
        # title. The title is escaped so that characters such as '.' are matched
        # literally instead of acting as regex metacharacters.
        chap_text = re.sub(r'\n*&bull; [0-9]+ &bull; \n*' + re.escape(chap_title) + r' \n*',
                           '', chap_text, flags=re.IGNORECASE)
        # Remove markers that are followed by a "CHAPTER <WORD>" line.
        chap_text = re.sub(r'\n*&bull; [0-9]+ &bull; \s*CHAPTER [A-Z]+ \s*', '', chap_text)
        # Remove any remaining bare markers.
        chap_text = re.sub(r' \n&bull; [0-9]+ &bull; \n*', '', chap_text)
        # Collapse runs of newlines into a single newline.
        chap_text = re.sub(r'\n+', '\n', chap_text)
        hp[title]['Chapter ' + str(chap)] = (chap_title, chap_text)
# Freeze into a plain dict so that a mistyped key raises KeyError
# instead of silently creating an empty entry.
hp = dict(hp)

### The `hp` dictionary built above has the following format:
#### Books are keyed 'hp1', 'hp2', etc.; each maps 'Chapter N' to a (chapter title, chapter text) tuple.
&nbsp;   
{'hp1': {  
         &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; 'Chapter 1': (chapter title, chapter text),  
         &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; 'Chapter 2': (chapter title, chapter text),  
         &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; 'Chapter 3': (chapter title, chapter text),  
         &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; ...  
         &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; }  
 'hp2': {  
         &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; 'Chapter 1': (chapter title, chapter text),  
         &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; ...  
         &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; }  
 &nbsp;...  
}  

In [24]:
# Shared settings for the summarizer cells below.
language = "english"   # passed to Tokenizer, Stemmer and get_stop_words
sentence_count = 5     # number of sentences requested from each summarizer
 
# Parse the text of book 1, chapter 1 (element [1] of the (title, text) tuple)
# once; the single-summarizer cells below reuse `parser.document`.
parser = PlaintextParser(hp['hp1']['Chapter 1'][1], Tokenizer(language))

In [25]:
# Run sumy's LexRankSummarizer over the parsed chapter.
# `language`, `sentence_count` and `parser` come from the setup cell above.
from sumy.summarizers.lex_rank import LexRankSummarizer 
summarizer = LexRankSummarizer(Stemmer(language))
summarizer.stop_words = get_stop_words(language)
# Summarize the document down to `sentence_count` sentences
summary = summarizer(parser.document, sentence_count) 
for sentence in summary:
    print(sentence)

"The Potters, that's right, that's what I heard —" "— yes, their son, Harry —" Mr. Dursley stopped dead.
Twelve times he clicked the Put-Outer, until the only lights left on the whole street were two tiny pinpricks in the distance, which were the eyes of the cat watching him.
Dumbledore slipped the Put-Outer back inside his cloak and set off down the street toward number four, where he sat down on the wall next to the cat.
"But I c-c-can't stand it — Lily an' James dead — an' poor little Harry off ter live with Muggles —" "Yes, yes, it's all very sad, but get a grip on yourself, Hagrid, or we'll be found," Professor McGonagall whispered, patting Hagrid gingerly on the arm as Dumbledore stepped over the low garden wall and walked to the front door.
Dumbledore turned and walked back down the street.


In [26]:
# Run sumy's LuhnSummarizer over the same parsed chapter and print
# the `sentence_count` sentences it returns.
from sumy.summarizers.luhn import LuhnSummarizer
summarizer_1 = LuhnSummarizer(Stemmer(language))
summarizer_1.stop_words = get_stop_words(language)
summary_1 = summarizer_1(parser.document, sentence_count)
for sentence in summary_1:
    print(sentence)

It was now reading the sign that said Privet Drive — no, looking at the sign; cats couldn't read maps or signs.
He didn't see the owls swoop- ing past in broad daylight, though people down in the street did; they pointed and gazed open-mouthed as owl after owl sped overhead.
No one knows why, or how, but they're saying that when he couldn't kill Harry Potter, Voldemort's power somehow broke — and that's why he's gone."
"But I c-c-can't stand it — Lily an' James dead — an' poor little Harry off ter live with Muggles —" "Yes, yes, it's all very sad, but get a grip on yourself, Hagrid, or we'll be found," Professor McGonagall whispered, patting Hagrid gingerly on the arm as Dumbledore stepped over the low garden wall and walked to the front door.
G'night, Professor McGonagall — Professor Dumbledore, sir."


In [27]:
# Run sumy's LsaSummarizer over the same parsed chapter and print
# the `sentence_count` sentences it returns.
from sumy.summarizers.lsa import LsaSummarizer
summarizer_2 = LsaSummarizer(Stemmer(language))
summarizer_2.stop_words = get_stop_words(language)
summary_2 = summarizer_2(parser.document, sentence_count)
for sentence in summary_2:
    print(sentence)

He dashed back across the road, hurried up to his office, snapped at his secretary not to disturb him, seized his telephone, and had almost finished dialing his home number when he changed his mind.
It seemed that Professor McGonagall had reached the point she was most anxious to discuss, the real reason she had been waiting on a cold, hard wall all day, for neither as a cat nor as a woman had she fixed Dumbledore with such a piercing stare as she did now.
He looked simply too big to be allowed, and so wild — long tangles of bushy black hair and beard hid most of his face, he had hands the size of trash can lids, and his feet in their leather boots were like baby dolphins.
For a full minute the three of them stood and looked at the little bundle; Hagrid's shoulders shook, Professor McGonagall blinked furiously, and the twinkling light that usually shone from Dumbledore's eyes seemed to have gone out.
A breeze ruffled the neat hedges of Privet Drive, which lay silent and tidy under the 

In [28]:
# Run sumy's TextRankSummarizer over the same parsed chapter and print
# the `sentence_count` sentences it returns.
from sumy.summarizers.text_rank import TextRankSummarizer
summarizer_3 = TextRankSummarizer(Stemmer(language))
summarizer_3.stop_words = get_stop_words(language)
summary_3 = summarizer_3(parser.document, sentence_count)
for sentence in summary_3:
    print(sentence)

Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much.
They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense.
Mr. Dursley was the director of a firm called Grunnings, which made drills.
He was a big, beefy man with hardly any neck, although he did have a very large mustache.
Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors.


  weights /= weights.sum(axis=1)[:, numpy.newaxis]


In [29]:
# Run sumy's EdmundsonSummarizer with the words of the chapter title
# (element [0] of the (title, text) tuple) as bonus words.
from sumy.summarizers.edmundson import EdmundsonSummarizer
summarizer_4 = EdmundsonSummarizer(Stemmer(language))
summarizer_4.stop_words = get_stop_words(language)
summarizer_4.bonus_words = hp['hp1']['Chapter 1'][0].split()
# 'zdfgthdvndadv' looks like a nonsense placeholder token, presumably chosen
# because it never occurs in the text so the stigma/null lists have no effect
# while still being non-empty — TODO confirm that sumy requires non-empty lists.
summarizer_4.stigma_words = ['zdfgthdvndadv']
summarizer_4.null_words = ['zdfgthdvndadv']
summary_4 = summarizer_4(parser.document, sentence_count)
for sentence in summary_4:
    print(sentence)

The Dursleys shuddered to think what the neighbors would say if the Potters arrived in the street.
When Dudley had been put to bed, he went into the living room in time to catch the last report on the evening news: "And finally, bird-watchers everywhere have reported that the nation's owls have been behaving very unusually today.
Twelve times he clicked the Put-Outer, until the only lights left on the whole street were two tiny pinpricks in the distance, which were the eyes of the cat watching him.
Dumbledore slipped the Put-Outer back inside his cloak and set off down the street toward number four, where he sat down on the wall next to the cat.
He couldn't know that at this very moment, people meeting in secret all over the country were holding up their glasses and saying in hushed voices: "To Harry Potter — the boy who lived!"


In [30]:
# Same Edmundson run as the previous cell, but with the placeholder token as
# the only bonus word as well, for comparison with the title-based run.
# NOTE: this rebinds `summarizer_4` and `summary_4` from the previous cell.
from sumy.summarizers.edmundson import EdmundsonSummarizer
summarizer_4 = EdmundsonSummarizer(Stemmer(language))
summarizer_4.stop_words = get_stop_words(language)
summarizer_4.bonus_words = ['zdfgthdvndadv']  # placeholder instead of the title words: hp['hp1']['Chapter 1'][0].split()
summarizer_4.stigma_words = ['zdfgthdvndadv']
summarizer_4.null_words = ['zdfgthdvndadv']
summary_4 = summarizer_4(parser.document, sentence_count)
for sentence in summary_4:
    print(sentence)

Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much.
They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense.
Mr. Dursley was the director of a firm called Grunnings, which made drills.
He was a big, beefy man with hardly any neck, although he did have a very large mustache.
He couldn't know that at this very moment, people meeting in secret all over the country were holding up their glasses and saying in hushed voices: "To Harry Potter — the boy who lived!"


In [31]:
# Run sumy's SumBasicSummarizer over the same parsed chapter.
from sumy.summarizers.sum_basic import SumBasicSummarizer
summarizer_5 = SumBasicSummarizer(Stemmer(language))
summarizer_5.stop_words = get_stop_words(language)
# NOTE(review): this cell passes the literal 5 rather than `sentence_count`
# like the other cells; the two are equal only while sentence_count == 5.
summary_5 = summarizer_5(parser.document, 5)
for sentence in summary_5:
    print(sentence)

Mr. Dursley wondered.
"Harry.
The cat was still there.
"It certainly seems so," said Dumbledore.
"Yes," said Professor McGonagall.


In [32]:
# Run sumy's KLSummarizer over the same parsed chapter and print
# the `sentence_count` sentences it returns.
from sumy.summarizers.kl import KLSummarizer
summarizer_6 = KLSummarizer(Stemmer(language))
summarizer_6.stop_words = get_stop_words(language)
summary_6 = summarizer_6(parser.document, sentence_count)
for sentence in summary_6:
    print(sentence)

It was on the corner of the street that he noticed the first sign of something peculiar — a cat reading a map.
It grew steadily louder as they looked up and down the street for some sign of a headlight; it swelled to a roar as they both looked up at the sky — and a huge motorcycle fell out of the air and landed on the road in front of them.
He looked simply too big to be allowed, and so wild — long tangles of bushy black hair and beard hid most of his face, he had hands the size of trash can lids, and his feet in their leather boots were like baby dolphins.
"But I c-c-can't stand it — Lily an' James dead — an' poor little Harry off ter live with Muggles —" "Yes, yes, it's all very sad, but get a grip on yourself, Hagrid, or we'll be found," Professor McGonagall whispered, patting Hagrid gingerly on the arm as Dumbledore stepped over the low garden wall and walked to the front door.
He clicked it once, and twelve balls of light sped back to their street lamps so that Privet Drive glowed

In [33]:
# Run sumy's ReductionSummarizer over the same parsed chapter and print
# the `sentence_count` sentences it returns.
from sumy.summarizers.reduction import ReductionSummarizer
summarizer_7 = ReductionSummarizer(Stemmer(language))
summarizer_7.stop_words = get_stop_words(language)
summary_7 = summarizer_7(parser.document, sentence_count)
for sentence in summary_7:
    print(sentence)

Mrs. Potter was Mrs. Dursley's sister, but they hadn't met for several years; in fact, Mrs. Dursley pretended she didn't have a sister, because her sister and her good-for-nothing husband were as unDursleyish as it was possible to be.
It seemed that Professor McGonagall had reached the point she was most anxious to discuss, the real reason she had been waiting on a cold, hard wall all day, for neither as a cat nor as a woman had she fixed Dumbledore with such a piercing stare as she did now.
Dumbledore took Harry in his arms and turned toward the Dursleys' house.
"But I c-c-can't stand it — Lily an' James dead — an' poor little Harry off ter live with Muggles —" "Yes, yes, it's all very sad, but get a grip on yourself, Hagrid, or we'll be found," Professor McGonagall whispered, patting Hagrid gingerly on the arm as Dumbledore stepped over the low garden wall and walked to the front door.
G'night, Professor McGonagall — Professor Dumbledore, sir."


In [34]:
from sumy.summarizers.lex_rank import LexRankSummarizer 
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.edmundson import EdmundsonSummarizer
from sumy.summarizers.sum_basic import SumBasicSummarizer
from sumy.summarizers.kl import KLSummarizer
from sumy.summarizers.reduction import ReductionSummarizer

In [35]:
def summarize(text, summarizer, sentence_count, language='english',
              bonus_words=None, stigma_words=('and', 'the'), null_words=('and', 'the')):
    """Summarize ``text`` with the given sumy summarizer class.

    Parameters
    ----------
    text : str
        Plain text to summarize.
    summarizer : type
        A summarizer *class* (not an instance); it is called with a
        ``Stemmer`` for ``language`` to build the instance.
    sentence_count : int
        Number of sentences requested from the summarizer.
    language : str
        Language passed to ``Stemmer``, ``get_stop_words`` and ``Tokenizer``.
    bonus_words : iterable of str, optional
        Only used for ``EdmundsonSummarizer``. When omitted, the words of the
        title of ``hp['hp1']['Chapter 1']`` are used, which preserves the
        original behaviour but is only meaningful for that chapter; pass the
        words that fit ``text`` when summarizing anything else.
    stigma_words, null_words : iterable of str
        Only used for ``EdmundsonSummarizer``.

    Returns
    -------
    Whatever the summarizer instance returns when called with the parsed
    document and ``sentence_count``.
    """
    # Keep the class argument and the instance under separate names instead
    # of rebinding the parameter.
    model = summarizer(Stemmer(language))
    model.stop_words = get_stop_words(language)
    if isinstance(model, EdmundsonSummarizer):
        if bonus_words is None:
            # Backward-compatible default: title words of book 1, chapter 1,
            # read from the notebook-level `hp` dictionary.
            bonus_words = hp['hp1']['Chapter 1'][0].split()
        model.bonus_words = bonus_words
        model.stigma_words = stigma_words
        model.null_words = null_words
    document = PlaintextParser(text, Tokenizer(language)).document
    return model(document, sentence_count)

In [36]:
def print_summary(summary):
    """Print each sentence of ``summary`` on its own line, in iteration order."""
    for line in map(str, summary):
        print(line)

In [37]:
# Run every summarizer class on the text of book 1, chapter 1 and print each
# result under a '----ClassName----' header, followed by a blank line.
# NOTE: the loop variable `summarizer` is a notebook-level name, so after this
# cell it refers to the last class in the list rather than to the
# LexRankSummarizer instance created in an earlier cell.
for summarizer in [LexRankSummarizer, LuhnSummarizer, LsaSummarizer, TextRankSummarizer,
                   EdmundsonSummarizer, SumBasicSummarizer, KLSummarizer, ReductionSummarizer]:
    print('----' + summarizer.__name__ + '----')
    print_summary(summarize(hp['hp1']['Chapter 1'][1], summarizer, sentence_count))
    print()

----LexRankSummarizer----
"The Potters, that's right, that's what I heard —" "— yes, their son, Harry —" Mr. Dursley stopped dead.
Twelve times he clicked the Put-Outer, until the only lights left on the whole street were two tiny pinpricks in the distance, which were the eyes of the cat watching him.
Dumbledore slipped the Put-Outer back inside his cloak and set off down the street toward number four, where he sat down on the wall next to the cat.
"But I c-c-can't stand it — Lily an' James dead — an' poor little Harry off ter live with Muggles —" "Yes, yes, it's all very sad, but get a grip on yourself, Hagrid, or we'll be found," Professor McGonagall whispered, patting Hagrid gingerly on the arm as Dumbledore stepped over the low garden wall and walked to the front door.
Dumbledore turned and walked back down the street.

----LuhnSummarizer----
It was now reading the sign that said Privet Drive — no, looking at the sign; cats couldn't read maps or signs.
He didn't see the owls swoop-

  weights /= weights.sum(axis=1)[:, numpy.newaxis]


Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much.
They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense.
Mr. Dursley was the director of a firm called Grunnings, which made drills.
He was a big, beefy man with hardly any neck, although he did have a very large mustache.
Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors.

----EdmundsonSummarizer----
This boy was another good reason for keeping the Potters away; they didn't want Dudley mixing with a child like that.
Mr. Dursley couldn't bear people who dressed in funny clothes — the getups you saw on young people!
He was sure there were lots of people called Potter who had a son called Harry.
"You don't mean — you can't mean the people who live

In [38]:
# Print a 2-sentence Edmundson summary of every chapter of book 1, in the
# order the chapters were inserted into hp['hp1'].
# NOTE(review): as called here, summarize() takes the Edmundson bonus words
# from the title of hp['hp1']['Chapter 1'] for every chapter, not from the
# title of the chapter being summarized — confirm that this is intended.
for chapter in hp['hp1']:
    print_summary(summarize(hp['hp1'][chapter][1], EdmundsonSummarizer, 2))

"You don't mean — you can't mean the people who live here?"
He couldn't know that at this very moment, people meeting in secret all over the country were holding up their glasses and saying in hushed voices: "To Harry Potter — the boy who lived!"
The room held no sign at all that another boy lived in the house, too.
Every year, Harry was left behind with Mrs. Figg, a mad old lady who lived two streets away.
No one, ever, in his whole life, had written to him.
The Dursleys' house had four bedrooms: one for Uncle Vernon and Aunt Petunia, one for visitors (usually Uncle Vernon's sister, Marge), one where Dudley slept, and one where Dudley kept all the toys and things that wouldn't fit into his first bedroom.
of Wizards ) Dear Mr. Potter, We are pleased to inform you that you have been accepted at Hogwarts School of Witchcraft and Wizardry.
Head boy an' girl at Hogwarts in their day!
I don't see why first years can't have their own.
"I'm not sayin' that's not a good idea, but yer not ter u