-
Notifications
You must be signed in to change notification settings - Fork 48
/
prepare_text.py
91 lines (78 loc) · 3.64 KB
/
prepare_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import os
import re
from collections import defaultdict
def main():
    """Parse the seven source text files and flatten the result.

    Returns:
        The value produced by ``extract_info`` when applied to the output
        of ``prepare_text`` for the book paths listed below.
    """
    book_paths = [
        "data/Rowling, J.K. - HP 1 - Harry Potter and the Sorcerer's Stone.txt",
        "data/Rowling, J.K. - HP 2 - Harry Potter and the Chamber of Secrets.txt",
        "data/Rowling, J.K. - HP 3 - Harry Potter and the Prisoner of Azkaban.txt",
        "data/Rowling, J.K. - HP 4 - Harry Potter and the Goblet of Fire.txt",
        "data/Rowling, J.K. - HP 5 - Harry Potter and the Order of the Phoenix.txt",
        "data/Rowling, J.K. - HP 6 - Harry Potter and the Half-Blood Prince.txt",
        "data/Rowling, J.K. - HP 7 - Harry Potter and the Deathly Hallows.txt",
    ]
    # Step 1: split every book into cleaned chapters.
    chapters_by_book = prepare_text(book_paths)
    # Step 2: flatten the nested mapping into parallel sequences.
    flattened = extract_info(chapters_by_book)
    return flattened
def _book_title(path):
    """Return the book title encoded in the file name of ``path``.

    The directory and extension are dropped, and a leading
    ``<anything> - HP <n> - `` prefix is removed when present, so
    ``data/Rowling, J.K. - HP 1 - Harry Potter and the Sorcerer's Stone.txt``
    yields ``Harry Potter and the Sorcerer's Stone``. File names without
    that prefix are returned as their bare stem.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    return re.sub(r'^.*?\bHP \d+ - ', '', stem)


def _clean_chapter(raw_title, raw_text):
    """Return ``(title, text)`` for one chapter match, minus page furniture.

    Args:
        raw_title: Title text captured by group 2 of the chapter pattern.
        raw_text: Chapter body captured by group 3 of the chapter pattern.
    """
    chap_title = raw_title.replace('\n', '')
    # The first three characters of the captured body are discarded;
    # presumably the whitespace between title and body - TODO confirm.
    chap_text = raw_text[3:]

    # Catch single chapter which begins with the following:
    phrase = ' HE-WHO-MUST-NOT-BE-NAMED RETURNS'
    if phrase in chap_title:
        chap_title = chap_title.replace(phrase, '')
        chap_text = phrase[1:] + ' I' + chap_text

    # The title is data, not a pattern: escape it so characters such as
    # '.' only match themselves.
    chap_text = re.sub(r'\n*• [0-9]+ • \n*' + re.escape(chap_title) + r' \n*',
                       '', chap_text, flags=re.IGNORECASE)
    # NOTE(review): the '|' below is a top-level alternation, so the second
    # branch removes every 'EPILOGUE' run regardless of the page marker.
    # Kept exactly as it was; confirm that this is intended.
    chap_text = re.sub(r'\n*• [0-9]+ •\s*(CHAPTER [A-Z-]+\s*)'
                       r'|(EPILOGUE)+\s*',
                       '',
                       chap_text)
    chap_text = re.sub(r' \n• [0-9]+ • \n*', '', chap_text)
    # Remove the title again, this time tolerating arbitrary whitespace
    # (including none) between its words.
    title_words = r'\s*'.join(re.escape(word) for word in chap_title.split())
    chap_text = re.sub(title_words, '', chap_text)
    return chap_title, chap_text


def prepare_text(books):
    """Split each book file into chapters and clean the chapter text.

    Args:
        books: Iterable of paths to plain-text book files. The book title
            is derived from each file name (see ``_book_title``).

    Returns:
        A dict mapping book title to a dict that maps ``'Chapter N'``
        (N counted from 1 in order of appearance) to a
        ``(chapter_title, chapter_text)`` tuple. For
        "Harry Potter and the Deathly Hallows" the entry ``'Chapter 37'``,
        when present, is stored under ``'Epilogue'`` instead. Books in
        which no chapter is found do not appear in the result.
    """
    pattern = re.compile(
        # Group 1 selects the chapter number
        r"(C H A P T E R (?:[A-Z-][ ]){2,}[A-Z]|"
        r"E P I L O G U E)\s+"
        # Group 2 selects the chapter title but excludes all
        # caps word beginning first sentence of the chapter
        r"([A-Z \n',.-]+)\b(?![A-Z]+(?=\.)\b)"
        # chapter title ends before lowercase letters or a period
        r"(?![a-z']|[A-Z.])"
        # Group 3 selects the chapter contents
        r"(.*?)"
        # chapter content ends with a new chapter or the end of book
        r"(?=C H A P T E R (?:[A-Z][ ]){2,}|"
        r"This\s+book\s+was\s+art\s+directed\s+|"
        r"E P I L O G U E)",
        re.DOTALL,
    )

    hp = defaultdict(dict)
    for book in books:
        title = _book_title(book)
        # Decode explicitly so the curly quotes are read the same way on
        # every platform (assumes the source files are UTF-8).
        with open(book, 'r', encoding='utf-8') as f:
            # Normalise typographic quotes to their ASCII forms.
            # NOTE(review): the last replace maps the em dash to itself and
            # is a no-op as written; kept in case another target was meant.
            text = (f.read().replace('’', "'")
                            .replace('‘', "'")
                            .replace('”', '"')
                            .replace('“', '"')
                            .replace('—', '—'))
        for chap, chapter in enumerate(pattern.findall(text), start=1):
            hp[title]['Chapter ' + str(chap)] = _clean_chapter(chapter[1],
                                                               chapter[2])
    hp = dict(hp)

    # Correct the title of the epilogue, but only when that book and
    # chapter were actually parsed, so other inputs do not raise KeyError.
    last_book = "Harry Potter and the Deathly Hallows"
    if 'Chapter 37' in hp.get(last_book, {}):
        hp[last_book]['Epilogue'] = hp[last_book].pop('Chapter 37')
    return hp
def extract_info(hp_dict):
    """Flatten a nested book/chapter mapping into two parallel lists.

    Args:
        hp_dict: Mapping of book to a mapping of chapter key to an indexable
            entry whose item 0 is the chapter title and item 1 the text.

    Returns:
        A ``(titles, texts)`` tuple of lists, ordered by book and then by
        chapter in the iteration order of the mappings.
    """
    entries = [
        hp_dict[book][chapter]
        for book in hp_dict
        for chapter in hp_dict[book]
    ]
    titles = [entry[0] for entry in entries]
    texts = [entry[1] for entry in entries]
    return titles, texts
# Run the pipeline only when this file is executed as a script, not when it
# is imported. The value returned by main() is not used here.
if __name__ == "__main__":
    main()