-
Notifications
You must be signed in to change notification settings - Fork 0
/
corpus.py
36 lines (24 loc) · 899 Bytes
/
corpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# Module metadata: author name, contact e-mail address and copyright notice.
__author__ = "Sanju Sci"
__email__ = "sanju.sci9@gmail.com"
__copyright__ = "Copyright 2019"
from nltk.tokenize import sent_tokenize, PunktSentenceTokenizer
from nltk.corpus import gutenberg
class Corpus(object):
    """Load one document from NLTK's Gutenberg corpus and preview its sentences.

    The Gutenberg documents are mostly plain text, so they could be opened
    and read with ordinary Python file handling. The NLTK module has a few
    convenient methods for working with the corpus, though, so this class
    follows that methodology.
    """

    def __init__(self, file_name):
        """Fetch the text of a Gutenberg corpus file and keep it on the instance.

        :param file_name: identifier handed to ``gutenberg.raw``,
            e.g. ``"bible-kjv.txt"``.
        """
        self.sample = gutenberg.raw(file_name)

    def process_content(self, limit=5):
        """Print the leading sentences of the loaded text.

        The text is split with ``sent_tokenize`` and at most ``limit``
        sentences are printed, each with its own ``print`` call.

        :param limit: maximum number of sentences to print (default 5);
            values below 1 print nothing.
        :return: None
        """
        sentences = sent_tokenize(self.sample)
        # Slice instead of indexing so a text with fewer than ``limit``
        # sentences is printed in full rather than raising IndexError.
        for sentence in sentences[:max(limit, 0)]:
            print(sentence)
if __name__ == "__main__":
    # Script entry point: load the "bible-kjv.txt" corpus file and print
    # the sentences that process_content() emits by default.
    Corpus("bible-kjv.txt").process_content()