In [1]:
# One-time setup, kept commented out so a normal run does not trigger a download.
# Uncomment and run both lines if the NLTK data used by the cells below
# (the tokenizers, the `gutenberg` and `genesis` corpora) is not installed yet.
# NOTE(review): the exact resources required depend on the NLTK version — confirm.
#import nltk

#nltk.download()

In [4]:
from nltk.tokenize import sent_tokenize

In [3]:
# Demo paragraph for sentence tokenization. Written as adjacent literals so
# the trailing spaces and the newline ending each physical line are visible
# in the source instead of hiding at the end of a triple-quoted line.
sample = (
    "A sample refers to a smaller, manageable version of a larger group. \n"
    "It is a subset containing the characteristics of a larger population. \n"
    "Samples are used in statistical testing when population sizes are too large for the test to \n"
    "include all possible members or observations. The elements of a sample are known as sample points, sampling units or observations.\n"
)

In [4]:
# Split the sample paragraph into a list of sentence strings.
tokenized_text = sent_tokenize(sample)

In [7]:
# Show the sentence list; print() emits the plain list repr on a single line.
print(tokenized_text)

['A sample refers to a smaller, manageable version of a larger group.', 'It is a subset containing the characteristics of a larger population.', 'Samples are used in statistical testing when population sizes are too large for the test to \ninclude all possible members or observations.', 'The elements of a sample are known as sample points, sampling units or observations.']


# Gutenberg Corpus

In [1]:
from nltk.corpus import gutenberg

In [2]:
# Rebind `sample` to the raw text of the King James Bible from the Gutenberg corpus.
sample = gutenberg.raw("bible-kjv.txt")

In [5]:
# Sentence-split the whole book; this overwrites the earlier `tokenized_text`.
tokenized_text = sent_tokenize(sample)

In [6]:
# Number of sentences the tokenizer produced for the book.
len(tokenized_text)

29812

In [8]:
# Peek at the first five sentences. The first entry also carries the title
# lines, because no sentence break is detected before the end of verse 1:1.
tokenized_text[:5]

['[The King James Bible]\n\nThe Old Testament of the King James Bible\n\nThe First Book of Moses:  Called Genesis\n\n\n1:1 In the beginning God created the heaven and the earth.',
 '1:2 And the earth was without form, and void; and darkness was upon\nthe face of the deep.',
 'And the Spirit of God moved upon the face of the\nwaters.',
 '1:3 And God said, Let there be light: and there was light.',
 '1:4 And God saw the light, that it was good: and God divided the light\nfrom the darkness.']

# Word Tokenize

In [10]:
from nltk.tokenize import word_tokenize

In [11]:
from nltk.corpus import genesis

In [12]:
# Rebind `sample` again, this time to the raw text of the English KJV Genesis file.
sample = genesis.raw("english-kjv.txt")

In [13]:
# Split the Genesis text into word-level tokens.
tok = word_tokenize(sample)

In [14]:
# Show the first ten word tokens.
tok[:10]

['In',
 'the',
 'beginning',
 'God',
 'created',
 'the',
 'heaven',
 'and',
 'the',
 'earth']

# Regexp Tokenize

In [15]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import gutenberg

In [16]:
# Raw string so that `\w` reaches the regex engine untouched; in a plain
# literal it is an invalid escape sequence that newer Python versions warn about.
# The pattern requires a digit followed by at least one more word character,
# so single-character numbers (e.g. the "1"s in "1:1") are not matched.
regex_tokens = RegexpTokenizer(r"[0-9]\w+")

In [17]:
# Reload the full Bible text (`sample` was last bound to the Genesis text).
sample = gutenberg.raw("bible-kjv.txt")
# Displays every match in the book, which produces a very long output.
# NOTE(review): consider slicing the result (e.g. `[:20]`) when only a preview is needed.
regex_tokens.tokenize(sample)

['10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30',
 '31',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30',
 '31',
 '32',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',

# BlankLine Tokenize

In [28]:
from nltk.tokenize import BlanklineTokenizer

In [46]:
# One literal per physical line of the text; the pieces concatenate to a
# single string whose last paragraph ("Thanks.") follows a blank line.
sample = (
    "Good muffins cost a lot\n"
    " in Bombay.    Please buy me\n"
    "two of them.\n"
    "\n"
    "Thanks."
)

In [47]:
# Split the text into chunks at blank lines; here the only blank line is the
# one before "Thanks.", so two chunks come back.
BlanklineTokenizer().tokenize(sample)

['Good muffins cost a lot\n in Bombay.    Please buy me\ntwo of them.',
 'Thanks.']

In [49]:
# For comparison, run the sentence tokenizer over the same text.
sent_tok = sent_tokenize(sample)

In [50]:
# Three sentences come back; line breaks inside a sentence are kept in the tokens.
print(sent_tok)

['Good muffins cost a lot\n in Bombay.', 'Please buy me\ntwo of them.', 'Thanks.']
