In [42]:
import nltk

In [43]:
## corpus: the sample paragraph that the rest of the notebook tokenizes
## Triple quotes let the string literal span multiple lines
corpus = """Hello Welcome, to Ami Gogia's NLP Tutorials.
Ami Gogia is gonna build a chatbot or he won't get a SOC certificate! XD"""

In [44]:
## First attempt: cut the paragraph at every '.', '!' or '?'.
## The terminator itself is dropped and every other character
## (including the newline) is kept. Text after the last terminator
## stays in temp_str and is not added to documents.
documents, temp_str = [], ""
for ch in corpus:
    if ch not in ('.', '!', '?'):
        temp_str += ch
        continue
    documents.append(temp_str)
    temp_str = ""

In [45]:
## Result of the first attempt: the terminators are missing, the second
## sentence starts with the newline, and the trailing "XD" is not included
documents

["Hello Welcome, to Ami Gogia's NLP Tutorials",
 "\nAmi Gogia is gonna build a chatbot or he won't get a SOC certificate"]

In [46]:
## Second attempt: keep the terminator at the end of each sentence and
## drop newline characters. Text after the last terminator still stays
## in temp_str and is not added to documents.
documents, temp_str = [], ""
for ch in corpus:
    if ch == '\n':
        continue
    temp_str += ch
    if ch in ('.', '!', '?'):
        documents.append(temp_str)
        temp_str = ""

In [47]:
## Result of the second attempt: terminators are kept and the newline is
## gone, but the trailing "XD" (no terminator after it) is still missing
documents

["Hello Welcome, to Ami Gogia's NLP Tutorials.",
 "Ami Gogia is gonna build a chatbot or he won't get a SOC certificate!"]

In [48]:
def my_sent_tokenize(corpus):
    """Split *corpus* into sentences using a simple character scan.

    A sentence ends at '.', '!' or '?'. The terminator is kept at the end
    of the sentence. Any text left after the last terminator is returned
    as a final sentence.

    This is a teaching implementation: every '.', '!' or '?' is treated as
    a sentence boundary, so abbreviations such as "Dr." are split as well.

    Args:
        corpus: The text to split (a string).

    Returns:
        A list of sentence strings, in order of appearance. An empty or
        whitespace-only input yields an empty list.
    """
    terminators = ('.', '!', '?')
    documents = []
    temp_str = ""
    for ch in corpus:
        if ch in terminators:
            if not temp_str and documents:
                # A run of terminators ("...", "?!") belongs to the sentence
                # that was just closed, not to a new one-character sentence.
                documents[-1] += ch
            else:
                documents.append(temp_str + ch)
                temp_str = ""
        elif ch in ('\n', '\r'):
            # A line break separates words: turn it into a single space
            # instead of deleting it, so "Hello\nworld" does not become
            # "Helloworld". Skip it at the start of a sentence or when a
            # space is already there.
            if temp_str and not temp_str.endswith(' '):
                temp_str += ' '
        elif ch == ' ' and not temp_str:
            # Ignore spaces before the first character of a sentence.
            pass
        else:
            temp_str += ch
    # Whatever follows the last terminator is a sentence too; drop the
    # whitespace that may trail it.
    temp_str = temp_str.rstrip()
    if temp_str:
        documents.append(temp_str)
    return documents


In [49]:
from nltk.tokenize import sent_tokenize

In [50]:
## Print the docstring of sent_tokenize (signature and default language)
help(sent_tokenize)

Help on function sent_tokenize in module nltk.tokenize:

sent_tokenize(text, language='english')
    Return a sentence-tokenized copy of *text*,
    using NLTK's recommended sentence tokenizer
    (currently :class:`.PunktSentenceTokenizer`
    for the specified language).

    :param text: text to split into sentences
    :param language: the model name in the Punkt corpus



In [51]:
## sent_tokenize splits a paragraph into a list of sentences.
## The trailing 'XD' comes back as its own sentence even though
## no terminator follows it.
sent_tokenize(corpus)

["Hello Welcome, to Ami Gogia's NLP Tutorials.",
 "Ami Gogia is gonna build a chatbot or he won't get a SOC certificate!",
 'XD']

In [52]:
## The hand-written tokenizer returns the same three sentences as
## sent_tokenize for this corpus
my_sent_tokenize(corpus)

["Hello Welcome, to Ami Gogia's NLP Tutorials.",
 "Ami Gogia is gonna build a chatbot or he won't get a SOC certificate!",
 'XD']

In [53]:
## A question mark ends a sentence as well: two sentences are returned
sent_tokenize('Hi? Everything okay?')

['Hi?', 'Everything okay?']

In [54]:
from nltk.tokenize import word_tokenize
## word_tokenize does Sentence --> Words

In [55]:
## Replace the hand-built list in documents with NLTK's sentence list;
## it is tokenized sentence by sentence further down
documents = sent_tokenize(corpus)

In [56]:
## word_tokenize on the whole paragraph: punctuation becomes separate
## tokens and contractions are split ('gonna' -> 'gon', 'na';
## "won't" -> 'wo', "n't"; "Gogia's" -> 'Gogia', "'s")
word_tokenize(corpus)

['Hello',
 'Welcome',
 ',',
 'to',
 'Ami',
 'Gogia',
 "'s",
 'NLP',
 'Tutorials',
 '.',
 'Ami',
 'Gogia',
 'is',
 'gon',
 'na',
 'build',
 'a',
 'chatbot',
 'or',
 'he',
 'wo',
 "n't",
 'get',
 'a',
 'SOC',
 'certificate',
 '!',
 'XD']

In [57]:
## Tokenize sentence by sentence: prints one list of word tokens
## for each sentence in documents
for sent in documents:
    print(word_tokenize(sent))

['Hello', 'Welcome', ',', 'to', 'Ami', 'Gogia', "'s", 'NLP', 'Tutorials', '.']
['Ami', 'Gogia', 'is', 'gon', 'na', 'build', 'a', 'chatbot', 'or', 'he', 'wo', "n't", 'get', 'a', 'SOC', 'certificate', '!']
['XD']


In [58]:
from nltk.tokenize import wordpunct_tokenize
## wordpunct_tokenize turns punctuation into separate tokens,
## so the apostrophe is split off too: "Gogia's" -> 'Gogia', "'", 's'

In [59]:
## Unlike word_tokenize, 'gonna' stays a single token here,
## while "won't" becomes 'won', "'", 't'
wordpunct_tokenize(corpus)

['Hello',
 'Welcome',
 ',',
 'to',
 'Ami',
 'Gogia',
 "'",
 's',
 'NLP',
 'Tutorials',
 '.',
 'Ami',
 'Gogia',
 'is',
 'gonna',
 'build',
 'a',
 'chatbot',
 'or',
 'he',
 'won',
 "'",
 't',
 'get',
 'a',
 'SOC',
 'certificate',
 '!',
 'XD']

In [60]:
## TreeBank Tokenizer
from nltk.tokenize import TreebankWordTokenizer

In [None]:
tokenizer = TreebankWordTokenizer()
## With this tokenizer a full stop inside the text stays attached to its
## word (e.g. 'Tutorials.'); only a full stop at the very end of the
## text becomes a separate token

In [62]:
## corpus does not end with a full stop, so no separate '.' token
## appears; 'Tutorials.' keeps its full stop
tokenizer.tokenize(corpus)

['Hello',
 'Welcome',
 ',',
 'to',
 'Ami',
 'Gogia',
 "'s",
 'NLP',
 'Tutorials.',
 'Ami',
 'Gogia',
 'is',
 'gon',
 'na',
 'build',
 'a',
 'chatbot',
 'or',
 'he',
 'wo',
 "n't",
 'get',
 'a',
 'SOC',
 'certificate',
 '!',
 'XD']

In [63]:
## Add a full stop at the very end to show that the tokenizer splits it
## off as a separate token. A new variable is used instead of
## `corpus += '.'` so that corpus itself is left unchanged: re-running
## this cell does not keep appending full stops, and earlier cells still
## see the original paragraph.
corpus_with_period = corpus + '.'
tokenizer.tokenize(corpus_with_period)

['Hello',
 'Welcome',
 ',',
 'to',
 'Ami',
 'Gogia',
 "'s",
 'NLP',
 'Tutorials.',
 'Ami',
 'Gogia',
 'is',
 'gon',
 'na',
 'build',
 'a',
 'chatbot',
 'or',
 'he',
 'wo',
 "n't",
 'get',
 'a',
 'SOC',
 'certificate',
 '!',
 'XD',
 '.']

In [None]:
## Different Tokenizers have different word-splitting rules
## See which one fits your use case