In [1]:
from nltk import word_tokenize
import nltk

In [2]:
# Word-level tokenization: punctuation comes out as its own token.
sentence = "This is a single sentence."
tokens = word_tokenize(sentence)
print(tokens)

['This', 'is', 'a', 'single', 'sentence', '.']


In [3]:
# Same tokenizer on a sentence without punctuation. `word_tokenize` was
# imported directly in the first cell, so the `nltk.` prefix is optional.
sentence = "The big dog is sleeping on the bed"
tokens = word_tokenize(sentence)
print(tokens)

['The', 'big', 'dog', 'is', 'sleeping', 'on', 'the', 'bed']


In [4]:
# Part-of-speech tagging takes a token list, so tokenize first.
# `tokens` is also read by the following cell.
sentence = "The big dog is sleeping on the bed"
tokens = word_tokenize(sentence)
pos_tags = nltk.pos_tag(tokens)
pos_tags

[('The', 'DT'),
 ('big', 'JJ'),
 ('dog', 'NN'),
 ('is', 'VBZ'),
 ('sleeping', 'VBG'),
 ('on', 'IN'),
 ('the', 'DT'),
 ('bed', 'NN')]

In [5]:
# Lower-case every token that consists solely of letters; anything else
# (punctuation, digits, mixed tokens) is dropped by the isalpha() filter.
no_punctuation = list(map(str.lower, filter(str.isalpha, tokens)))
print(no_punctuation)

['the', 'big', 'dog', 'is', 'sleeping', 'on', 'the', 'bed']


In [6]:
# A short multi-sentence text, written as adjacent literals so the trailing
# space before each line break stays visible. The next cell reuses it.
sentence = (
    "This is the first sentence. \n"
    "This is the second sentence. \n"
    "A document contains many sentences."
)
sentence_list = nltk.sent_tokenize(sentence)
print(sentence_list)

['This is the first sentence.', 'This is the second sentence.', 'A document contains many sentences.']


In [7]:
# Word and punctuation tokens for the multi-sentence text from the previous cell.
wordpunct_tokens = nltk.wordpunct_tokenize(sentence)
print(wordpunct_tokens)

['This', 'is', 'the', 'first', 'sentence', '.', 'This', 'is', 'the', 'second', 'sentence', '.', 'A', 'document', 'contains', 'many', 'sentences', '.']


In [8]:
from nltk.corpus import stopwords

# English stop-word list: print a 45-item preview, then let the cell's
# final expression report how many entries there are in total.
stop_words = stopwords.words("english")
print(stop_words[:45])
len(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am']


179

## Tokenization using TensorFlow (Keras `Tokenizer`)

In [9]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [20]:
# Toy corpus for the Keras Tokenizer. NOTE(review): `sentence` was a single
# string in the NLTK cells above; from here on it is a list of strings.
sentence = [
    "I love python",
    "I love Data Science",
    "You also love Data Science!",
    "Do you think Data Science is amazing?",
]

In [14]:
# Build the vocabulary from the corpus; word_index maps each word to its id.
# NOTE(review): the output recorded for this cell lacks 'do', 'think', 'is'
# and 'amazing', and this cell's execution count (14) is lower than that of
# the corpus cell (20), so it appears to predate the four-sentence corpus.
# Restart the kernel and run all cells to refresh it.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentence)

word_index = tokenizer.word_index
word_index

{'love': 1, 'i': 2, 'data': 3, 'science': 4, 'python': 5, 'you': 6, 'also': 7}

In [22]:
# Encode every corpus sentence as a list of word ids and show it together
# with the vocabulary. No oov_token is configured here, so words missing from
# word_index are left out (in the recorded output the fourth sentence has
# only three ids).
sequences = tokenizer.texts_to_sequences(sentence)
print(word_index, sequences, sep="\n")

{'love': 1, 'i': 2, 'data': 3, 'science': 4, 'python': 5, 'you': 6, 'also': 7}
[[2, 1, 5], [2, 1, 3, 4], [6, 7, 1, 3, 4], [6, 3, 4]]


In [23]:
# Held-out sentences containing words that do not occur in the training
# corpus ('really', 'loves', 'mathematics'). Note the trailing space in the
# second string.
test_data = [
    "I really love Data science",
    "Data science loves mathematics ",
]

In [25]:
# Encode the held-out sentences with the current tokenizer. In the recorded
# output no oov_token was configured, so the unseen words ('really', 'loves',
# 'mathematics') are simply missing from the sequences.
test_seq = tokenizer.texts_to_sequences(test_data)
print(test_seq, word_index, sep="\n")

[[2, 1, 3, 4], [3, 4]]
{'love': 1, 'i': 2, 'data': 3, 'science': 4, 'python': 5, 'you': 6, 'also': 7}


In [26]:
# Training corpus, defined inside this cell so it can be run on its own.
sentence = [
    "I love python",
    "I love Data Science",
    "You also love Data Science!",
    "Do you think Data Science is amazing?",
]

# Refit with an explicit out-of-vocabulary token; in the recorded output
# "<OOV>" takes index 1 and the real words start at 2.
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentence)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(sentence)
print(word_index, sequences, sep="\n")

{'<OOV>': 1, 'love': 2, 'data': 3, 'science': 4, 'i': 5, 'you': 6, 'python': 7, 'also': 8, 'do': 9, 'think': 10, 'is': 11, 'amazing': 12}
[[5, 2, 7], [5, 2, 3, 4], [6, 8, 2, 3, 4], [9, 6, 10, 3, 4, 11, 12]]


In [27]:
# Re-encode the held-out sentences with the OOV-aware tokenizer: in the
# recorded output unseen words now show up as index 1 ("<OOV>") rather than
# being dropped.
test_seq = tokenizer.texts_to_sequences(test_data)
print(test_seq, word_index, sep="\n")

[[5, 1, 2, 3, 4], [3, 4, 1, 1]]
{'<OOV>': 1, 'love': 2, 'data': 3, 'science': 4, 'i': 5, 'you': 6, 'python': 7, 'also': 8, 'do': 9, 'think': 10, 'is': 11, 'amazing': 12}
