# Tokenization 

In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
sen = [
    'I am good human as i am with good dog'
]

In [3]:
tokenizer = Tokenizer(num_words = 100)

# This class allows to vectorize a text corpus, by turning each text into either a sequence of integers 

## The Number of Words That it can Tokenize.

#### the maximum number of words to keep, based on word frequency. Only the most common num_words-1 words will be kept.
#f.keras.preprocessing.text.Tokenizer(num_words=None,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True, split=' ', char_level=False, oov_token=None,
    document_count=0, **kwargs
)

In [4]:
tokenizer.fit_on_texts(sen)

# fit_on_texts will create a tokenized word index printing a set of KEY/Value pairs for the words.

In [5]:
word_index = tokenizer.word_index
print(word_index)

{'i': 1, 'am': 2, 'good': 3, 'human': 4, 'as': 5, 'with': 6, 'dog': 7}


In [6]:
tokenizer.get_config()

{'num_words': 100,
 'filters': '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
 'lower': True,
 'split': ' ',
 'char_level': False,
 'oov_token': None,
 'document_count': 1,
 'word_counts': '{"i": 2, "am": 2, "good": 2, "human": 1, "as": 1, "with": 1, "dog": 1}',
 'word_docs': '{"as": 1, "i": 1, "am": 1, "with": 1, "human": 1, "good": 1, "dog": 1}',
 'index_docs': '{"5": 1, "1": 1, "2": 1, "6": 1, "4": 1, "3": 1, "7": 1}',
 'index_word': '{"1": "i", "2": "am", "3": "good", "4": "human", "5": "as", "6": "with", "7": "dog"}',
 'word_index': '{"i": 1, "am": 2, "good": 3, "human": 4, "as": 5, "with": 6, "dog": 7}'}

In [7]:
type(sen)

list

In [8]:
Sen1 = [
    'Today is a sunny day',
    'Today is a autum day',
    'what is it like today'
]

In [9]:
tokenizer = Tokenizer(num_words = 100)

In [10]:
tokenizer.fit_on_texts(Sen1)
word_index = tokenizer.word_index
print(word_index)

{'today': 1, 'is': 2, 'a': 3, 'day': 4, 'sunny': 5, 'autum': 6, 'what': 7, 'it': 8, 'like': 9}


In [11]:
sen = [
    'How are % today.',
    'I love $ money:'
]

In [12]:
tokenizer_1 = Tokenizer(num_words = 100, filters = '')
tokenizer_1.fit_on_texts(sen)
word_index_1 = tokenizer_1.word_index
print(word_index_1)

{'how': 1, 'are': 2, '%': 3, 'today.': 4, 'i': 5, 'love': 6, '$': 7, 'money:': 8}


In [13]:
tokenizer_1 = Tokenizer(num_words = 100, filters = '$%')
tokenizer_1.fit_on_texts(sen)
word_index_1 = tokenizer_1.word_index
print(word_index_1)

{'how': 1, 'are': 2, 'today.': 3, 'i': 4, 'love': 5, 'money:': 6}


In [14]:
sequences = tokenizer.texts_to_sequences(Sen1)
print(sequences)

[[1, 2, 3, 5, 4], [1, 2, 3, 6, 4], [7, 2, 8, 9, 1]]


In [15]:
print(word_index)

{'today': 1, 'is': 2, 'a': 3, 'day': 4, 'sunny': 5, 'autum': 6, 'what': 7, 'it': 8, 'like': 9}


# Tokenization in nltk api

In [16]:
import nltk
# import WhitespaceTokenizer() method from nltk
from nltk.tokenize import WhitespaceTokenizer
     
# Create a reference variable for Class WhitespaceTokenizer
wstk = WhitespaceTokenizer()
     
# Create a string input
text1 = "The price\t of burger \nin klu is Rs.136.\n"
     
# Use tokenize method
tokens_text1 = wstk.tokenize(text1)
     
print(tokens_text1)

['The', 'price', 'of', 'burger', 'in', 'klu', 'is', 'Rs.136.']


In [17]:
text2 = "Natural language processing (NLP) is a field " + \
       "of computer science, artificial intelligence " + \
       "and computational linguistics concerned with " + \
       "the interactions between computers and human " + \
       "(natural) languages, and, in particular, " + \
       "concerned with programming computers to " + \
       "fruitfully process large natural language " + \
       "corpora. Challenges in natural language " + \
       "processing frequently involve natural " + \
       "language understanding, natural language" + \
       "generation frequently from formal, machine" + \
       "-readable logical forms), connecting language " + \
       "and machine perception, managing human-" + \
       "computer dialog systems, or some combination " + \
       "thereof."

In [18]:
tokens_text2 = wstk.tokenize(text2)   
print(tokens_text2)

['Natural', 'language', 'processing', '(NLP)', 'is', 'a', 'field', 'of', 'computer', 'science,', 'artificial', 'intelligence', 'and', 'computational', 'linguistics', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', '(natural)', 'languages,', 'and,', 'in', 'particular,', 'concerned', 'with', 'programming', 'computers', 'to', 'fruitfully', 'process', 'large', 'natural', 'language', 'corpora.', 'Challenges', 'in', 'natural', 'language', 'processing', 'frequently', 'involve', 'natural', 'language', 'understanding,', 'natural', 'languagegeneration', 'frequently', 'from', 'formal,', 'machine-readable', 'logical', 'forms),', 'connecting', 'language', 'and', 'machine', 'perception,', 'managing', 'human-computer', 'dialog', 'systems,', 'or', 'some', 'combination', 'thereof.']


In [19]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [20]:
#word_tokenize(text1)
word_tokenize(text2)

['Natural',
 'language',
 'processing',
 '(',
 'NLP',
 ')',
 'is',
 'a',
 'field',
 'of',
 'computer',
 'science',
 ',',
 'artificial',
 'intelligence',
 'and',
 'computational',
 'linguistics',
 'concerned',
 'with',
 'the',
 'interactions',
 'between',
 'computers',
 'and',
 'human',
 '(',
 'natural',
 ')',
 'languages',
 ',',
 'and',
 ',',
 'in',
 'particular',
 ',',
 'concerned',
 'with',
 'programming',
 'computers',
 'to',
 'fruitfully',
 'process',
 'large',
 'natural',
 'language',
 'corpora',
 '.',
 'Challenges',
 'in',
 'natural',
 'language',
 'processing',
 'frequently',
 'involve',
 'natural',
 'language',
 'understanding',
 ',',
 'natural',
 'languagegeneration',
 'frequently',
 'from',
 'formal',
 ',',
 'machine-readable',
 'logical',
 'forms',
 ')',
 ',',
 'connecting',
 'language',
 'and',
 'machine',
 'perception',
 ',',
 'managing',
 'human-computer',
 'dialog',
 'systems',
 ',',
 'or',
 'some',
 'combination',
 'thereof',
 '.']

In [21]:
sent_tokenize(text1)
sent_tokenize(text2)

['Natural language processing (NLP) is a field of computer science, artificial intelligence and computational linguistics concerned with the interactions between computers and human (natural) languages, and, in particular, concerned with programming computers to fruitfully process large natural language corpora.',
 'Challenges in natural language processing frequently involve natural language understanding, natural languagegeneration frequently from formal, machine-readable logical forms), connecting language and machine perception, managing human-computer dialog systems, or some combination thereof.']

In [22]:
from nltk.tokenize import TreebankWordTokenizer
TreebankWordTokenizer().tokenize(text2)

['Natural',
 'language',
 'processing',
 '(',
 'NLP',
 ')',
 'is',
 'a',
 'field',
 'of',
 'computer',
 'science',
 ',',
 'artificial',
 'intelligence',
 'and',
 'computational',
 'linguistics',
 'concerned',
 'with',
 'the',
 'interactions',
 'between',
 'computers',
 'and',
 'human',
 '(',
 'natural',
 ')',
 'languages',
 ',',
 'and',
 ',',
 'in',
 'particular',
 ',',
 'concerned',
 'with',
 'programming',
 'computers',
 'to',
 'fruitfully',
 'process',
 'large',
 'natural',
 'language',
 'corpora.',
 'Challenges',
 'in',
 'natural',
 'language',
 'processing',
 'frequently',
 'involve',
 'natural',
 'language',
 'understanding',
 ',',
 'natural',
 'languagegeneration',
 'frequently',
 'from',
 'formal',
 ',',
 'machine-readable',
 'logical',
 'forms',
 ')',
 ',',
 'connecting',
 'language',
 'and',
 'machine',
 'perception',
 ',',
 'managing',
 'human-computer',
 'dialog',
 'systems',
 ',',
 'or',
 'some',
 'combination',
 'thereof',
 '.']

# The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.

This tokenizer performs the following steps:

split standard contractions, e.g. don't -> do n't and they'll -> they 'll

treat most punctuation characters as separate tokens

split off commas and single quotes, when followed by whitespace

separate periods that appear at the end of line

# Tokenization using gensim

In [23]:
#import gensim
#from gensim.utils import tokenize
#list(tokenize(text2))

In [24]:
sentences = [
    'Today is a sunny day',
    'today is a Windy Day',
    'Is it sunny Today',
    'I really enjoyed walking in the snow today'
]

In [25]:
tokenizer = Tokenizer(num_words = 100)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
word_index = tokenizer.word_index
print(word_index)
print(sequences)

{'today': 1, 'is': 2, 'a': 3, 'sunny': 4, 'day': 5, 'windy': 6, 'it': 7, 'i': 8, 'really': 9, 'enjoyed': 10, 'walking': 11, 'in': 12, 'the': 13, 'snow': 14}
[[1, 2, 3, 4, 5], [1, 2, 3, 6, 5], [2, 7, 4, 1], [8, 9, 10, 11, 12, 13, 14, 1]]


In [26]:
len(sequences[0]),len(sequences[1]),len(sequences[2]),len(sequences[3])

(5, 5, 4, 8)

# Let's Make Them into a Same Shape Vectors

In [27]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
padded_seq = pad_sequences(sequences)
print(padded_seq)

[[ 0  0  0  1  2  3  4  5]
 [ 0  0  0  1  2  3  6  5]
 [ 0  0  0  0  2  7  4  1]
 [ 8  9 10 11 12 13 14  1]]


In [28]:
len(padded_seq[0]),len(padded_seq[1]),len(padded_seq[2]),len(padded_seq[3])

(8, 8, 8, 8)

In [32]:
padded_seq = pad_sequences(sequences, padding = 'post', maxlen = 8, truncating = 'post')
print(padded_seq)

[[ 1  2  3  4  5  0  0  0]
 [ 1  2  3  6  5  0  0  0]
 [ 2  7  4  1  0  0  0  0]
 [ 8  9 10 11 12 13 14  1]]


# Removing STOP words and CLEANING Text
# Stopwords * Means
### Stopwords are the English words which does not add much meaning to a sentence. They can safely be ignored without sacrificing the meaning of the sentence. For example, the words like the, he, have etc

In [33]:
from nltk.corpus import stopwords

In [34]:
print(stopwords.words('english')) # spanish, turkish, russian

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# If these words are encountered ...... How to remove them using code

In [38]:
sentence = 'please guys, i hate to tell you so many times that this is my life'
words = sentence.split()
stop_words = set(stopwords.words("english"))
type(words)
filtered_sentence = ""
for word in words:
    if word not in stop_words:
        filtered_sentence = filtered_sentence + word + " "
sentences.append(filtered_sentence)

In [39]:
sentence

'please guys, i hate to tell you so many times that this is my life'

In [40]:
filtered_sentence

'please guys, hate tell many times life '

In [41]:
from nltk.tokenize import word_tokenize, wordpunct_tokenize, TweetTokenizer, MWETokenizer