In [72]:
from keras.preprocessing.text import text_to_word_sequence


In [73]:
# Sample sentence used as the single input document for the examples below
text = "The quick brown fox jumped over the lazy dog."


In [74]:
# tokenize the document: split the sentence into a list of word tokens.
# As the printed result shows, tokens come back lowercased and the trailing
# period is dropped.
result = text_to_word_sequence(text)
print(result)

['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']


In [75]:
# Example of preparing a vocabulary.

In [76]:
# Collect the distinct tokens of the document; the size of this set is the
# vocabulary-size estimate used by the hashing examples
words = {token for token in text_to_word_sequence(text)}


In [77]:
words

{'brown', 'dog', 'fox', 'jumped', 'lazy', 'over', 'quick', 'the'}

In [78]:
# vocabulary size = number of distinct tokens in the document
vocab_size = len(words)
print(vocab_size)

8


In [79]:
# We can put this together with the one_hot() function and encode the words in the document.
# The complete example is listed below. The vocabulary size is increased by 30% (a factor of 1.3)
# to minimize collisions when hashing words.

In [80]:
from keras.preprocessing.text import one_hot

In [81]:
# integer encode the document: every word is mapped to an integer in an
# index space of round(vocab_size*1.3) slots. Distinct words can still
# collide (in the recorded output 'the' and 'lazy' both map to 8).
# NOTE(review): the hash behind one_hot may not be stable across interpreter
# sessions, so these integers may differ on re-run — confirm against the Keras docs.
result = one_hot(text, round(vocab_size*1.3))
print(result)

[8, 1, 4, 9, 1, 6, 8, 8, 9]


In [82]:
from keras.preprocessing.text import hashing_trick

In [83]:
# integer encode the document with hashing_trick, explicitly selecting 'md5'
# as the hash function and using the same index-space size as the one_hot
# example. Collisions remain possible (in the recorded output 'fox' and
# 'lazy' both map to 2).
result = hashing_trick(text, round(vocab_size*1.3), hash_function='md5')
print(result)

[6, 4, 1, 2, 7, 5, 6, 2, 6]


In [84]:
# Tokenizer API

In [85]:
from keras.preprocessing.text import Tokenizer


In [86]:
# Five short documents that the Tokenizer is fitted on
docs = [
    'Well done!',
    'Good work',
    'Great effort',
    'nice work',
    'Excellent!',
]


In [87]:
# create the tokenizer
t = Tokenizer()


In [88]:
# fit the tokenizer on the documents; the word_counts, document_count,
# word_index and word_docs attributes printed in the following cells
# reflect this fit
t.fit_on_texts(docs)

In [89]:
t

<keras_preprocessing.text.Tokenizer at 0x1a67fcc8a90>

In [90]:
# summarize what was learned


In [91]:
# dictionary of words and their counts

In [92]:
print(t.word_counts)


OrderedDict([('well', 1), ('done', 1), ('good', 1), ('work', 2), ('great', 1), ('effort', 1), ('nice', 1), ('excellent', 1)])


In [93]:
# An integer count of the total number of documents that were used to fit the Tokenizer

In [94]:
print(t.document_count)


5


In [95]:
# A dictionary of words and their uniquely assigned integers

In [96]:
print(t.word_index)


{'work': 1, 'well': 2, 'done': 3, 'good': 4, 'great': 5, 'effort': 6, 'nice': 7, 'excellent': 8}


In [97]:
# A dictionary of words and how many documents each appeared in

In [98]:
print(t.word_docs)

defaultdict(<class 'int'>, {'well': 1, 'done': 1, 'good': 1, 'work': 2, 'effort': 1, 'great': 1, 'nice': 1, 'excellent': 1})


In [99]:
len(t.word_index)

8

In [100]:
t.word_index

{'work': 1,
 'well': 2,
 'done': 3,
 'good': 4,
 'great': 5,
 'effort': 6,
 'nice': 7,
 'excellent': 8}

In [101]:
# Vocabulary words in word_index order, later used as DataFrame column labels.
# Iterating a dict yields its keys, so no explicit loop (and no unused value
# variable) is needed.
columns = list(t.word_index)
    

In [102]:
columns

['work', 'well', 'done', 'good', 'great', 'effort', 'nice', 'excellent']

In [103]:
empty_col = ['']

In [104]:
# Prepend a blank label for matrix column 0, which is all zeros in the
# outputs because word indices start at 1.
# The list is rebuilt from t.word_index rather than from `columns` itself, so
# re-running this cell cannot stack additional blank labels.
columns = empty_col + list(t.word_index)

In [105]:
columns

['', 'work', 'well', 'done', 'good', 'great', 'effort', 'nice', 'excellent']

In [106]:
# The same five documents again, restated before encoding them as matrices
docs = [
    'Well done!',
    'Good work',
    'Great effort',
    'nice work',
    'Excellent!',
]


In [107]:
# Different modes in texts_to_matrix

In [108]:
# count == the count of each word in the document

In [109]:
# encode the documents as a matrix: one row per document, one column per
# word index, each cell holding that word's count in the document
encoded_docs = t.texts_to_matrix(docs, mode='count')
print(encoded_docs)

[[0. 0. 1. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]]


In [110]:
import pandas as pd

In [111]:
df_count = pd.DataFrame(encoded_docs)

In [112]:
df_count

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [113]:
df_count.columns = columns

In [114]:
# Label rows 'doc 0' .. 'doc N-1'. N is taken from the frame itself so the
# labels stay in sync if the number of documents changes.
df_count.index = ['doc {}'.format(i) for i in range(len(df_count))]

In [115]:
df_count

Unnamed: 0,Unnamed: 1,work,well,done,good,great,effort,nice,excellent
doc 0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
doc 1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
doc 2,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
doc 3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
doc 4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [116]:
# binary == whether or not each word is present in the document

In [117]:
# encode the documents as a matrix: one row per document, one column per
# word index, each cell 1 if the word is present in the document and 0 otherwise
encoded_docs = t.texts_to_matrix(docs, mode='binary')
print(encoded_docs)

[[0. 0. 1. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]]


In [118]:
df_binary = pd.DataFrame(encoded_docs)

In [119]:
df_binary

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [120]:
df_binary.columns = columns

In [121]:
# Label rows 'doc 0' .. 'doc N-1'. N is taken from the frame itself so the
# labels stay in sync if the number of documents changes.
df_binary.index = ['doc {}'.format(i) for i in range(len(df_binary))]

In [122]:
df_binary

Unnamed: 0,Unnamed: 1,work,well,done,good,great,effort,nice,excellent
doc 0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
doc 1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
doc 2,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
doc 3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
doc 4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [123]:

# tfidf == TF-IDF scoring for each word in document

In [124]:
# encode the documents as a matrix: one row per document, one column per
# word index, each cell holding the word's TF-IDF score for that document
encoded_docs = t.texts_to_matrix(docs, mode='tfidf')
print(encoded_docs)

[[0.         0.         1.25276297 1.25276297 0.         0.
  0.         0.         0.        ]
 [0.         0.98082925 0.         0.         1.25276297 0.
  0.         0.         0.        ]
 [0.         0.         0.         0.         0.         1.25276297
  1.25276297 0.         0.        ]
 [0.         0.98082925 0.         0.         0.         0.
  0.         1.25276297 0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         1.25276297]]


In [125]:
df_tfidf  = pd.DataFrame(encoded_docs)

In [126]:
df_tfidf

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.0,0.0,1.252763,1.252763,0.0,0.0,0.0,0.0,0.0
1,0.0,0.980829,0.0,0.0,1.252763,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.252763,1.252763,0.0,0.0
3,0.0,0.980829,0.0,0.0,0.0,0.0,0.0,1.252763,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.252763


In [127]:
# Label rows 'doc 0' .. 'doc N-1'. N is taken from the frame itself so the
# labels stay in sync if the number of documents changes.
df_tfidf.index = ['doc {}'.format(i) for i in range(len(df_tfidf))]

In [128]:
df_tfidf.columns = columns

In [129]:
df_tfidf

Unnamed: 0,Unnamed: 1,work,well,done,good,great,effort,nice,excellent
doc 0,0.0,0.0,1.252763,1.252763,0.0,0.0,0.0,0.0,0.0
doc 1,0.0,0.980829,0.0,0.0,1.252763,0.0,0.0,0.0,0.0
doc 2,0.0,0.0,0.0,0.0,0.0,1.252763,1.252763,0.0,0.0
doc 3,0.0,0.980829,0.0,0.0,0.0,0.0,0.0,1.252763,0.0
doc 4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.252763


In [130]:
# freq == the frequency of each word as a ratio of words within each document

In [131]:
# encode the documents as a matrix: one row per document, one column per
# word index, each cell holding the word's frequency as a ratio of the
# words in that document
encoded_docs = t.texts_to_matrix(docs, mode='freq')
print(encoded_docs)

[[0.  0.  0.5 0.5 0.  0.  0.  0.  0. ]
 [0.  0.5 0.  0.  0.5 0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.5 0.5 0.  0. ]
 [0.  0.5 0.  0.  0.  0.  0.  0.5 0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  1. ]]


In [132]:
df_freq = pd.DataFrame(encoded_docs)

In [133]:
df_freq

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.0,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0
1,0.0,0.5,0.0,0.0,0.5,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.0
3,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.5,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [134]:
# Label rows 'doc 0' .. 'doc N-1'. N is taken from the frame itself so the
# labels stay in sync if the number of documents changes.
df_freq.index = ['doc {}'.format(i) for i in range(len(df_freq))]

In [135]:
df_freq.columns = columns

In [136]:
df_freq

Unnamed: 0,Unnamed: 1,work,well,done,good,great,effort,nice,excellent
doc 0,0.0,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0
doc 1,0.0,0.5,0.0,0.0,0.5,0.0,0.0,0.0,0.0
doc 2,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.0
doc 3,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.5,0.0
doc 4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
