# Data

In [1]:
import pandas as pd
data = pd.read_csv("../../data/test.csv")
reviewsData = data[['id',
                  'reviews.doRecommend',
                  'reviews.rating',
                  'reviews.text',
                  'reviews.title']]
reviewsData.head(5)

Unnamed: 0,id,reviews.doRecommend,reviews.rating,reviews.text,reviews.title
0,AVqVGZNvQMlgsOJE6eUY,False,3,I thought it would be as big as small paper bu...,Too small
1,AVqVGZNvQMlgsOJE6eUY,True,5,This kindle is light and easy to use especiall...,Great light reader. Easy to use at the beach
2,AVqVGZNvQMlgsOJE6eUY,True,4,Didnt know how much i'd use a kindle so went f...,Great for the price
3,AVqVGZNvQMlgsOJE6eUY,True,5,I am 100 happy with my purchase. I caught it o...,A Great Buy
4,AVqVGZNvQMlgsOJE6eUY,True,5,Solid entry level Kindle. Great for kids. Gift...,Solid entry-level Kindle. Great for kids


# Tokenizing reviews

In [2]:
# Tokenizing the words within the reviews
from tensorflow.keras.preprocessing.text import Tokenizer
reviews = reviewsData['reviews.text'].values
tokenizer = Tokenizer(num_words = 5000)

The 'Tokenizer' class in the 'tensorflow.keras.preprocessing.text' module enables you to tokenize text. Tokenizing text is the process of breaking a text into tokens (usually individual words).

In [3]:
# Using dir(tokenizer) to see what attributes this object has
for a in dir(tokenizer): print(a)

__class__
__delattr__
__dict__
__dir__
__doc__
__eq__
__format__
__ge__
__getattribute__
__gt__
__hash__
__init__
__init_subclass__
__le__
__lt__
__module__
__ne__
__new__
__reduce__
__reduce_ex__
__repr__
__setattr__
__sizeof__
__str__
__subclasshook__
__weakref__
_keras_api_names
_keras_api_names_v1
char_level
document_count
filters
fit_on_sequences
fit_on_texts
get_config
index_docs
index_word
lower
num_words
oov_token
sequences_to_matrix
sequences_to_texts
sequences_to_texts_generator
split
texts_to_matrix
texts_to_sequences
texts_to_sequences_generator
to_json
word_counts
word_docs
word_index


## Tokenizing method

In [4]:
# Tokenizing texts
tokenizer.fit_on_texts(reviews)

## Informative attributes

In [5]:
# Extra function to print the first few elements of a collection
def printn(collection, n):
    for i, t in enumerate(collection):
        if i > n: break
        else: print(t)

print("\ntokenizer.word_index:")
printn(tokenizer.word_index.items(), 5)
print("\ntokenizer.word_counts:")
printn(tokenizer.word_counts.items(), 5)
print("\ntokenizer.word_docs:")
printn(tokenizer.word_docs.items(), 5)


tokenizer.word_index:
('i', 1)
('it', 2)
('a', 3)
('the', 4)
('my', 5)
('to', 6)

tokenizer.word_counts:
('i', 12)
('thought', 1)
('it', 10)
('would', 2)
('be', 3)
('as', 3)

tokenizer.word_docs:
('read', 2)
('recommend', 1)
('very', 1)
('would', 1)
('big', 1)
('on', 2)


Hence, we see that tokenization using the 'Tokenizer' class associates each word with an index, word count and the number of documents i.e. separate texts in which each word occurs. This data is stored in the attributes word_index, word_count and word_docs respectively.