In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
import tensorflow as tf

print(f"Tf version: {tf.__version__}")

Tf version: 2.0.0


In [8]:
import tensorflow_datasets as tfds

In [9]:
dataset, info = tfds.load('amazon_us_reviews/Mobile_Electronics_v1_00',
                          with_info=True)

train_dataset = dataset['train']

In [10]:
info

tfds.core.DatasetInfo(
    name='amazon_us_reviews',
    version=0.1.0,
    description='Amazon Customer Reviews (a.k.a. Product Reviews) is one of Amazons iconic products. In a period of over two decades since the first review in 1995, millions of Amazon customers have contributed over a hundred million reviews to express opinions and describe their experiences regarding products on the Amazon.com website. This makes Amazon Customer Reviews a rich source of information for academic researchers in the fields of Natural Language Processing (NLP), Information Retrieval (IR), and Machine Learning (ML), amongst others. Accordingly, we are releasing this data to further research in multiple disciplines related to understanding customer product experiences. Specifically, this dataset was constructed to represent a sample of customer evaluations and opinions, variation in the perception of a product across geographical regions, and promotional intent or bias in reviews.

Over 130+ million cus

In [11]:
BUFFER_SIZE = 30000
BATCH_SIZE = 128

In [12]:
train_dataset = train_dataset.shuffle(BUFFER_SIZE, reshuffle_each_iteration=False)

In [13]:
for reviews in train_dataset.take(3):
    review_text = reviews['data']
    print(f"Review Text: {review_text.get('review_body').numpy()}")
    print(f"Star Rating: {review_text.get('star_rating').numpy()}")
    print(tf.where(review_text.get('star_rating') > 3, 1, 0).numpy())
    print("\n")

Review Text: b"First let me say the receiver is very good, the CD player is excellent and the LED display is readable. What's wrong? It's the worlds most complicated stereo with microscopic buttons. It's just not obvious, more like devious. I bought this mainly to match the blue guages in my 67 Mustang. I still haven't figured out how to keep it on just one color, yeh, you try it .. I gave up. There is no particular reason to love this. buy the cheaper ones and live with the red LEDs."
Star Rating: 3
0


Review Text: b"The mid-bass on these speakers are awesome but the tweeters are complete crap. They're not accurate at all and pierces through your ears. I'm going to have to find a different set of tweeters to run with the woofers."
Star Rating: 4
1


Review Text: b"Easy to connect and just the right length so it doesn't catch into anything. I'm hoping the Fiio does create a similar cable for the iPhone 5 so that I can use it with that device as well."
Star Rating: 5
1




In [14]:
tokenizer = tfds.features.text.Tokenizer()

vocabulary_set = set()
for _, reviews in train_dataset.enumerate():
    review_text = reviews['data']
    reviews_tokens = tokenizer.tokenize(review_text.get('review_body').numpy())
    vocabulary_set.update(reviews_tokens)

vocab_size = len(vocabulary_set)
print(vocab_size)

73738


In [15]:
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

In [16]:
encoder

<TokenTextEncoder vocab_size=73740>

In [17]:
for reviews in train_dataset.take(2):
    review_text = reviews["data"]
    print(review_text.get('review_body').numpy())
    encoded_example = encoder.encode(review_text.get("review_body").numpy())
    print(encoded_example)
    print("\n")

b"First let me say the receiver is very good, the CD player is excellent and the LED display is readable. What's wrong? It's the worlds most complicated stereo with microscopic buttons. It's just not obvious, more like devious. I bought this mainly to match the blue guages in my 67 Mustang. I still haven't figured out how to keep it on just one color, yeh, you try it .. I gave up. There is no particular reason to love this. buy the cheaper ones and live with the red LEDs."
[26769, 34520, 25343, 10058, 42387, 13282, 4457, 64255, 51957, 42387, 55497, 15292, 4457, 57143, 68966, 42387, 44517, 4556, 4457, 42158, 15652, 44186, 19679, 37972, 44186, 42387, 20392, 2931, 59914, 67137, 49003, 32571, 51475, 37972, 44186, 33074, 44395, 17910, 26681, 72566, 29491, 3302, 36360, 64477, 70434, 10643, 8842, 42387, 71263, 28262, 14547, 18769, 61812, 23118, 3302, 21830, 38874, 28530, 30196, 38029, 65714, 10643, 8654, 34563, 49957, 33074, 10250, 70421, 50138, 9451, 52522, 34563, 3302, 66385, 52112, 40115, 

In [18]:
for index in encoded_example:
    print(f"{index} --------> {encoder.decode([index])}")

64859 --------> The
53219 --------> mid
67207 --------> bass
49957 --------> on
8525 --------> these
21695 --------> speakers
18261 --------> are
10401 --------> awesome
26222 --------> but
42387 --------> the
50777 --------> tweeters
18261 --------> are
45262 --------> complete
70025 --------> crap
1747 --------> They
62861 --------> re
44395 --------> not
51511 --------> accurate
13277 --------> at
43711 --------> all
68966 --------> and
63301 --------> pierces
21853 --------> through
17921 --------> your
7486 --------> ears
3302 --------> I
26235 --------> m
24572 --------> going
10643 --------> to
63757 --------> have
10643 --------> to
70756 --------> find
15450 --------> a
12388 --------> different
69914 --------> set
27764 --------> of
50777 --------> tweeters
10643 --------> to
7997 --------> run
49003 --------> with
42387 --------> the
53937 --------> woofers
