In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import tensorflow as tf

print(f"Tf version: {tf.__version__}")

Tf version: 2.2.0


In [3]:
import tensorflow_datasets as tfds

print(f"Tfds version: {tfds.__version__}")

Tfds version: 3.1.0


In [4]:
dataset, info = tfds.load('amazon_us_reviews/Mobile_Electronics_v1_00',
                          with_info=True)

train_dataset = dataset['train']

In [1]:
info

NameError: name 'info' is not defined

In [6]:
BUFFER_SIZE = 30000
BATCH_SIZE = 128

In [7]:
train_dataset = train_dataset.shuffle(BUFFER_SIZE, reshuffle_each_iteration=False)

In [8]:
for reviews in train_dataset.take(3):
    review_text = reviews['data']
    print(f"Review Text: {review_text.get('review_body').numpy()}")
    print(f"Star Rating: {review_text.get('star_rating').numpy()}")
    print(tf.where(review_text.get('star_rating') > 3, 1, 0).numpy())
    print("\n")

Review Text: b'I needed a parrot type speaker for my car and that is exactly what i got! looks great came with everything i needed. Great Deal!!'
Star Rating: 5
1


Review Text: b"Ok, i choose this cover, because its notepad appearance. It's good looking, and very worth the money you pay for it.    The good :    - The cover is nice to the touch (Outside and Inside)  - Have a support on the back that when you don't use it keep itself locked through a magneto.  - Very stylish  - Nice color and texture  - Nice price    The bad : (and the reason why i put only 4 stars...)    - It seems that since the nook have a rubbery back, in a month of use, it has get stained with the interior of the cover. Not that it matters very much, since i don't get it out of its cover, but anyways, It should not get stained."
Star Rating: 4
1


Review Text: b'Helped my overall system, basically made my volume twice as loud instead of volume 10 (max 26) im running at 5. Im running all alpine S around with additio

In [9]:
tokenizer = tfds.features.text.Tokenizer()

vocabulary_set = set()
for _, reviews in train_dataset.enumerate():
    review_text = reviews['data']
    reviews_tokens = tokenizer.tokenize(review_text.get('review_body').numpy())
    vocabulary_set.update(reviews_tokens)

vocab_size = len(vocabulary_set)
print(vocab_size)

73738


In [10]:
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

In [11]:
encoder

<TokenTextEncoder vocab_size=73740>

In [12]:
for reviews in train_dataset.take(2):
    review_text = reviews["data"]
    print(review_text.get('review_body').numpy())
    print(review_text.get('review_body').dtype)
    print(f"Star rating: {review_text.get('star_rating').dtype}")
    encoded_example = encoder.encode(review_text.get("review_body").numpy())
    print(encoded_example)
    print("\n")

b'I needed a parrot type speaker for my car and that is exactly what i got! looks great came with everything i needed. Great Deal!!'
<dtype: 'string'>
Star rating: <dtype: 'int32'>
[4341, 55200, 39744, 72513, 53987, 11555, 37615, 20531, 61063, 10515, 1415, 68607, 50440, 19340, 64144, 55380, 50850, 34483, 59992, 67687, 48456, 64144, 55200, 42750, 7120]


b"Ok, i choose this cover, because its notepad appearance. It's good looking, and very worth the money you pay for it.    The good :    - The cover is nice to the touch (Outside and Inside)  - Have a support on the back that when you don't use it keep itself locked through a magneto.  - Very stylish  - Nice color and texture  - Nice price    The bad : (and the reason why i put only 4 stars...)    - It seems that since the nook have a rubbery back, in a month of use, it has get stained with the interior of the cover. Not that it matters very much, since i don't get it out of its cover, but anyways, It should not get stained."
<dtype: 'st

In [13]:
len(encoder.tokens)

73738

In [14]:
def encode(text_tensor, label_tensor):
    """Encodes dataset with the encoder.
    """
    # encode text
    encoded_text = encoder.encode(text_tensor.numpy())
    label = tf.where(label_tensor > 3, 1, 0)
    return encoded_text, label


In [15]:
def encode_map_fn(tensor):
    
    text = tensor['data'].get('review_body')
    label = tensor['data'].get('star_rating')
    
    encoded_text, label = tf.py_function(encode,
                                         inp=[text, label],
                                         Tout=(tf.int32, tf.int32))
    # set shapes for eager
    encoded_text.set_shape([None])
    label.set_shape([])
    return encoded_text, label

In [16]:
# apply to train dataset

encoded_dataset = train_dataset.map(encode_map_fn)

In [17]:
for example, label in encoded_dataset.take(1):
    print(example)
    print(encoder.decode(example))
    print(label)

tf.Tensor(
[ 4341 55200 39744 72513 53987 11555 37615 20531 61063 10515  1415 68607
 50440 19340 64144 55380 50850 34483 59992 67687 48456 64144 55200 42750
  7120], shape=(25,), dtype=int32)
I needed a parrot type speaker for my car and that is exactly what i got looks great came with everything i needed Great Deal
tf.Tensor(1, shape=(), dtype=int32)


In [19]:
# split dataset into train/test

TEST_SIZE = 10_000

train_data = encoded_dataset.skip(TEST_SIZE).shuffle(BUFFER_SIZE)
train_data = train_data.padded_batch(BATCH_SIZE)

test_data = encoded_dataset.take(TEST_SIZE).shuffle(BUFFER_SIZE)
test_data = test_data.padded_batch(BATCH_SIZE)


In [24]:
sample_text, sample_labels = next(iter(test_data))

print(sample_text[0])
print(sample_labels[0])

tf.Tensor([72902 43574 33862 ...     0     0     0], shape=(2025,), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)


In [29]:
sample_text.shape

TensorShape([128, 2025])

In [34]:
for f0, f1 in test_data.take(10):
    print(f1.shape, tf.unique_with_counts(f1)[2])

(128,) tf.Tensor([80 48], shape=(2,), dtype=int32)
(128,) tf.Tensor([46 82], shape=(2,), dtype=int32)
(128,) tf.Tensor([82 46], shape=(2,), dtype=int32)
(128,) tf.Tensor([43 85], shape=(2,), dtype=int32)
(128,) tf.Tensor([44 84], shape=(2,), dtype=int32)
(128,) tf.Tensor([84 44], shape=(2,), dtype=int32)
(128,) tf.Tensor([88 40], shape=(2,), dtype=int32)
(128,) tf.Tensor([85 43], shape=(2,), dtype=int32)
(128,) tf.Tensor([41 87], shape=(2,), dtype=int32)
(128,) tf.Tensor([84 44], shape=(2,), dtype=int32)


In [35]:
vocab_size

73738