<a href="https://colab.research.google.com/github/see-3pO/Learning_Tensorflow/blob/master/Tensorflow_datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

TensorFlow Datasets is a collection of datasets ready to use, with TensorFlow or other Python ML frameworks, such as Jax.

In [None]:
#  basic imports
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from keras import layers
import tensorflow_datasets as tfds

### Image Loading with TFDS

In [None]:
# Load MNIST through TFDS, requesting the train and test splits together with
# the dataset metadata.
mnist_splits, ds_info = tfds.load(
    "mnist",
    split=["train", "test"],
    as_supervised=True,  # yield (image, label) tuples instead of feature dicts
    shuffle_files=True,
    with_info=True,
)
ds_train, ds_test = mnist_splits

Downloading and preparing dataset 11.06 MiB (download: 11.06 MiB, generated: 21.00 MiB, total: 32.06 MiB) to /root/tensorflow_datasets/mnist/3.0.1...


Dl Completed...:   0%|          | 0/5 [00:00<?, ? file/s]

Dataset mnist downloaded and prepared to /root/tensorflow_datasets/mnist/3.0.1. Subsequent calls will reuse this data.


`shuffle_files`: specifies whether the files that make up a dataset should be shuffled before being loaded and processed. Large datasets are normally split across multiple files.


In [None]:
# Show the dataset metadata object returned by tfds.load(..., with_info=True).
print(ds_info)

tfds.core.DatasetInfo(
    name='mnist',
    full_name='mnist/3.0.1',
    description="""
    The MNIST database of handwritten digits.
    """,
    homepage='http://yann.lecun.com/exdb/mnist/',
    data_dir='/root/tensorflow_datasets/mnist/3.0.1.incompleteFS70XA',
    file_format=tfrecord,
    download_size=11.06 MiB,
    dataset_size=21.00 MiB,
    features=FeaturesDict({
        'image': Image(shape=(28, 28, 1), dtype=uint8),
        'label': ClassLabel(shape=(), dtype=int64, num_classes=10),
    }),
    supervised_keys=('image', 'label'),
    disable_shuffling=False,
    splits={
        'test': <SplitInfo num_examples=10000, num_shards=1>,
        'train': <SplitInfo num_examples=60000, num_shards=1>,
    },
    citation="""@article{lecun2010mnist,
      title={MNIST handwritten digit database},
      author={LeCun, Yann and Cortes, Corinna and Burges, CJ},
      journal={ATT Labs [Online]. Available: http://yann.lecun.com/exdb/mnist},
      volume={2},
      year={2010}
    }""",

In [None]:
def normalize_image(image, label):
  """Scale an image to float32 by dividing by 255 and pass the label through.

  Args:
    image: image tensor to cast to float32 and divide by 255.0.
    label: label associated with the image; returned unchanged.

  Returns:
    A `(scaled_image, label)` tuple.
  """
  scaled_image = tf.cast(image, tf.float32) / 255.0
  return scaled_image, label

In [None]:
# Sentinel passed to map(num_parallel_calls=...) and prefetch(...) in the
# input pipelines below.
AUTOTUNE = tf.data.experimental.AUTOTUNE
# Number of examples per training batch (used by ds_train.batch below).
BATCH_SIZE = 64

In [None]:
# Training pipeline: normalize -> cache -> shuffle over the full split ->
# batch -> prefetch.
ds_train = (
    ds_train
    .map(normalize_image, num_parallel_calls=AUTOTUNE)
    .cache()
    .shuffle(ds_info.splits['train'].num_examples)
    .batch(BATCH_SIZE)
    .prefetch(AUTOTUNE)
)

# Test pipeline: normalize -> batch -> prefetch (no caching or shuffling).
ds_test = (
    ds_test
    .map(normalize_image, num_parallel_calls=AUTOTUNE)
    .batch(128)
    .prefetch(AUTOTUNE)
)

In [None]:
# creating a model
model = keras.Sequential([
    keras.Input((28, 28, 1)),
    layers.Conv2D(32, 3, activation='relu'),
    layers.Flatten(),
    layers.Dense(10),
])

In [None]:
# Adam optimizer + sparse categorical cross-entropy on logits (the model's
# final Dense layer has no softmax), tracking accuracy.
adam_optimizer = keras.optimizers.Adam(learning_rate=0.001)
sparse_ce_loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=adam_optimizer, loss=sparse_ce_loss, metrics=['accuracy'])

In [None]:
# Train for 5 epochs (verbose=2 logs one line per epoch), then evaluate on the
# batched test set.
model.fit(ds_train, epochs=5, verbose=2)
model.evaluate(ds_test)

Epoch 1/5
938/938 - 23s - loss: 0.2051 - accuracy: 0.9420 - 23s/epoch - 24ms/step
Epoch 2/5
938/938 - 15s - loss: 0.0733 - accuracy: 0.9786 - 15s/epoch - 16ms/step
Epoch 3/5
938/938 - 15s - loss: 0.0542 - accuracy: 0.9839 - 15s/epoch - 16ms/step
Epoch 4/5
938/938 - 15s - loss: 0.0416 - accuracy: 0.9872 - 15s/epoch - 16ms/step
Epoch 5/5
938/938 - 15s - loss: 0.0332 - accuracy: 0.9902 - 15s/epoch - 16ms/step


[0.0618802048265934, 0.9811000227928162]

### Text Loading with TFDS

In [None]:
# Load the IMDB reviews dataset as (text, label) pairs, together with its
# metadata. Note: this rebinds ds_train / ds_test / ds_info from the MNIST
# section above.
imdb_splits, ds_info = tfds.load(
    'imdb_reviews',
    split=['train', 'test'],
    shuffle_files=True,
    as_supervised=True,
    with_info=True,
)
ds_train, ds_test = imdb_splits

print(ds_info)

tfds.core.DatasetInfo(
    name='imdb_reviews',
    full_name='imdb_reviews/plain_text/1.0.0',
    description="""
    Large Movie Review Dataset. This is a dataset for binary sentiment
    classification containing substantially more data than previous benchmark
    datasets. We provide a set of 25,000 highly polar movie reviews for training,
    and 25,000 for testing. There is additional unlabeled data for use as well.
    """,
    config_description="""
    Plain text
    """,
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    data_dir='/root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0',
    file_format=tfrecord,
    download_size=80.23 MiB,
    dataset_size=129.83 MiB,
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=int64, num_classes=2),
        'text': Text(shape=(), dtype=string),
    }),
    supervised_keys=('text', 'label'),
    disable_shuffling=False,
    splits={
        'test': <SplitInfo num_examples=25000, num_shards=1>,
    

In [None]:
# tokenization
# tokenizer = tfds.features.text.Tokenizer()
# above is deprecated but can still be accessed using
tokenizer = tfds.deprecated.text.Tokenizer()

```python
import collections

word_counter = collections.Counter()

```
- Purpose: To count the occurrences of each word across the entire dataset. collections.Counter is a specialized dictionary for counting hashable objects.

- Reason: It provides a convenient way to count word frequencies and can be easily updated with new data.

```python
for text, _ in dataset:
    words = tokenizer.tokenize(text.numpy().decode('utf-8').lower())
    word_counter.update(words)

```
`.decode('utf-8')`: Decodes the byte string to a regular string. This is necessary because TensorFlow datasets often store strings as byte strings.

In [None]:
import collections

def build_vocabulary(dataset, tokenizer, min_count):
  """Build the set of words that occur at least `min_count` times in `dataset`.

  Args:
    dataset: iterable of `(text, label)` pairs, where `text.numpy()` returns
      UTF-8 encoded bytes (e.g. an eager `tf.data.Dataset` of string tensors).
    tokenizer: object with a `tokenize(str)` method returning a list of words.
    min_count: minimum number of occurrences for a word to be kept.

  Returns:
    A `set` of lower-cased words whose total count is >= `min_count`.
  """
  # count word occurrences across the whole dataset
  word_counter = collections.Counter()

  # iterate over the dataset passed in (not a global) and update word counts
  for text, _ in dataset:
    words = tokenizer.tokenize(text.numpy().decode('utf-8').lower())
    word_counter.update(words)

  # keep only the words that meet the minimum count
  return {word for word, count in word_counter.items() if count >= min_count}

In [None]:
# Build the vocabulary from the training split with a minimum word count of 5,
# then report its size.
vocabulary = build_vocabulary(ds_train, tokenizer, 5)
print(len(vocabulary))

29114


In [None]:
# Convert the tokenized words into a numerical format (integer IDs) using an encoder.
# oov_token: the out-of-vocabulary token.
# <UNK> stands for "unknown": a word in the text that is not in the vocabulary is replaced by this token.

encoder = tfds.deprecated.text.TokenTextEncoder(
    vocabulary, oov_token='<UNK>', lowercase=True, tokenizer=tokenizer
)

In [None]:
# encoding function
def my_encoding(text_tensor, label):
  """Encode one review into integer token IDs using the module-level `encoder`.

  Args:
    text_tensor: eager string tensor; `.numpy()` must return UTF-8 bytes.
    label: label for the review; returned unchanged.

  Returns:
    A `(encoded_text, label)` tuple, where `encoded_text` is whatever
    `encoder.encode` returns for the decoded text.
  """
  # eager tensor -> bytes -> Python str
  raw_text = text_tensor.numpy().decode('utf-8')
  return encoder.encode(raw_text), label

In [None]:
 # wrap the function with a `tf.py_function`
 # This allows TensorFlow to call the Python function within its data pipeline.
def tf_my_encoding(text, label):
  """Graph-compatible wrapper around `my_encoding` for use in `Dataset.map`.

  Args:
    text: scalar string tensor holding one review.
    label: scalar label tensor for the review.

  Returns:
    A `(encoded_text, label)` tuple: `encoded_text` is an int64 tensor with
    static shape `[None]` and `label` has static shape `[]`.
  """
  encoded_text, label = tf.py_function(
      my_encoding, # python function to execute
      inp=[text, label], # inputs passed to the function
      Tout=(tf.int64, label.dtype) # expected output dtypes
  )
  # tf.py_function outputs have unknown static shape; restore it so that
  # downstream ops such as padded_batch can validate against padded_shapes.
  encoded_text.set_shape([None])
  # set_shape takes only the shape; the original passed `label` as an extra
  # positional argument, which raises a TypeError.
  label.set_shape([])

  return encoded_text, label

- `num_parallel_calls=AUTOTUNE` allows TensorFlow to automatically choose the number of CPU cores to use for parallel processing, optimizing performance.

```python
encoded_ds_train = encoded_ds_train.padded_batch(32, padded_shapes=([None], ()))
```
- This creates batches of size 32, where each batch is padded to the maximum length of the sequences in that batch.
- `padded_shapes=([None], ())` specifies that the input sequences can have variable lengths (indicated by None), and the labels are scalar values (indicated by ()).
- Padding sequences ensures that all sequences within a batch have the same length, which is necessary for efficient batch processing in deep learning models.

In [None]:
# apply to the dataset
AUTOTUNE = tf.data.experimental.AUTOTUNE

encoded_ds_train = ds_train.map(tf_my_encoding, num_parallel_calls=AUTOTUNE).cache()
encoded_ds_train = encoded_ds_train.shuffle(10000)
encoded_ds_train = encoded_ds_train.padded_batch(32, padded_shapes=([None], ()))
encoded_ds_train = encoded_ds_train.prefetch(AUTOTUNE)

In [None]:
# Encode the test reviews, then pad-and-batch the ENCODED dataset.
# The original batched the raw `ds_test` (scalar strings), which raised
# "The padded shape (None,) is not compatible with the shape ()".
encoded_ds_test = ds_test.map(tf_my_encoding)
encoded_ds_test = encoded_ds_test.padded_batch(32, padded_shapes=([None], ()))

Encoded Text Shape: <unknown>
Label Shape: <unknown>


ValueError: The padded shape (None,) is not compatible with the shape () of the corresponding input component.

**Model Architecture**

The model consists of the following layers:

1. **Masking Layer:**

**Purpose:** Ignores padded values during computation. This layer is useful when dealing with sequences of variable lengths and ensures that the model doesn't process padded values, which could negatively affect the model's predictions.

**Arguments:**

`mask_value=0`: Specifies the value to be treated as a mask. In this case, it's set to 0, assuming that 0 represents padding in the input sequences.


2. **Embedding Layer:**

**Purpose:** Converts integer-encoded words into dense vectors of fixed size (output_dim=32).

**Arguments:**

`input_dim=len(vocabulary) + 2`: Specifies the size of the vocabulary plus two additional IDs: ID 0, which is reserved for padding, and one ID for out-of-vocabulary (`<UNK>`) words.

`output_dim=32`: Specifies the dimensionality of the dense embedding vectors.


3. **GlobalAveragePooling1D Layer:**

**Purpose:** Averages the embedding vectors across the time dimension, effectively reducing the sequence length to a single vector.

**Explanation:** This layer helps in reducing the model's complexity and mitigates the risk of overfitting by summarizing the information across the entire sequence.


4. **Dense Layer (ReLU Activation):**

**Purpose:** Applies a dense layer with ReLU activation to introduce non-linearity and increase the model's capacity to learn complex patterns in the data.

**Arguments:**

`units=64`: Specifies the number of neurons in the layer.

`activation='relu'`: Specifies the rectified linear unit (ReLU) activation function.


5. **Dense Layer (Linear Activation):**

**Purpose:** Outputs a single value, representing the model's prediction.

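**Explanation:** Since this is a binary classification problem, a single neuron with a linear activation function is used to produce a continuous output. The output is a raw logit (which is why the loss is configured with `from_logits=True`): values below 0 indicate the negative class and values of 0 or above indicate the positive class. Applying a sigmoid to the logit converts it into a probability for the positive class.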

In [None]:
# create the model
model = keras.Sequential([
    # ingnore the 0s added during padding
    layers.Masking(mask_value=0),
    #
    layers.Embedding(input_dim=len(vocabulary)+2, output_dim=32),
    layers.GlobalAveragePooling1D(),
    layers.Dense(64, activation='relu'),
    layers.Dense(1) # less the 0: negative, greater or equal than 0: positive
])

In [None]:
# Binary cross-entropy on logits (the final Dense(1) has no sigmoid), Adam with
# gradient-norm clipping, tracking accuracy.
binary_ce_loss = keras.losses.BinaryCrossentropy(from_logits=True)
clipped_adam = keras.optimizers.Adam(3e-4, clipnorm=1)
model.compile(loss=binary_ce_loss, optimizer=clipped_adam, metrics=['accuracy'])

In [None]:
# Train on the encoded, padded training batches for 10 epochs, then evaluate
# on the encoded test batches.
model.fit(encoded_ds_train, epochs=10, verbose=2)
model.evaluate(encoded_ds_test)