<a href="https://colab.research.google.com/github/shoabahamed/codebasics-deep-learning-course/blob/main/deep_learning_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import tensorflow as tf
import os
import time

In [None]:
# Mount Google Drive at /content/drive so the text files read below are reachable.
# The google.colab module is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [3]:
# Display the TensorFlow version this notebook was run with.
tf.__version__

'2.8.2'

In [6]:
# Build a dataset of file paths matching text_files/<subfolder>/<file>.
# shuffle=False is passed so list_files does not randomize the order.
text_ds = tf.data.Dataset.list_files('/content/drive/MyDrive/text_files/*/*', shuffle=False)
for file in text_ds:
    # .numpy() turns the string tensor into the raw bytes of the path.
    print(file.numpy())

b'/content/drive/MyDrive/text_files/negative/neg0.txt'
b'/content/drive/MyDrive/text_files/negative/neg1.txt'
b'/content/drive/MyDrive/text_files/negative/neg2.txt'
b'/content/drive/MyDrive/text_files/positive/pos0.txt'
b'/content/drive/MyDrive/text_files/positive/pos1.txt'
b'/content/drive/MyDrive/text_files/positive/pos2.txt'


In [7]:
def get_label(file_path):
    """Return the name of the directory that directly contains `file_path`.

    The path is split on the OS path separator and the second-to-last
    component (the parent folder, e.g. b'negative' or b'positive') is
    returned as a string tensor.
    """
    path_components = tf.strings.split(file_path, os.path.sep)
    return path_components[-2]


# Apply get_label to every file path and print the resulting label bytes.
for i in text_ds.map(get_label):
    print(i.numpy())

b'negative'
b'negative'
b'negative'
b'positive'
b'positive'
b'positive'


In [8]:
def text_processed(file_path):
    """Turn a file path into a `(label, contents)` pair.

    The label comes from `get_label(file_path)`; the contents are the whole
    file as read by `tf.io.read_file`.
    """
    # Tuple elements are evaluated left to right: label first, then the read.
    return get_label(file_path), tf.io.read_file(file_path)

# Dataset of (label, file contents) pairs, one per file path in text_ds.
new_ds = text_ds.map(text_processed)

In [9]:
# Keep only the elements whose file contents are not the empty string.
new_text_ds = new_ds.filter(lambda label, review: review != '')

# Print the contents, then the label, of every remaining element.
for label, text in new_text_ds:
    print(text)
    print(label)

tf.Tensor(b'worst', shape=(), dtype=string)
tf.Tensor(b'negative', shape=(), dtype=string)
tf.Tensor(b'bad', shape=(), dtype=string)
tf.Tensor(b'negative', shape=(), dtype=string)
tf.Tensor(b'nice', shape=(), dtype=string)
tf.Tensor(b'positive', shape=(), dtype=string)
tf.Tensor(b'good', shape=(), dtype=string)
tf.Tensor(b'positive', shape=(), dtype=string)


In [10]:
# The same map + filter steps as the previous cells, written as one chain and
# followed by a shuffle with a 3-element buffer.
# NOTE(review): a shuffle buffer smaller than the dataset only mixes elements
# within that window, so the order stays close to the input order — confirm
# that a partial shuffle is what is wanted here.
t_ds = text_ds.map(text_processed).filter(lambda label, review: review != '').shuffle(3)

for label, text in t_ds:
    print(f"Text: {text}")
    print(f'Label: {label}')

Text: b'bad'
Label: b'negative'
Text: b'worst'
Label: b'negative'
Text: b'nice'
Label: b'positive'
Text: b'good'
Label: b'positive'


<h1>PIPELINE PERFORMANCE</h1>

<h2>Prefetch</h2>

In [13]:
# first let's create a class which behaves the same way when we open files from the disk

class FileDataset(tf.data.Dataset):
    """Artificial dataset that imitates the latency of reading a file.

    Calling `FileDataset(num_samples)` does not produce a `FileDataset`
    instance: `__new__` returns the dataset built by
    `tf.data.Dataset.from_generator` around `read_files_in_batches`.
    """

    def read_files_in_batches(num_samples):
        """Yield `num_samples` one-element tuples with simulated I/O delays.

        Declared without `self`; it is only ever looked up on the class and
        handed to `from_generator` as a plain generator function.
        """
        # Simulated cost of opening the file (paid once per iteration pass).
        time.sleep(0.03)
        for record_idx in range(num_samples):
            # Simulated cost of reading one line/record from the file.
            time.sleep(0.015)

            yield (record_idx, )

    def __new__(cls, num_samples=3):
        """Build and return the generator-backed dataset for `num_samples` records."""
        # Each element is declared as a length-1 int64 tensor.
        element_spec = tf.TensorSpec(shape=(1, ), dtype=tf.int64)
        return tf.data.Dataset.from_generator(
            cls.read_files_in_batches,
            args=(num_samples,),
            output_signature=element_spec,
        )
        

In [11]:
def benchmark(dataset, num_epochs=2, step_seconds=0.01):
    """Iterate over `dataset` for several epochs, simulating a training step.

    Args:
        dataset: Any object that can be iterated once per epoch
            (for example a `tf.data.Dataset`).
        num_epochs: Number of complete passes made over `dataset`.
        step_seconds: Seconds slept for each element to stand in for a
            training step. Defaults to 0.01, the value that used to be
            hard-coded, so existing calls behave exactly as before.

    Returns:
        None. The function is meant to be timed (e.g. with `%%timeit`).
    """
    for _epoch in range(num_epochs):
        for _sample in dataset:
            # Stand-in for the work of one training step.
            time.sleep(step_seconds)

In [14]:
%%timeit
# Baseline: iterate the simulated file dataset with no prefetching or caching.
benchmark(FileDataset())

1 loop, best of 5: 262 ms per loop


In [15]:
%%timeit
# Same benchmark with a prefetch buffer of one element.
benchmark(FileDataset().prefetch(1))

1 loop, best of 5: 258 ms per loop


In [47]:
%%timeit
# Same benchmark, letting tf.data pick the prefetch buffer size (AUTOTUNE).
benchmark(FileDataset().prefetch(tf.data.AUTOTUNE))

1 loop, best of 5: 222 ms per loop


<h2>Cache</h2>

In [49]:
# Small dataset holding the integers 0..4, materialized as a list for display.
dataset = tf.data.Dataset.range(5)
list(dataset.as_numpy_iterator())

[0, 1, 2, 3, 4]

In [50]:
# Square every element.
# NOTE: this rebinds `dataset` to a mapped version of itself, so running the
# cell again squares the already-squared values.
dataset = dataset.map(lambda x: x**2)
list(dataset.as_numpy_iterator())

[0, 1, 4, 9, 16]

In [53]:
def mapped_function(s):
    """Return `s` unchanged while adding a 0.03 s sleep via `tf.py_function`.

    Used with `Dataset.map` to imitate an expensive per-element
    preprocessing step.
    """
    def _simulate_preprocessing():
        # Stand-in for slow per-element work; produces no value.
        time.sleep(0.03)

    # No inputs and no outputs: the call exists only for its delay.
    tf.py_function(func=_simulate_preprocessing, inp=[], Tout=())
    return s

In [56]:
%%timeit -n1 -r1
# Five epochs over the mapped dataset with no cache in the pipeline.
benchmark(FileDataset().map(mapped_function), 5)

1 loop, best of 1: 1.23 s per loop


In [57]:
%%timeit -n1 -r1
# Same five epochs, with cache() placed after the map step.
benchmark(FileDataset().map(mapped_function).cache(), 5)

1 loop, best of 1: 388 ms per loop


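There was a large improvement in running time (1.23 s → 388 ms over 5 epochs) because `cache()` stores the elements produced during the first epoch. Later epochs read from the cache instead of re-reading the files and re-running the mapped function.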