In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found anywhere below the Kaggle
# input directory, then print one path per line.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## `tf.data.Dataset.prefetch()`

### 1. Introduction
In TensorFlow’s **`tf.data` API**, the `prefetch()` transformation is used to **overlap data preprocessing with model execution**.

Normally, data loading and model training happen **sequentially**:
1. Load data →  
2. Preprocess data →  
3. Feed to model →  
4. Train →  
(repeat)

With `prefetch()`, TensorFlow starts preparing the **next batch of data while the current batch is being processed by the model**, allowing computation and data loading to run **in parallel**.  

This helps **reduce training bottlenecks** caused by slow data input pipelines.


### 2. Function Syntax

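```python
tf.data.Dataset.prefetch(buffer_size)
```

`buffer_size` is the maximum number of elements (batches, if the dataset is already batched) to prepare ahead of time. Pass `tf.data.AUTOTUNE` to let TensorFlow tune the value dynamically at runtime.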


In [2]:
import tensorflow as tf

# Build the pipeline in a single expression: the integers 0..9, grouped into
# batches of two, with a prefetch buffer whose size is tuned by tf.data.
dataset = tf.data.Dataset.range(10).batch(2).prefetch(tf.data.AUTOTUNE)

# as_numpy_iterator() yields each batch already converted to a NumPy array.
for batch_values in dataset.as_numpy_iterator():
    print(batch_values)

[0 1]
[2 3]
[4 5]
[6 7]
[8 9]


Without prefetch():
Load → Train → Load → Train → Load → Train

With prefetch():
Load + Train overlap → Faster pipeline


In [None]:
# Typical usage in a training pipeline.
#
# `images` and `labels` are small synthetic stand-ins so that this cell runs on
# a fresh kernel (Restart & Run All); replace them with your real data.
images = tf.random.uniform((100, 28, 28, 1))
labels = tf.random.uniform((100,), maxval=10, dtype=tf.int32)

train_dataset = (
    tf.data.Dataset.from_tensor_slices((images, labels))
    .shuffle(buffer_size=1000)
    .batch(32)
    # Keep prefetch last so that preparing the next batch overlaps with the
    # model's work on the current one.
    .prefetch(tf.data.AUTOTUNE)
)


### 6. Performance Benefits

| Without Prefetch                              | With Prefetch                     |
| --------------------------------------------- | --------------------------------- |
| Data loading and training happen sequentially | Data loading and training overlap |
| GPU often idle waiting for data               | GPU remains busy                  |
| Slower overall training                       | Faster, smoother training         |

In large models or heavy input pipelines (e.g., reading images, augmentations), this can significantly **reduce training time**.

---

### 7. Summary

| Concept            | Description                                                                     |
| ------------------ | ------------------------------------------------------------------------------- |
| **Purpose**        | To overlap data preprocessing and model execution                               |
| **How it works**   | Prefetches batches in the background while the current batch is being processed |
| **Common setting** | `dataset.prefetch(tf.data.AUTOTUNE)`                                            |
| **Effect**         | Reduces input latency and improves GPU utilization                              |
| **Use case**       | Always the last step in a `tf.data` pipeline before feeding data into the model |

---

### 8. Best Practice

When building any TensorFlow input pipeline:

1. Apply transformations like `map()`, `shuffle()`, and `batch()`.
2. Always end the pipeline with `.prefetch(tf.data.AUTOTUNE)`.

## `tf.data.Dataset.cache()`

### 1. Introduction
In TensorFlow’s `tf.data` API, the **`cache()`** transformation is used to **store (cache) the dataset in memory or on disk** after the first time it’s loaded.  

This makes subsequent epochs or iterations **much faster**, since the data doesn’t need to be **re-read** or **re-preprocessed** each time.

It’s especially useful when:
- You’re repeatedly iterating over the same dataset (e.g., during multiple training epochs).
- The dataset fits into memory (RAM) or can be cached to disk efficiently.


### 2. Function Syntax

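```python
tf.data.Dataset.cache(filename='')
```

`filename` is the path of the file used to cache the dataset on disk. With the default empty string, the dataset is cached in memory.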


In [4]:
import tensorflow as tf
import time

# Create a simple dataset
dataset = tf.data.Dataset.range(5)


def slow_identity(value):
    """Return `value` unchanged after a 0.5 s pause, standing in for costly preprocessing."""
    time.sleep(0.5)
    return value


# Simulate a slow preprocessing step. tf.py_function is needed so that the
# Python-level sleep runs for every element instead of once at trace time.
dataset = dataset.map(lambda x: tf.py_function(slow_identity, [x], tf.int64))

# Cache the dataset (no filename given, so elements are kept in memory)
dataset = dataset.cache()

# Iterate twice (simulating 2 epochs). The first pass pays the preprocessing
# cost and fills the cache; the second pass reads the cached elements.
for epoch in range(2):
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    for item in dataset:
        print(f"Epoch {epoch+1}: Item {item.numpy()}")
    print("Time:", round(time.perf_counter() - start, 2), "seconds\n")


Epoch 1: Item 0
Epoch 1: Item 1
Epoch 1: Item 2
Epoch 1: Item 3
Epoch 1: Item 4
Time: 2.53 seconds

Epoch 2: Item 0
Epoch 2: Item 1
Epoch 2: Item 2
Epoch 2: Item 3
Epoch 2: Item 4
Time: 0.0 seconds



In [None]:
# Use Case: Caching in Training Pipelines
#
# A common usage pattern. `images`, `labels` and `preprocess_function` are small
# synthetic stand-ins so that this cell runs on a fresh kernel (Restart & Run
# All); replace them with your real data and preprocessing.
images = tf.random.uniform((100, 28, 28, 1), maxval=256, dtype=tf.int32)
labels = tf.random.uniform((100,), maxval=10, dtype=tf.int32)


def preprocess_function(image, label):
    """Scale integer pixel values to floats in [0, 1]; the label passes through unchanged."""
    return tf.cast(image, tf.float32) / 255.0, label


train_dataset = (
    tf.data.Dataset.from_tensor_slices((images, labels))
    # Preprocess elements in parallel; output order is still deterministic.
    .map(preprocess_function, num_parallel_calls=tf.data.AUTOTUNE)
    .cache()  # Cache after preprocessing so later epochs skip the map step
    .shuffle(1000)  # Shuffle after cache so every epoch gets a fresh order
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)
