In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        input_path = os.path.join(dirname, filename)
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [5]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
# Ignore every Python warning raised from this point on, to keep cell output short.
# NOTE(review): this also hides deprecation warnings — confirm that is intended.
warnings.filterwarnings('ignore')

In [6]:
# Toy series of daily sales figures used by the tf.data walkthrough below.
# The negative entries are invalid values that a later cell filters out.
daily_sales_number = [
    21, 22, -108, 31,
    -1, 32, 34, 31,
]

In [7]:
import tensorflow as tf

tf_dataset = tf.data.Dataset.from_tensor_slices(daily_sales_number)
# tf_dataset is a tf.data.Dataset with one element per entry of the list.
# The values are stored as tensors, so they have to be iterated (see the next
# cell) to be displayed as plain numbers.

In [14]:
# Method 1: iterate over the dataset directly. Each element is a tensor,
# so .numpy() is called to get the plain value.
print("Method 1:")
for sales in tf_dataset:
    print(sales.numpy())

# Method 2: ask the dataset for an iterator that already yields NumPy values.
print("Method 2:")
for sales in tf_dataset.as_numpy_iterator():
    print(sales)


Method 2:
21
22
-108
31
-1
32
34
31
method 2:
21
22
-108
31
-1
32
34
31


In [17]:
for sales in tf_dataset.take(3):
    print(sales.numpy())

# take(n) yields only the first n elements of the dataset, in their original order
# (comparable to DataFrame.head(n), not to a random DataFrame.sample(n)).

21
22
-108


In [22]:
# A sales figure cannot be negative, so keep only the strictly positive values.
def _is_positive(x):
    return x > 0

tf_dataset = tf_dataset.filter(_is_positive)

for sales in tf_dataset:
    print(sales.numpy())

# The negative samples no longer appear in the output.

21
22
31
32
34
31


In [25]:
# The figures are in US dollars; map() applies the conversion to Indian rupees
# to every element.
# NOTE: this cell rebinds tf_dataset to the converted dataset, so running it a
# second time without rebuilding tf_dataset multiplies the values by 72 again.
usd_to_inr = 72  # 1 dollar = 72 rupees

tf_dataset = tf_dataset.map(lambda amount: amount * usd_to_inr)

for sales in tf_dataset:
    print(sales.numpy())


1512
1584
2232
2304
2448
2232


In [26]:
# Shuffle the elements, e.g. before feeding them to a training loop.
# With buffer_size=2 each output is picked at random from just two pending
# elements, so the resulting order is only approximately random.
tf_dataset = tf_dataset.shuffle(buffer_size=2)

for sales in tf_dataset:
    print(sales.numpy())

# The elements now come out in a randomly re-arranged order.

1512
2232
2304
1584
2448
2232


In [28]:
# Group consecutive elements into batches of two.
# tf_dataset was shuffled in an earlier cell and each loop below iterates it
# separately, so the two listings may show the values in different orders.
print("before batching:")
for sales_batch in tf_dataset:
    print(sales_batch.numpy())


print("after batching:")
batched_ds = tf_dataset.batch(2)
for sales_batch in batched_ds:
    print(sales_batch.numpy())

before batching:
1512
1584
2232
2448
2232
2304
after batching:
[1512 1584]
[2304 2448]
[2232 2232]


In [33]:
# The same steps written as one chained pipeline, starting again from the raw list.
tf_dataset = (
    tf.data.Dataset.from_tensor_slices(daily_sales_number)
    .filter(lambda x: x > 0)  # keep positive sales only
    .map(lambda y: y * 72)    # dollars -> rupees
    .shuffle(2)
    .batch(2)
)

for sales in tf_dataset:
    print(sales.numpy())


[1584 2232]
[1512 2304]
[2232 2448]


In [None]:
# Build a dataset of image file paths (one string element per file).
# list_files() matches glob patterns, so the "*/*" wildcards are needed to reach the
# files stored inside the per-class sub-folders; the bare directory name "/images"
# does not match them.
# NOTE(review): "/images" is kept as the root from the original cell — point it at the
# folder that actually holds the class sub-directories (e.g. cat/ and dog/).
images_ds = tf.data.Dataset.list_files("/images/*/*", shuffle=False)


In [None]:
# Number of file paths in the dataset; a later cell uses it to size the train/test split.
image_count = len(images_ds)
image_count

In [None]:
# Show the Python type of the object returned by list_files().
type(images_ds)

In [None]:
# Peek at the first three file paths; each element is a tensor, hence .numpy().
first_paths = images_ds.take(3)
for file in first_paths:
    print(file.numpy())

In [None]:
# Shuffle the file paths (200 is the shuffle buffer size).
# reshuffle_each_iteration=False pins the shuffled order: by default shuffle() draws a
# new order every time the dataset is iterated, so the take()/skip() split made from
# images_ds in a later cell would be re-drawn on each pass and the same file could
# end up in both the training and the test subset.
images_ds = images_ds.shuffle(200, reshuffle_each_iteration=False)
for file in images_ds.take(3):
    print(file.numpy())

In [None]:
# Names of the two image classes.
# NOTE(review): not referenced by any other cell visible in this notebook.
class_names = ["cat","dog"]

In [None]:
# 80/20 split: the first train_size paths go to training, the remainder to testing.
# NOTE: take()/skip() only give disjoint subsets if images_ds yields its elements
# in the same order every time it is iterated.
train_fraction = 0.8
train_size = int(image_count * train_fraction)
train_ds = images_ds.take(train_size)
test_ds = images_ds.skip(train_size)

In [None]:

# Number of elements in the training split.
len(train_ds)

In [None]:
def get_label(file_path):
    """Return the class label encoded in an image path.

    The label is the name of the directory that directly contains the file,
    i.e. the second-to-last component of the path.

    Args:
        file_path: Path string (or string tensor) laid out as <root>/<label>/<file>.

    Returns:
        A string tensor holding the parent directory name.
    """
    # Convert backslashes to "/" first so that Windows-style paths split the same
    # way on every host OS (splitting on os.path.sep only works for paths written
    # with the separator of the machine running the notebook).
    normalized = tf.strings.regex_replace(file_path, r"\\", "/")
    parts = tf.strings.split(normalized, "/")
    return parts[-2]

In [None]:
# Try get_label() on a hand-written, Windows-style path.
sample_path = "images\\dog\\20 Reasons Why Cats Make the Best Pets....jpg"
get_label(sample_path)

In [None]:
def process_image(file_path):
    """Load one JPEG file and return it together with its class label.

    Args:
        file_path: Path of the image file, laid out as <root>/<label>/<file>.

    Returns:
        (img, label): img is the decoded picture resized to 128x128 with three
        colour channels; label is whatever get_label() returns for file_path.
    """
    label = get_label(file_path)
    img = tf.io.read_file(file_path)  # raw file contents as a string tensor
    # channels=3 forces RGB output; without it a grayscale JPEG decodes to a single
    # channel and the dataset would yield images of differing shapes.
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, [128, 128])
    return img, label

In [None]:
# Run the loader on one hand-written path and show the first two pixel rows.
sample_path = "images\\cat\\20 Reasons Why Cats Make the Best Pets....jpg"
img, label = process_image(sample_path)
img.numpy()[:2]

In [None]:
# Turn every file path into an (image, label) pair.
# num_parallel_calls lets tf.data process several files concurrently; without it
# map() handles the elements one after another.
train_ds = train_ds.map(process_image, num_parallel_calls=tf.data.AUTOTUNE)
test_ds = test_ds.map(process_image, num_parallel_calls=tf.data.AUTOTUNE)

In [None]:
# Inspect one processed training example: the image tensor and its label tensor.
one_example = train_ds.take(1)
for image, label in one_example:
    print("****", image)
    print("****", label)

In [None]:
def scale(image, label):
    """Divide the image values by 255 and pass the label through unchanged.

    Args:
        image: Pixel values (any object supporting division by a number).
        label: Label belonging to the image.

    Returns:
        (scaled_image, label)
    """
    scaled_image = image / 255
    return scaled_image, label

In [None]:
# Apply the same 0-1 scaling to both splits so that training and test images share
# the same value range (scaling only train_ds would leave test_ds at 0-255).
# NOTE: this cell rebinds the datasets, so running it twice divides by 255 twice.
train_ds = train_ds.map(scale)
test_ds = test_ds.map(scale)

In [None]:
# Show the top-left pixel and the label of five scaled training examples.
for image, label in train_ds.take(5):
    first_pixel = image.numpy()[0][0]
    print("****Image: ", first_pixel)
    print("****Label: ", label.numpy())

## **Explanation of Image Dataset Pipeline**

The second part of the code focuses on using the `tf.data` API to efficiently load and preprocess a large collection of images, which is essential for deep learning.

### 1\. Data Ingestion and Splitting

| Code | Concept | Explanation | Rationale |
| :--- | :--- | :--- | :--- |
| `images_ds = tf.data.Dataset.list_files('images/*/*', shuffle=False)` | **Listing Files** | Creates a dataset where each element is a **file path string** (a TensorFlow `string` tensor). The `*/*` pattern searches for files within immediate subdirectories (e.g., `cat/` and `dog/`) inside the `images/` directory. | It avoids loading all image data into memory at once, consuming only the file paths. This is the **starting point** for efficient, scalable image loading. |
| `images_ds = images_ds.shuffle(200)` | **Shuffling** | Randomly shuffles the order of the file paths. The argument `200` is the buffer size. | Ensures that batches drawn during training are **not sequential** (e.g., all "cat" images followed by all "dog" images), which is crucial for **preventing bias** and aiding generalization. |
| `train_ds = images_ds.take(train_size)`<br>`test_ds = images_ds.skip(train_size)` | **Splitting** | Splits the shuffled file path dataset into training (80%) and testing (20%) datasets. | Standard practice in machine learning to evaluate the model's performance on **unseen data** (`test_ds`) after it has been trained on the majority of the data (`train_ds`). |



### 2\. Preprocessing Functions

These Python functions define the *transformations* that will be applied to every element (file path) in the dataset.

| Function | Concept | Explanation | Rationale |
| :--- | :--- | :--- | :--- |
| `get_label(file_path)` | **Extracting Label** | Splits the file path string based on the path separator (`os.path.sep`) and returns the **second-to-last part**. For a path like `images/dog/file.jpg`, this extracts the directory name: **`dog`**. | This leverages the common practice of organizing image datasets by putting images for each class into a directory named after that class. |
| `process_image(file_path)` | **Image Loading & Decoding** | 1. Calls `get_label` to get the class name. 2. `tf.io.read_file`: Reads the raw bytes of the image file. 3. `tf.image.decode_jpeg`: Decodes the raw bytes into a full-color tensor. 4. `tf.image.resize`: Resizes the image to a fixed target size of $128 \times 128$. | This is the heavy lifting: it converts a simple file path into a usable, consistent tensor format. Resizing is essential because deep learning models require **fixed-size inputs**. |



### 3\. Dataset Mapping and Transformation

The following steps apply the preprocessing functions to the entire dataset using the `map` function.

| Code | Concept | Explanation | Rationale |
| :--- | :--- | :--- | :--- |
| `train_ds = train_ds.map(process_image)`<br>`test_ds = test_ds.map(process_image)` | **Mapping (Applying `process_image`)** | The `map` function applies the `process_image` function (which returns `(image_tensor, label_tensor)`) to every file path in the training and testing datasets. | The transformation is **lazy**: files are read and resized only when the dataset is iterated. `map` processes elements one at a time unless `num_parallel_calls` is passed (e.g. `tf.data.AUTOTUNE`), in which case TensorFlow runs the heavy I/O and CPU work (reading and resizing) in parallel in the background, preparing the data for the GPU/TPU training step. |
| `def scale(image, label):`<br>`return image/255, label` | **Scaling Function** | Defines a function to normalize the pixel values from the standard $0-255$ range to the $0.0-1.0$ range. | **Normalization** is critical for model stability and convergence. Scaling inputs to a small, consistent range ensures that gradients are well-behaved during backpropagation. |
| `train_ds = train_ds.map(scale)` | **Applying Scaling** | Applies the `scale` function to the output of the `process_image` function. The dataset now yields normalized image tensors and label strings. | Completes the standard preprocessing steps, making the image data ready for a neural network. |



### 4\. Final Preparation (Missing but Necessary)

Although the code stops at the `scale` step, a full machine learning pipeline would require these final two steps:

1.  **Label Encoding:** The labels (`b'cat'`, `b'dog'`) need to be converted to numerical one-hot vectors (e.g., $[1, 0]$ for cat, $[0, 1]$ for dog).
2.  **Batching and Prefetching:** The dataset must be grouped into batches for model training, and prefetching should be enabled to overlap data loading with model computation.

This setup showcases a powerful, idiomatic way to handle large datasets in TensorFlow by leveraging the **lazy evaluation** and **parallelism** of the `tf.data.Dataset` API.