## tensorflow pipelines

when we are training a model ,our dataset is on hard disk, we have to convert it to number then process it on RAM

it is fine when we have small dataset, but when we have millions of data and small storage on RAM , there is too much data for RAM to handle it

streaming approach : we feed data as several batches into RAM with special data structure(tf.data.Dataset) and train our model for each batch

if we want to clean our data we can use filter function

we can use map function to scale image data by dividing it by 255

then fit our model

benefits of tf pipelines:
1. handle huge datasets by streaming them from disk using batching
2. apply transformations to make dataset ready for model training

In [27]:
import tensorflow as tf

In [28]:
daily_sales_numbers = [21, 22, -108, 31, -1, 32, 34,31] # we have errors in our data(negative values)

In [29]:
tf_dataset = tf.data.Dataset.from_tensor_slices(daily_sales_numbers) # create tf dataset
tf_dataset

<TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int32, name=None)>

In [30]:
for sales in tf_dataset:
    print(sales.numpy()) # it is tensor object and we are converting it to numpy object

21
22
-108
31
-1
32
34
31


In [31]:
for sales in tf_dataset.as_numpy_iterator(): # we can do this to iterate it as numpy object
    print(sales)

21
22
-108
31
-1
32
34
31


In [32]:
for sales in tf_dataset.take(3).as_numpy_iterator(): # it will take first 3 elements
    print(sales)

21
22
-108


In [33]:
tf_dataset = tf_dataset.filter(lambda x: x>0) # filtering error data points

In [34]:
# these numbers are in US dollars, and we want to convert it into IR tooman

tf_dataset = tf_dataset.map(lambda x: x*60000)

In [35]:
# shuffle the elements of dataset

tf_dataset = tf_dataset.shuffle(buffer_size=7) # for perfect shuffle buffer size have to be greater than dataset size

In [36]:
# batching:

for sales_batch in tf_dataset.batch(2).as_numpy_iterator(): # batch size is 2
    print(sales_batch)

[1920000 1860000]
[1320000 2040000]
[1260000 1860000]


In [37]:
# we can do all this functions in one single line:

tf_dataset = tf.data.Dataset.from_tensor_slices(daily_sales_numbers)

tf_dataset = tf_dataset.filter(lambda x: x>0).map(lambda y: y*60000).shuffle(2).batch(2)

for sales in tf_dataset.as_numpy_iterator():
    print(sales)

[1260000 1860000]
[1920000 2040000]
[1320000 1860000]


In [38]:
# go to our dataset directory and read the images
images_ds = tf.data.Dataset.list_files('.\\datasets\\tensorflow_input_pipeline\\*\\*', shuffle=False)

In [39]:
for file in images_ds.take(3):
    print(file.numpy())

b'.\\datasets\\tensorflow_input_pipeline\\cat\\20 Reasons Why Cats Make the Best Pets....jpg'
b'.\\datasets\\tensorflow_input_pipeline\\cat\\7 Foods Your Cat Can_t Eat.jpg'
b'.\\datasets\\tensorflow_input_pipeline\\cat\\A cat appears to have caught the....jpg'


In [40]:
images_ds = images_ds.shuffle(200)

for file in images_ds.take(3):
    print(file.numpy())

b'.\\datasets\\tensorflow_input_pipeline\\dog\\45 Best Large Dog Breeds - Top Big Dogs_yyth....jpg'
b'.\\datasets\\tensorflow_input_pipeline\\dog\\why dogs understand our body language....jpg'
b'.\\datasets\\tensorflow_input_pipeline\\dog\\Rottweiler Dog Breed Information....jpg'


In [41]:
class_names = ['cat', 'dog']

In [42]:
len(images_ds)

121

In [43]:
train_size = int(len(images_ds)*0.8)

In [44]:
train_ds = images_ds.take(train_size)

test_ds = images_ds.skip(train_size)  # skip function will skip n number of samples and take the remaining

In [45]:
# from image path we can take label
import os
def get_label(file_path):
    return tf.strings.split(file_path, os.path.sep)[-2]

In [46]:
for label in train_ds.map(get_label):
    print(label)

tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=

In [50]:
# function that returns our x_train and y_train
def process_image(file_path):
    label = get_label(file_path)
    img = tf.io.read_file(file_path) # this will read our files
    img = tf.image.decode_jpeg(img)
    img = tf.image.resize(img, [128,128])

    return img, label

In [51]:
for img, label in train_ds.map(process_image).take(3):
    print(img)
    print(label)

tf.Tensor(
[[[ 24.         19.         15.       ]
  [ 24.         19.         15.       ]
  [ 21.863281   18.863281   13.863281 ]
  ...
  [143.92188   142.92188   148.92188  ]
  [153.92847   151.08228   158.00537  ]
  [104.69971    99.51221    97.07471  ]]

 [[ 22.076904   17.076904   13.076904 ]
  [ 24.410156   19.410156   15.410156 ]
  [ 18.07373    15.07373    10.07373  ]
  ...
  [144.7373    143.7373    149.7373   ]
  [150.03638   154.59888   157.59888  ]
  [123.66211   117.66211   120.03711  ]]

 [[ 22.1875     17.1875     13.1875   ]
  [ 21.615479   16.615479   12.6154785]
  [ 20.         17.         12.       ]
  ...
  [147.12085   146.12085   152.12085  ]
  [147.23486   148.23486   152.23486  ]
  [152.54517   150.54517   155.031    ]]

 ...

 [[138.17798   143.17798   146.17798  ]
  [144.43701   149.1206    152.1206   ]
  [163.23169   169.18091   172.8645   ]
  ...
  [128.80176   132.43457   137.11816  ]
  [153.0188    150.84082   161.84082  ]
  [100.593994  102.22681   107.85

In [52]:
# now we have to scale x_train

def scale(image, label):
    return image/255, label

In [53]:
train_ds = train_ds.map(process_image)
test_ds = test_ds.map(process_image)

In [54]:
train_ds = train_ds.map(scale)
test_ds = test_ds.map(scale)

In [55]:
for image, label in train_ds.take(5):
    print(">>>image: ",image.numpy()[0][0])
    print(">>>label: ",label.numpy())

>>>image:  [0.3987501 0.5242403 0.6222795]
>>>label:  b'cat'
>>>image:  [0.63529414 0.6392157  0.6156863 ]
>>>label:  b'dog'
>>>image:  [0.3577304 0.6239124 0.4327638]
>>>label:  b'dog'
>>>image:  [0.8718137  0.91495097 0.9227941 ]
>>>label:  b'dog'
>>>image:  [0.03924632 0.07846201 0.04316789]
>>>label:  b'cat'


In [56]:
# we can merge all functions into one