# Tensorflow input data pipeline

## On simple lists

In [1]:
import tensorflow as tf

In [2]:
daily_sales_numbers = [21, 22, -108, 31, -1, 32, 34,31]


### Create tf dataset

In [3]:
# Tensorflow.data is an API that introduces an abstraction, Tensorflow dataset which is a special datastructure that stores data loaded from disk, on which model needs to be trained.
tf_dataset = tf.data.Dataset.from_tensor_slices(daily_sales_numbers)

In [4]:
for dataset in tf_dataset:
    print(dataset)

tf.Tensor(21, shape=(), dtype=int32)
tf.Tensor(22, shape=(), dtype=int32)
tf.Tensor(-108, shape=(), dtype=int32)
tf.Tensor(31, shape=(), dtype=int32)
tf.Tensor(-1, shape=(), dtype=int32)
tf.Tensor(32, shape=(), dtype=int32)
tf.Tensor(34, shape=(), dtype=int32)
tf.Tensor(31, shape=(), dtype=int32)


In [5]:
for dataset in tf_dataset.as_numpy_iterator():
    print(dataset)

21
22
-108
31
-1
32
34
31


### Create filter 

In [6]:
tf_dataset = tf_dataset.filter(lambda x: x>0)

In [7]:
for dataset in tf_dataset.as_numpy_iterator():
    print(dataset)

21
22
31
32
34
31


### Create transformation

In [8]:
tf_dataset = tf_dataset.map(lambda x: x*75)

In [9]:
for dataset in tf_dataset.as_numpy_iterator():
    print(dataset)

1575
1650
2325
2400
2550
2325


### Another Operation - Shuffle

In [10]:
tf_dataset = tf_dataset.shuffle(3)

In [11]:
for dataset in tf_dataset.as_numpy_iterator():
    print(dataset)

1650
2325
1575
2400
2550
2325


### Batching

In [12]:
tf_dataset = tf_dataset.batch(2)

In [13]:
for dataset in tf_dataset.as_numpy_iterator():
    print(dataset)

[1650 2325]
[2400 2325]
[2550 1575]


## One liner

In [14]:
tf_dataset = tf.data.Dataset.from_tensor_slices(daily_sales_numbers).filter(lambda x: x>0).map(lambda x: x*75).shuffle(3).batch(2)

In [15]:
for dataset in tf_dataset.as_numpy_iterator():
    print(dataset)

[1575 2325]
[1650 2325]
[2550 2400]


## On Images
We have bulk data, the input pipeline should be able to load the data and its corresponding labels and split the data into train and test set. Then it should read and decode the image into pixel matrix. Then it should be scaled in the range of 0 to 1. 
Data in this form can be consumed for training, hence called input data pipeline.

### Get data

In [102]:
dataset_url=  "https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz"
data_dir = tf.keras.utils.get_file('flower_photos', origin=dataset_url,  cache_dir='.', untar=True)
# cache_dir indicates where to download data. I specified . which means current directory
# untar true will unzip it

In [103]:
data_dir

'.\\datasets\\flower_photos'

### Get Images

In [113]:
tf_dataset = tf.data.Dataset.list_files('.\\datasets\\flower_photos\\*\\*', shuffle = False)

### Convert paths from binary to string

In [114]:
def show_data(tf_dataset):
    for dataset in tf_dataset.as_numpy_iterator():
        print(dataset)

In [115]:
show_data(tf_dataset)

b'.\\datasets\\flower_photos\\daisy\\100080576_f52e8ee070_n.jpg'
b'.\\datasets\\flower_photos\\daisy\\10140303196_b88d3d6cec.jpg'
b'.\\datasets\\flower_photos\\daisy\\10172379554_b296050f82_n.jpg'
b'.\\datasets\\flower_photos\\daisy\\10172567486_2748826a8b.jpg'
b'.\\datasets\\flower_photos\\daisy\\10172636503_21bededa75_n.jpg'
b'.\\datasets\\flower_photos\\daisy\\102841525_bd6628ae3c.jpg'
b'.\\datasets\\flower_photos\\daisy\\1031799732_e7f4008c03.jpg'
b'.\\datasets\\flower_photos\\daisy\\10391248763_1d16681106_n.jpg'
b'.\\datasets\\flower_photos\\daisy\\10437754174_22ec990b77_m.jpg'
b'.\\datasets\\flower_photos\\daisy\\10437770546_8bb6f7bdd3_m.jpg'
b'.\\datasets\\flower_photos\\daisy\\10437929963_bc13eebe0c.jpg'
b'.\\datasets\\flower_photos\\daisy\\10466290366_cc72e33532.jpg'
b'.\\datasets\\flower_photos\\daisy\\10466558316_a7198b87e2.jpg'
b'.\\datasets\\flower_photos\\daisy\\10555749515_13a12a026e.jpg'
b'.\\datasets\\flower_photos\\daisy\\10555815624_dc211569b0.jpg'
b'.\\datasets\\flo

### Get Binary class label for given input image path

In [158]:
import os
def get_label(path):
    parts = tf.strings.split(path, sep=os.path.sep)
    return parts[-2]

In [203]:
get_label('b\'.\\datasets\\flower_photos\\dandelion\\177851662_b2622b4238_n.jpg')

<tf.Tensor: shape=(), dtype=string, numpy=b'dandelion'>

In [50]:
img_count = len(tf_dataset)

In [51]:
img_count

3670

In [52]:
train_size = int(img_count*0.8)

In [53]:
train_ds = tf_dataset.take(train_size)

In [54]:
len(train_ds)

2936

In [55]:
test_ds = tf_dataset.skip(train_size)

In [120]:
len(test_ds)

734

In [119]:
show_data(train_ds)

b'.\\datasets\\flower_photos\\daisy\\100080576_f52e8ee070_n.jpg'
b'.\\datasets\\flower_photos\\daisy\\10140303196_b88d3d6cec.jpg'
b'.\\datasets\\flower_photos\\daisy\\10172379554_b296050f82_n.jpg'
b'.\\datasets\\flower_photos\\daisy\\10172567486_2748826a8b.jpg'
b'.\\datasets\\flower_photos\\daisy\\10172636503_21bededa75_n.jpg'
b'.\\datasets\\flower_photos\\daisy\\102841525_bd6628ae3c.jpg'
b'.\\datasets\\flower_photos\\daisy\\1031799732_e7f4008c03.jpg'
b'.\\datasets\\flower_photos\\daisy\\10391248763_1d16681106_n.jpg'
b'.\\datasets\\flower_photos\\daisy\\10437754174_22ec990b77_m.jpg'
b'.\\datasets\\flower_photos\\daisy\\10437770546_8bb6f7bdd3_m.jpg'
b'.\\datasets\\flower_photos\\daisy\\10437929963_bc13eebe0c.jpg'
b'.\\datasets\\flower_photos\\daisy\\10466290366_cc72e33532.jpg'
b'.\\datasets\\flower_photos\\daisy\\10466558316_a7198b87e2.jpg'
b'.\\datasets\\flower_photos\\daisy\\10555749515_13a12a026e.jpg'
b'.\\datasets\\flower_photos\\daisy\\10555815624_dc211569b0.jpg'
b'.\\datasets\\flo

### Get training set

In [65]:
encoding = 'utf-8'
def get_training_set():
    img_count = len(tf_dataset)
    train_ds_b = tf_dataset.take(img_count*0.8)
    train_ds = [str(tds, encoding) for tds in train_ds_b]
    return train_ds
        
    

In [66]:
show_data(train_ds)

b'.\\datasets\\flower_photos\\daisy\\100080576_f52e8ee070_n.jpg'
b'.\\datasets\\flower_photos\\daisy\\10140303196_b88d3d6cec.jpg'
b'.\\datasets\\flower_photos\\daisy\\10172379554_b296050f82_n.jpg'
b'.\\datasets\\flower_photos\\daisy\\10172567486_2748826a8b.jpg'
b'.\\datasets\\flower_photos\\daisy\\10172636503_21bededa75_n.jpg'
b'.\\datasets\\flower_photos\\daisy\\102841525_bd6628ae3c.jpg'
b'.\\datasets\\flower_photos\\daisy\\1031799732_e7f4008c03.jpg'
b'.\\datasets\\flower_photos\\daisy\\10391248763_1d16681106_n.jpg'
b'.\\datasets\\flower_photos\\daisy\\10437754174_22ec990b77_m.jpg'
b'.\\datasets\\flower_photos\\daisy\\10437770546_8bb6f7bdd3_m.jpg'
b'.\\datasets\\flower_photos\\daisy\\10437929963_bc13eebe0c.jpg'
b'.\\datasets\\flower_photos\\daisy\\10466290366_cc72e33532.jpg'
b'.\\datasets\\flower_photos\\daisy\\10466558316_a7198b87e2.jpg'
b'.\\datasets\\flower_photos\\daisy\\10555749515_13a12a026e.jpg'
b'.\\datasets\\flower_photos\\daisy\\10555815624_dc211569b0.jpg'
b'.\\datasets\\flo

### Process image

In [148]:
#img_path = "b'.\\datasets\\flower_photos\\daisy\\14147016029_8d3cf2414e.jpg"

def process_image(file_path):
    label = get_label(file_path)
    img = tf.io.read_file(file_path) # load the raw data from the file as a string
    img = tf.image.decode_jpeg(img)
    img = tf.image.resize(img, [128, 128])
    return img, label

In [149]:
img, label = process_image(".\\datasets\\flower_photos\\dandelion\\177851662_b2622b4238_n.jpg")

In [151]:
label

'dandelion'

In [152]:
print(f'image : {img.numpy()[:2]}, label : {label}')

image : [[[ 2.        2.        2.      ]
  [ 2.4375    2.4375    2.4375  ]
  [ 2.4375    2.4375    2.4375  ]
  [ 2.4375    2.4375    2.4375  ]
  [ 2.        2.        2.      ]
  [ 0.859375  0.859375  0.859375]
  [ 0.53125   0.53125   0.53125 ]
  [ 0.        0.        0.      ]
  [ 0.        0.        0.      ]
  [ 0.328125  0.328125  0.328125]
  [ 0.        0.        0.      ]
  [ 1.        1.        1.      ]
  [ 2.        2.        2.      ]
  [ 2.671875  2.671875  2.671875]
  [ 2.578125  2.578125  2.578125]
  [ 0.875     0.875     0.875   ]
  [ 0.828125  0.828125  0.828125]
  [ 1.859375  1.859375  1.859375]
  [ 0.328125  0.328125  0.328125]
  [ 2.609375  2.609375  2.609375]
  [ 5.4375    5.4375    5.4375  ]
  [ 6.328125  6.328125  6.328125]
  [ 3.578125  3.578125  3.578125]
  [ 2.859375  2.859375  2.859375]
  [ 3.328125  3.328125  3.328125]
  [ 4.828125  4.828125  4.828125]
  [ 5.        5.        5.      ]
  [ 6.140625  6.140625  6.140625]
  [ 6.        6.        6.      ]
  [ 6.

### Scale

In [154]:
def scale(image, label):
    return image/255, label

In [156]:
scale(img, label)

(<tf.Tensor: shape=(128, 128, 3), dtype=float32, numpy=
 array([[[0.00784314, 0.00784314, 0.00784314],
         [0.00955882, 0.00955882, 0.00955882],
         [0.00955882, 0.00955882, 0.00955882],
         ...,
         [0.0127451 , 0.0127451 , 0.0127451 ],
         [0.02126225, 0.02126225, 0.02126225],
         [0.01960784, 0.01960784, 0.01960784]],
 
        [[0.01268382, 0.01268382, 0.01268382],
         [0.01789216, 0.01789216, 0.01789216],
         [0.02083333, 0.02083333, 0.02083333],
         ...,
         [0.0127451 , 0.0127451 , 0.0127451 ],
         [0.01862745, 0.01862745, 0.01862745],
         [0.01960784, 0.01960784, 0.01960784]],
 
        [[0.01415441, 0.01415441, 0.01415441],
         [0.02040441, 0.02040441, 0.02040441],
         [0.02671569, 0.02671569, 0.02671569],
         ...,
         [0.01256127, 0.01256127, 0.01256127],
         [0.01807598, 0.01807598, 0.01807598],
         [0.02040441, 0.02040441, 0.02040441]],
 
        ...,
 
        [[0.01102941, 0.01102941

## One Liner

In [169]:
tf_dataset = tf.data.Dataset.list_files('.\\datasets\\flower_photos\\*\\*', shuffle = False).map(lambda x: process_image(x)).map(lambda x,y: scale(x,y))

In [170]:
show_data(tf_dataset)

(array([[[0.53039217, 0.5382353 , 0.5264706 ],
        [0.5622281 , 0.5700712 , 0.5592333 ],
        [0.5763634 , 0.5842065 , 0.58028495],
        ...,
        [0.5991268 , 0.6030484 , 0.58344054],
        [0.6009804 , 0.6009804 , 0.59313726],
        [0.6004634 , 0.6004634 , 0.59262025]],

       [[0.5264706 , 0.53203124, 0.514882  ],
        [0.5605354 , 0.56381357, 0.55433136],
        [0.5732805 , 0.57655865, 0.57035464],
        ...,
        [0.6019608 , 0.60588235, 0.5862745 ],
        [0.59698606, 0.59698606, 0.5891429 ],
        [0.59037226, 0.59037226, 0.5825291 ]],

       [[0.5200521 , 0.52397364, 0.5043658 ],
        [0.5509766 , 0.5534735 , 0.5381396 ],
        [0.5655944 , 0.5655944 , 0.55775124],
        ...,
        [0.6138174 , 0.61773896, 0.5981311 ],
        [0.5947419 , 0.5947419 , 0.58689874],
        [0.5788756 , 0.5788756 , 0.57103246]],

       ...,

       [[0.16898361, 0.18466988, 0.0944738 ],
        [0.16613051, 0.18181679, 0.09162071],
        [0.16969593, 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



(array([[[0.5625613 , 0.6563249 , 0.78965825],
        [0.5778799 , 0.668076  , 0.8014093 ],
        [0.58457416, 0.67477024, 0.8002604 ],
        ...,
        [0.08731618, 0.10556904, 0.02542054],
        [0.03883775, 0.06685576, 0.00281863],
        [0.07377954, 0.09295343, 0.02958195]],

       [[0.5636233 , 0.65774095, 0.7910743 ],
        [0.57703763, 0.6696538 , 0.80298716],
        [0.58639705, 0.6765931 , 0.8020833 ],
        ...,
        [0.08449324, 0.10094544, 0.01624876],
        [0.08271987, 0.10234351, 0.03717831],
        [0.06089633, 0.08396116, 0.03783677]],

       [[0.5704113 , 0.66060746, 0.7939408 ],
        [0.5828946 , 0.6745    , 0.8078333 ],
        [0.5903299 , 0.68052596, 0.80601615],
        ...,
        [0.07056478, 0.0823448 , 0.00787377],
        [0.05404412, 0.07729779, 0.0159577 ],
        [0.06505845, 0.09220306, 0.04411789]],

       ...,

       [[0.12327187, 0.15436007, 0.01495361],
        [0.11357565, 0.11382386, 0.03274141],
        [0.16567646, 

## For Text dataset
We will filter the txt file for empty ones, then split into text and labels and scale.

In [224]:
tf_dataset = tf.data.Dataset.list_files("C:/Users/swati/datasets/reviews/*/*", shuffle = False)

In [225]:
show_data(tf_dataset)

b'C:\\Users\\swati\\datasets\\reviews\\negative\\neg_1.txt'
b'C:\\Users\\swati\\datasets\\reviews\\negative\\neg_2.txt'
b'C:\\Users\\swati\\datasets\\reviews\\negative\\neg_3.txt'
b'C:\\Users\\swati\\datasets\\reviews\\positive\\pos_1.txt'
b'C:\\Users\\swati\\datasets\\reviews\\positive\\pos_2.txt'
b'C:\\Users\\swati\\datasets\\reviews\\positive\\pos_3.txt'


In [None]:
convert_abs(b'C:\Users\swati\datasets\reviews\positive\pos_3.txt')

### Filter for null reviews

In [207]:
import os
def filter_null(path):
    if len(str(tf.io.read_file(path).numpy(), encoding)) == 0:
        return False
    else:
        return True

### Get review type and Text

In [208]:
def get_review(path):
    review = tf.strings.split(path, sep = os.path.sep)[5]
    review_text = str(tf.io.read_file(path).numpy(),encoding)
    
    return review, review_text
    

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [None]:
tf.data.Dataset.list_files("C:/Users/swati/datasets/reviews/*/*", shuffle = False).map(lambda x: convert_abs(x)).map(lambda x: filter_null(x)).map(lambda x: get_review(x))

Cause: could not parse the source code of <function <lambda> at 0x000001EC7E465160>: found multiple definitions with identical signatures at the location. This error may be avoided by defining each lambda on a single line and with unique argument names. The matching definitions were:
Match 0:
lambda x: get_review(x)

Match 1:
lambda x: filter_null(x)

Match 2:
lambda x: convert_abs(x)

Cause: could not parse the source code of <function <lambda> at 0x000001EC7E465160>: found multiple definitions with identical signatures at the location. This error may be avoided by defining each lambda on a single line and with unique argument names. The matching definitions were:
Match 0:
lambda x: get_review(x)

Match 1:
lambda x: filter_null(x)

Match 2:
lambda x: convert_abs(x)

