In [11]:
import sys
import numpy as np
import matplotlib.pyplot as plt

# Definition

Generators are functions in python that behave like iterators. The key aspect about iterators is that, as the name implies, we can iterate through them. This means that they remember their internal state. Functions do not remember their state.

In addition to iterability, generators are useful because they can be more memory efficient than using standard functions, and use lazy evaluation, i.e., they only compute when needed.

Imagine for example, that we wanted to return a list of the squares of numbers up to n.

We can write a simple function that does this, as well as a generator. Let's see what the differences are.

In [36]:
#traditional function    
def return_squares(n):
    squares = [n**2 for n in np.arange(n)]
    return squares


# Usage
n_squares = return_squares(500)
print(len(n_squares))

500


Here, notice that the length of n_squares is 500, and it is all stored in memory. But what if we didn't need that - we just needed to know the square of the number, let's say m (m<=n), and then we needed m+1, m+2, etc.? A generator solves this problem.

In [39]:

#Define the generator
def create_squares_generator(n):
    for i in range(n):
        yield i**2

n_squares_from_generator = create_squares_generator(500) #create a generator

#Now iterate through it
#not held in memory!
for i in n_squares_from_generator: 
    print(i)

0
1
4
9
16
25
36
49
64
81
100
121
144
169
196
225
256
289
324
361
400
441
484
529
576
625
676
729
784
841
900
961
1024
1089
1156
1225
1296
1369
1444
1521
1600
1681
1764
1849
1936
2025
2116
2209
2304
2401
2500
2601
2704
2809
2916
3025
3136
3249
3364
3481
3600
3721
3844
3969
4096
4225
4356
4489
4624
4761
4900
5041
5184
5329
5476
5625
5776
5929
6084
6241
6400
6561
6724
6889
7056
7225
7396
7569
7744
7921
8100
8281
8464
8649
8836
9025
9216
9409
9604
9801
10000
10201
10404
10609
10816
11025
11236
11449
11664
11881
12100
12321
12544
12769
12996
13225
13456
13689
13924
14161
14400
14641
14884
15129
15376
15625
15876
16129
16384
16641
16900
17161
17424
17689
17956
18225
18496
18769
19044
19321
19600
19881
20164
20449
20736
21025
21316
21609
21904
22201
22500
22801
23104
23409
23716
24025
24336
24649
24964
25281
25600
25921
26244
26569
26896
27225
27556
27889
28224
28561
28900
29241
29584
29929
30276
30625
30976
31329
31684
32041
32400
32761
33124
33489
33856
34225
34596
34969
35344
35721
36100


In [40]:
n_squares_from_generator

<generator object create_squares_generator at 0x11a849fc0>

In [41]:
print("Size of function method is {} and the size of the generator method is {}".format(
    sys.getsizeof(n_squares), sys.getsizeof(n_squares_from_generator)))

Size of function method is 4216 and the size of the generator method is 104


In [43]:
#Restart the generator
n_squares_from_generator = create_squares_generator(5) #create a generator

In [49]:
next(n_squares_from_generator)

StopIteration: 

# Machine Learning Usecase

Here is one way we can use it. Imagine that we want to train an ML model, and we have some images and labels associated with those images. We want to augment the images, by rotations, croppings, and noise additions.

Let's try to write a generator that will return a new batch of training data for our ML model.

Here are steps for you to try

1. Load some images, say 10 images. Resize them so they are all the same size. Call this X_train.
2. Create some labels, you can just randomize the labels for now, call this y_train.
3. Write the generator function, that takes as input the data and the number of batches you want the generator to generate
4. Use the generator to generate batches, and plot them using a plot_batch() function that will take as input the data and labels, and plot hte data with labels as the titles.
   



In [63]:
import SciFiReaders as sr

sample_data_path = r'/Users/rvv/ORNL Dropbox/Rama Vasudevan/DTMicroscope Data/PFM Images/pfm_images.h5'
nsid_reader = sr.NSIDReader(sample_data_path)
sample_data = nsid_reader.read()

In [70]:
data = [np.array(sample_data[key]) for key in sample_data.keys()]

In [71]:
data[0]

array([[ 7.13384907e-10,  8.57340865e-10,  6.92040203e-10, ...,
         5.45185230e-10,  6.34400976e-10,  6.40966391e-10],
       [ 6.73765044e-10,  8.85449936e-10,  7.89242449e-10, ...,
         6.27522923e-10,  6.47503384e-10,  6.55973054e-10],
       [ 7.73923148e-10,  9.13559006e-10,  8.16527290e-10, ...,
         6.40000053e-10,  6.60605792e-10,  6.70951295e-10],
       ...,
       [ 2.50150833e-09,  2.58680188e-09,  2.56807198e-09, ...,
        -3.21665539e-09, -3.31698402e-09, -3.04126502e-09],
       [ 2.60163802e-09,  2.68477152e-09,  2.66524580e-09, ...,
        -3.13431769e-09, -3.30388161e-09, -3.09614734e-09],
       [ 2.77165668e-09,  2.78274115e-09,  2.62267008e-09, ...,
        -3.12184056e-09, -3.29077920e-09, -3.08116910e-09]])

In [87]:
#Let's create X_train, which is a numpy array containing our training images

good_indices

[8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87]

In [86]:
random_selection

array([50, 51, 42, 12, 87, 33,  9, 35, 86, 64, 26, 32, 85, 11, 52, 79, 19,
       44, 71, 55, 34, 22, 67, 18, 23, 58, 65, 49, 37, 13, 66, 20, 43, 80,
       78, 17, 47, 40, 36, 62, 59, 27, 24, 61, 69, 57, 68, 10, 48, 72])

In [85]:
#training images array
training_data_size = 50

X_train = np.zeros(shape=(50,256,256)) #fix the size, and we will use 50 images.

good_indices = [ind for ind in range(len(data)) if data[ind].shape==(256,256)] #ensure data is the correct shape

#select 50 random images from the good index
random_selection = np.random.choice(good_indices, size=training_data_size, replace = False)

for ind, index in enumerate(random_selection):
    X_train[ind,:,:] = data[index]

In [105]:
#Now we need to create labels
y_train = np.zeros((training_data_size,3)) #we have 3 classes and 50 training data points

#Let's create some one-hot encoding first

#first label is [100]
#Second label is [010]
#Third label is [001]

def one_hot_encoder(class_classification, number_of_classes=3):
    one_hot_vec = np.zeros(number_of_classes)
    one_hot_vec[class_classification]=1
    return one_hot_vec


In [104]:
a = one_hot_encoder(2,5)
print(a)

[0. 0. 1. 0. 0.]


In [108]:
#populate with some random class assignments
or ind in range(training_data_size):
    y_train[ind,:] = one_hot_encoder(np.random.randint(low=0,high=3,),3)


In [109]:
#Now let's create the generator


array([[0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1

In [124]:
#Define the generator
def return_subset_images(X_train,y_train,number_of_batches=10, batch_size=8):
    for i in range(number_of_batches):
        #add your code for random rotations, flips, noise addition, zooms, etc.
        #Your code goes here
        
        random_choice = np.random.choice(np.arange(X_train.shape[0]),batch_size)
        yield X_train[random_choice,:,:], y_train[random_choice,:]
        

In [125]:
image_subset_gen = return_subset_images(X_train, y_train)

In [126]:
for ind, (xtrain,ytrain) in enumerate(image_subset_gen):
    print(ind, xtrain.shape, ytrain.shape)

0 (8, 256, 256) (8, 3)
1 (8, 256, 256) (8, 3)
2 (8, 256, 256) (8, 3)
3 (8, 256, 256) (8, 3)
4 (8, 256, 256) (8, 3)
5 (8, 256, 256) (8, 3)
6 (8, 256, 256) (8, 3)
7 (8, 256, 256) (8, 3)
8 (8, 256, 256) (8, 3)
9 (8, 256, 256) (8, 3)


In [None]:
# Plot the resulting images batch by batch, have a function that you can call that will plot each batch (plot_batch(X_train,y_train))