# Aim: to implement a CNN architecture as follows:
            1) The input has the format (n, height, width, depth);
               here n = 1000, height = 32, width = 32 and depth = 1.
            2) The output is assumed to have 10 classes, and we have to predict to which class each input image belongs.
            3) Initial shape of the input: (n = 1000, 32, 32, 1)
            4) The input is passed through a convolution layer with a filter of dimension (3 x 3 x 4) and "same" padding.
            5) Output shape: (1000 x 32 x 32 x 4)
            6) A ReLU layer is attached after the convolution layer.
            7) Output shape: (1000 x 32 x 32 x 4)
            8) A max pooling layer is attached.
            9) Output shape: (1000 x 16 x 16 x 4)
           10) The output is then flattened so that it can be passed through a fully connected layer.
           11) Output shape: (1000 x 1024)
           12) A fully connected layer is attached; it gives 10 outputs per image, each holding the probability of one class.
           13) Output shape: (1000 x 10)
           14) For each of the 1000 examples, the class with the maximum of the 10 probabilities is our prediction.
           15) Final prediction
           16) Intermediate values are stored in a variable called cache so that backpropagation can be implemented when required.

In [144]:
import numpy as np

# initialising input with format (No. of images,height,width,depth)

In [145]:
# Synthetic batch of 1000 single-channel 32x32 images, laid out as
# (n_images, height, width, depth). The values are drawn from a standard
# normal distribution so that every entry is a well-defined finite number
# (np.empty would only hand back whatever happened to be in memory).
x = np.random.randn(1000, 32, 32, 1)
height, width, depth = x.shape[1:]
x.shape

(1000, 32, 32, 1)

# Initialising weights bias stride and pad for convolution 

In [146]:
# Convolution parameters: four 3x3 filters over one input channel, with
# exactly one bias per filter (last axis of b matches last axis of w).
w = np.random.randn(3, 3, 1, 4)
b = np.random.randn(1, 1, 1, 4)
# "Same" padding for a 3x3 filter: pad 1 with stride 1 maps 32x32 -> 32x32.
hparameters = {"pad": 1,
               "stride": 1}

# function for padding

In [147]:
def zero_pad(X, pad):
    """Surround the two middle axes of a 4-axis array with zeros.

    Parameters
    ----------
    X : array with four axes; axes 1 and 2 are padded, axes 0 and 3 are not.
    pad : number of zeros added before and after each padded axis.

    Returns
    -------
    The padded copy of X.
    """
    untouched = (0, 0)
    both_sides = (pad, pad)
    pad_width = (untouched, both_sides, both_sides, untouched)
    return np.pad(X, pad_width, mode='constant', constant_values=0)

# convolution for single step 

In [148]:
def conv_single_step(a_slice_prev, W, b):
    """Apply one filter to one input window and return a single activation.

    Parameters
    ----------
    a_slice_prev : window of the input, broadcast-compatible with ``W``.
    W : filter weights.
    b : bias of this filter; must hold exactly one value
        (a scalar or e.g. an array of shape (1, 1, 1)).

    Returns
    -------
    Z : scalar equal to ``sum(a_slice_prev * W) + b``.

    Raises
    ------
    ValueError : if ``b`` holds more than one value.
    """
    # Sum the element-wise products first and add the bias exactly once.
    # Adding b before the sum would broadcast it over the whole window and
    # count it once per weight.
    Z = np.sum(np.multiply(a_slice_prev, W)) + np.asarray(b).item()
    return Z

# full convolution

In [149]:
def conv_forward(A_prev, W, b, hparameters):
    """Forward pass of a convolution layer.

    Parameters
    ----------
    A_prev : input batch with axes (examples, height, width, channels).
    W : filters with axes (f, f, input channels, output channels).
    b : biases, indexed per output channel along the last axis.
    hparameters : dict providing the integers "stride" and "pad".

    Returns
    -------
    Z : array of shape (examples, n_H, n_W, output channels).
    cache : tuple (A_prev, W, b, hparameters) kept for the backward pass.
    """
    (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
    (f, f, n_C_prev, n_C) = W.shape

    stride = hparameters['stride']
    pad = hparameters['pad']

    # Output size: how many windows fit along each padded axis.
    out_height = int((n_H_prev - f + 2 * pad) / stride) + 1
    out_width = int((n_W_prev - f + 2 * pad) / stride) + 1

    Z = np.zeros((m, out_height, out_width, n_C))
    A_prev_pad = zero_pad(A_prev, pad)

    for i in range(m):
        sample = A_prev_pad[i]
        for h in range(out_height):
            top = h * stride
            for col in range(out_width):
                left = col * stride
                # The window is the same for every filter, so slice it once.
                window = sample[top:top + f, left:left + f, :]
                for c in range(n_C):
                    Z[i, h, col, c] = conv_single_step(window, W[..., c], b[..., c])

    assert(Z.shape == (m, out_height, out_width, n_C))

    cache = (A_prev, W, b, hparameters)
    return Z, cache

In [150]:
# Run the convolution layer on the whole batch and keep its cache.
z, cache_conv = conv_forward(A_prev=x, W=w, b=b, hparameters=hparameters)
z.shape


(1000, 32, 32, 4)

# function for applying relu activation function 

In [151]:
def ReLU(x):
    """Rectified linear unit: element-wise ``max(x, 0)``.

    Parameters
    ----------
    x : scalar or array-like of numbers.

    Returns
    -------
    Values of ``x`` with every negative entry replaced by 0.
    """
    # np.maximum also maps -inf to 0; a mask multiplication would compute
    # inf * 0 there and produce NaN.
    return np.maximum(x, 0)

def activation(z):
    """Apply ReLU to ``z`` one leading-axis entry at a time, in place.

    The entries of ``z`` are overwritten with their rectified values and the
    same array object is returned.
    """
    n_entries = z.shape[0]
    for idx in range(n_entries):
        entry = z[idx]
        z[idx] = ReLU(entry)
    return z
    

In [152]:
# activation() rectifies z in place and returns it, so A and z are the same array.
A = activation(z=z)
A.shape

(1000, 32, 32, 4)

# pooling layer to change shape 

In [153]:
def pool_forward(A_prev, hparameters, mode = "max"):
    """Forward pass of a pooling layer.

    Parameters
    ----------
    A_prev : input with axes (examples, height, width, channels).
    hparameters : dict providing "f" (window size) and "stride".
    mode : "max" or "average".

    Returns
    -------
    A : array of shape (examples, n_H, n_W, channels) holding the pooled
        value of every window. Only A is returned (no cache).

    Raises
    ------
    ValueError : if ``mode`` is neither "max" nor "average".
    """
    reducers = {"max": np.max, "average": np.mean}
    if mode not in reducers:
        # An unknown mode would otherwise leave the output silently all-zero.
        raise ValueError(f'mode must be "max" or "average", got {mode!r}')
    reduce_window = reducers[mode]

    # Retrieve dimensions from the input shape
    (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape

    # Retrieve hyperparameters from "hparameters"
    f = hparameters["f"]
    stride = hparameters["stride"]

    # Define the dimensions of the output
    n_H = int(1 + (n_H_prev - f) / stride)
    n_W = int(1 + (n_W_prev - f) / stride)
    n_C = n_C_prev

    # Initialize output matrix A
    A = np.zeros((m, n_H, n_W, n_C))

    for h in range(n_H):                         # vertical axis of the output
        vert_start = h * stride
        vert_end = vert_start + f
        for w in range(n_W):                     # horizontal axis of the output
            horiz_start = w * stride
            horiz_end = horiz_start + f

            # One window across all examples and channels at once; reducing
            # over the two spatial axes pools each (example, channel) pair.
            window = A_prev[:, vert_start:vert_end, horiz_start:horiz_end, :]
            A[:, h, w, :] = reduce_window(window, axis=(1, 2))

    return A

In [154]:
# Pooling settings: 2x2 windows moved two pixels at a time.
hparameters = dict(f=2, stride=2)
A = pool_forward(A_prev=A, hparameters=hparameters)
A.shape

(1000, 16, 16, 4)

# now we are going to flatten each image to pass through fully connected layer

In [155]:
# Keep the example axis and let NumPy infer the flattened feature length,
# so the cell does not depend on a fixed batch size or feature count.
A = A.reshape(A.shape[0], -1)
A.shape

(1000, 1024)

# fully connected layer

In [156]:
def softmax(x):
    """Return the softmax of the scores in x.

    The maximum of x is subtracted before exponentiating, and the result is
    normalised by the sum over all of its elements.
    """
    shifted = x - np.max(x)
    exps = np.exp(shifted)
    total = exps.sum()
    return exps / total
def fully_connected(A,w,b):
    """Fully connected layer followed by a softmax on every leading-axis entry.

    Computes ``np.dot(A, w) + b`` and replaces each entry along the first
    axis of the result with its softmax.

    Returns
    -------
    The softmax outputs, and the cache ``(A, w, b)`` holding the layer's
    original input together with its weights and bias.
    """
    layer_input = A
    scores = np.dot(layer_input, w) + b
    for row in range(scores.shape[0]):
        scores[row] = softmax(scores[row])
    return scores, (layer_input, w, b)

# initialising weights and bias for final layer

In [157]:
# Uniform [0, 1) weights (1024 x 10) and biases (1000 x 10) for the final layer.
w, b = np.random.rand(1024, 10), np.random.rand(1000, 10)

# Now our architecture is complete. The fully connected layer returns 1000 arrays, each holding the probabilities of the 10 possible classes; the class with the maximum probability is the class to which the image belongs.

In [158]:
# Class probabilities for every example, plus the layer cache.
Output, cache = fully_connected(A=A, w=w, b=b)
Output.shape

(1000, 10)

# cost function to calculate the cost

In [159]:
def compute_cost(AL, Y):
    """Cross-entropy cost between predicted probabilities and 0/1 labels.

    cost = -(1 / m) * sum(Y * log(AL) + (1 - Y) * log(1 - AL)),
    with m = Y.shape[1].

    Parameters
    ----------
    AL : predicted probabilities, same shape as ``Y``.
    Y : labels with at least two axes.

    Returns
    -------
    cost : the scalar cost.
    """
    # NOTE(review): m is read from axis 1 of Y. The layers in this file emit
    # outputs shaped (examples, classes), for which the example count would
    # be Y.shape[0] -- confirm which layout the caller passes.
    m = Y.shape[1]

    # Keep probabilities strictly inside (0, 1): log(0) is -inf, and
    # 0 * -inf turns the whole cost into NaN.
    eps = 1e-15
    AL = np.clip(AL, eps, 1 - eps)

    # Compute loss from AL and Y.
    cost = (-1 / m) * np.sum(np.multiply(Y, np.log(AL)) + np.multiply(1 - Y, np.log(1 - AL)))

    cost = np.squeeze(cost)      # To make sure the cost is a scalar (e.g. this turns [[17]] into 17).

    return cost

# Backpropagation for the fully connected layer

In [160]:
def backward_propFClayer(dZ, cache):
    """Backward pass of the linear part of the fully connected layer.

    Parameters
    ----------
    dZ : gradient with respect to the layer's linear output,
         shape (examples, outputs).
    cache : tuple (A_prev, W, b); A_prev has shape (examples, inputs) and
            W has shape (inputs, outputs).

    Returns
    -------
    dA_prev : gradient with respect to A_prev, same shape as A_prev.
    dW : gradient with respect to W, averaged over the examples.
    db : gradient with respect to b, same shape as b.
    """
    A_prev, W, b = cache
    # Examples lie along axis 0 (A_prev is multiplied as A_prev . W).
    m = A_prev.shape[0]

    dW = np.dot(A_prev.T, dZ) / m

    if np.shape(b) == dZ.shape:
        # One bias per example and output: the gradient keeps dZ's shape.
        db = dZ / m
    else:
        # Bias shared across examples: accumulate over the example axis.
        db = (np.sum(dZ, axis=0) / m).reshape(np.shape(b))

    dA_prev = np.dot(dZ, W.T)

    assert (dA_prev.shape == A_prev.shape)
    assert (dW.shape == W.shape)
    assert (db.shape == b.shape)

    return dA_prev, dW, db

In [164]:
# NOTE(review): the layer output itself is passed as dZ here; a gradient
# derived from labels is presumably intended -- confirm.
da, dw, db = backward_propFClayer(dZ=Output, cache=cache)

# In the above block we implemented backpropagation and obtained dA, dw and db; we can use them to reduce the cost by updating the weights and biases:
# w = w - (learning_rate)*dw
# b = b - (learning_rate)*db

# Backpropagation for the convolution layer, to learn its weights

In [165]:
def conv_backward(dZ, cache):
    """Backward pass of a convolution layer.

    Parameters
    ----------
    dZ : gradient with respect to the convolution output, with axes
         (examples, n_H, n_W, output channels).
    cache : tuple (A_prev, W, b, hparameters) as returned by conv_forward;
            hparameters provides "stride" and "pad".

    Returns
    -------
    dA_prev : gradient with respect to A_prev, same shape as A_prev.
    dW : gradient with respect to W, same shape as W.
    db : gradient with respect to the biases, shape (1, 1, 1, output channels).
    """
    # Retrieve information from "cache"
    (A_prev, W, b, hparameters) = cache

    # Retrieve dimensions from A_prev's shape
    (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape

    # Retrieve dimensions from W's shape
    (f, f, n_C_prev, n_C) = W.shape

    # Retrieve information from "hparameters"
    stride = hparameters["stride"]
    pad = hparameters["pad"]

    # Retrieve dimensions from dZ's shape
    (m, n_H, n_W, n_C) = dZ.shape

    # Initialize dA_prev, dW, db with the correct shapes
    dA_prev = np.zeros((m, n_H_prev, n_W_prev, n_C_prev))
    dW = np.zeros((f, f, n_C_prev, n_C))
    db = np.zeros((1, 1, 1, n_C))

    # Pad A_prev and dA_prev
    A_prev_pad = zero_pad(A_prev, pad)
    dA_prev_pad = zero_pad(dA_prev, pad)

    for i in range(m):                       # loop over the training examples

        # select ith training example from A_prev_pad and dA_prev_pad
        a_prev_pad = A_prev_pad[i]
        da_prev_pad = dA_prev_pad[i]

        for h in range(n_H):                   # vertical axis of the output volume
            # Window corners must step by the stride, exactly as in the
            # forward pass, so each gradient lands on the window that
            # produced the corresponding output.
            vert_start = h * stride
            vert_end = vert_start + f
            for w in range(n_W):               # horizontal axis of the output volume
                horiz_start = w * stride
                horiz_end = horiz_start + f

                # Slice of the padded input seen by this output position
                a_slice = a_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :]

                for c in range(n_C):           # channels of the output volume
                    # Update gradients for the window and the filter's parameters
                    da_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :] += W[:,:,:,c] * dZ[i, h, w, c]
                    dW[:,:,:,c] += a_slice * dZ[i, h, w, c]
                    db[:,:,:,c] += dZ[i, h, w, c]

        # Strip the padding with explicit end indices: a slice written as
        # pad:-pad is empty when pad == 0.
        dA_prev[i, :, :, :] = da_prev_pad[pad:pad + n_H_prev, pad:pad + n_W_prev, :]

    # Making sure your output shape is correct
    assert(dA_prev.shape == (m, n_H_prev, n_W_prev, n_C_prev))

    return dA_prev, dW, db

In [168]:
# NOTE(review): z, the (in-place rectified) convolution output, is passed as
# dZ here; an upstream gradient is presumably intended -- confirm.
dA, dw, db = conv_backward(dZ=z, cache=cache_conv)

# In the above block we implemented backpropagation and obtained dA, dw and db; we can use them to reduce the cost by updating the weights and biases:
# w = w - (learning_rate)*dw
# b = b - (learning_rate)*db

# We will run the above code for n iterations to make the cost as small as possible