# Machine Learning Project

## Binary classification based on a 3-layer neural network

### [ Implementation ]

#### (1) Libraries and Global variables

In [47]:
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import torchvision
import torch
import os

# Global variables

# Root folders of the two image sets (read through torchvision's ImageFolder).
train_data_path = './horse-or-human/train'
validation_data_path = './horse-or-human/validation'

# Network shape and optimisation hyper-parameters.
layer_dims = [10000, 50, 5, 1]  # units per layer: flattened input, two hidden layers, output
learning_rate = 0.005           # gradient-descent step size applied at every epoch
max_epoch = 2500                # number of epochs (full-batch iterations) to train for

In [48]:
def initialize_inputs(image_path) :
    """
    Load an image folder into a feature matrix and a label row vector.

    Every image is converted to a single-channel (grayscale) tensor, flattened
    into one column, and the columns are stacked side by side in loader order.

    Arguments:
    image_path -- root directory handed to torchvision.datasets.ImageFolder

    Returns:
    images -- float tensor of shape (pixels per image, number of images)
    labels -- float tensor of shape (1, number of images) holding the class
              index that ImageFolder attached to each image

    Raises:
    ValueError -- if the loader yields no images at all
    """
    # Grayscale() reduces the channel dimension from 3 to 1, e.g. [3, 100, 100]
    # becomes [1, 100, 100] (layout is [channel, height, width]).
    transform = transforms.Compose([transforms.Grayscale(), transforms.ToTensor()])
    image_set = torchvision.datasets.ImageFolder(root=image_path, transform=transform)
    loader = torch.utils.data.DataLoader(image_set, batch_size=1, shuffle=False, num_workers=1)

    # Collect the per-image columns first and concatenate once at the end;
    # concatenating inside the loop re-copies the whole matrix for every image.
    columns = []
    targets = []
    for image, label in loader :
        # view(-1, 1) flattens whatever resolution the image has into one column.
        columns.append(image.view(-1, 1))
        targets.append(label.view(1, 1).type(torch.FloatTensor))

    if not columns :
        raise ValueError("no images found under %r" % (image_path,))

    images = torch.cat(columns, dim = 1)
    labels = torch.cat(targets, dim = 1)
    return images, labels

#### (2) Generate input matrix X and output vector Y from the training and validation datasets

In [49]:
# Training set: X holds one flattened image per column, Y the matching labels.
X, Y = initialize_inputs(train_data_path)

# Validation set, loaded the same way.
t_X, t_Y = initialize_inputs(validation_data_path)

#### (3) Implementation - Activation / Parameters / Costs Functions

In [202]:
def initialize_parameters(layer_dims) :
    """
    Create the weight matrices and bias vectors of a fully connected network.

    Arguments:
    layer_dims -- sequence with the number of units in each layer, input layer first

    Returns:
    parameters -- dict with, for every layer l >= 1,
                  'W<l>' : tensor of shape (layer_dims[l], layer_dims[l-1]),
                           standard-normal samples scaled by 0.01
                  'b<l>' : zero tensor of shape (layer_dims[l], 1)
    """
    parameters = {}
    # Walk over consecutive (fan-in, fan-out) pairs; layer numbering starts at 1.
    sizes = zip(layer_dims[:-1], layer_dims[1:])
    for index, (fan_in, fan_out) in enumerate(sizes, start=1) :
        parameters['W%d' % index] = 0.01 * torch.randn(fan_out, fan_in)
        parameters['b%d' % index] = torch.zeros(fan_out, 1)
    return parameters

def sigmoid(Z) :
    """
    Apply the logistic function 1 / (1 + exp(-Z)) element-wise.

    Arguments:
    Z -- tensor of pre-activation values

    Returns:
    A -- tensor with the same shape as Z
    cache -- Z itself, handed back so the backward pass can reuse it
    """
    denominator = 1 + torch.exp(-Z)
    return 1 / denominator, Z

def ReLU(Z) :
    """
    Apply the rectified linear unit max(Z, 0) element-wise.

    Arguments:
    Z -- tensor of pre-activation values

    Returns:
    A -- tensor with the same shape, dtype and device as Z
    cache -- Z itself, handed back so the backward pass can reuse it
    """
    # clamp works directly on Z, so no separate zero tensor has to be created
    # (a freshly created one would always use the default dtype and device,
    # which need not match Z).
    A = torch.clamp(Z, min=0)
    cache = Z
    return A, cache

def cost_computation(A, Y) :
    """
    Compute the mean binary cross-entropy between predictions and labels.

    Arguments:
    A -- tensor of predicted probabilities
    Y -- tensor of labels, broadcastable against A

    Returns:
    cost -- Python float, mean of -Y*log(A) - (1-Y)*log(1-A)
    """
    # log(0) is -inf, and 0 * -inf is NaN, so a prediction that saturates at
    # exactly 0 or 1 would poison the whole mean. Bounding the log terms from
    # below at -100 (the same bound torch.nn.BCELoss uses) keeps the cost finite.
    log_A = torch.clamp(torch.log(A), min=-100)
    log_not_A = torch.clamp(torch.log(1 - A), min=-100)
    return (-Y * log_A - (1 - Y) * log_not_A).mean().item()

def update_parameters(parameters, gradients, learning_rate) :
    """
    Take one gradient-descent step on every weight matrix and bias vector.

    Arguments:
    parameters -- dict with entries 'W<l>' and 'b<l>' for each layer l
    gradients -- dict with the matching entries 'dW<l>' and 'db<l>'
    learning_rate -- step size multiplied onto each gradient

    Returns:
    parameters -- the same dict that was passed in; its tensors are updated in place
    """
    # The dict stores two entries (W and b) per layer.
    num_layers = len(parameters) // 2
    for layer in range(1, num_layers + 1) :
        for name in ("W", "b") :
            key = name + str(layer)
            parameters[key] -= learning_rate * gradients["d" + key]
    return parameters

#### (4) Implementation - Forward Propagation

In [203]:
def forward_compute_Z(A,W,b) :
    """
    Compute the linear part of a layer, Z = W.A + b.

    Arguments:
    A -- activations entering the layer (2-D tensor)
    W -- weight matrix of the layer (2-D tensor)
    b -- bias added to the matrix product

    Returns:
    Z -- the pre-activation tensor
    cache -- the tuple (A, W, b), kept for the backward pass
    """
    linear_cache = (A, W, b)
    return W.mm(A) + b, linear_cache

def forward_compute_activation(A_prev,W,b,activation) :
    """
    Run one full layer forward: linear step followed by its activation.

    Arguments:
    A_prev -- activations coming from the previous layer (or the input data)
    W -- weight matrix of this layer
    b -- bias of this layer
    activation -- name of the activation to apply, "sigmoid" or "ReLU"

    Returns:
    A -- output of the activation function
    cache -- ((A_prev, W, b), Z), everything the backward pass needs for this layer

    Raises:
    ValueError -- if activation is neither "sigmoid" nor "ReLU"
    """
    # Reject unknown names up front; otherwise neither branch below would run
    # and the function would fail later on an unbound local variable.
    if activation not in ("sigmoid", "ReLU") :
        raise ValueError("unsupported activation: %r" % (activation,))

    Z, parameters_cache = forward_compute_Z(A_prev,W,b)

    if activation == "sigmoid" :
        A, activation_cache = sigmoid(Z)
    else :
        A, activation_cache = ReLU(Z)

    cache = (parameters_cache, activation_cache)    # cache = ((A_prev,W,b), Z)
    return A, cache

def forward_propagation(X, parameters) :
    """
    Push the input through every layer of the network.

    All layers except the last use ReLU; the last layer uses sigmoid.

    Arguments:
    X -- input tensor fed to the first layer
    parameters -- dict with entries 'W<l>' and 'b<l>' for each layer l

    Returns:
    AL -- output of the final (sigmoid) layer
    caches -- list with one ((A_prev, W, b), Z) cache per layer, in layer order
    """
    # parameters holds a W and a b entry per layer, hence the division by two.
    num_layers = len(parameters) // 2
    caches = []
    activation = X

    # Hidden layers 1 .. num_layers-1 use ReLU.
    for layer in range(1, num_layers) :
        activation, layer_cache = forward_compute_activation(
            activation, parameters['W%d' % layer], parameters['b%d' % layer], "ReLU")
        caches.append(layer_cache)

    # The output layer uses sigmoid.
    AL, output_cache = forward_compute_activation(
        activation, parameters['W%d' % num_layers], parameters['b%d' % num_layers], "sigmoid")
    caches.append(output_cache)

    return AL, caches

#### (5) Implementation - Backward Propagation

In [204]:
def backward_compute_parameters(dA, cache, activation) :
    """
    Back-propagate through one layer (activation followed by the linear step).

    Arguments:
    dA -- gradient of the cost with respect to this layer's activation output
    cache -- ((A_prev, W, b), Z) as stored during the forward pass
    activation -- name of the layer's activation, "sigmoid" or "ReLU"

    Returns:
    dA_prev -- gradient with respect to the previous layer's activation, same shape as A_prev
    dW -- gradient with respect to W, same shape as W
    db -- gradient with respect to b, one row per unit of the layer (shape (units, 1))

    Raises:
    ValueError -- if activation is neither "sigmoid" nor "ReLU"
    """
    (A_prev,W,b), Z = cache         # cache contains (parameter_cache, activation_cache)
    m = A_prev.size()[1]            # second dimension of A_prev is the number of examples

    if activation == "sigmoid" :
        # Same formula the forward pass uses, so A here matches the forward A exactly.
        A = 1 / (1 + torch.exp(-Z))
        dZ = dA * A * (1-A)         # sigmoid'(Z) = A * (1 - A)

    elif activation == "ReLU" :
        # ReLU'(Z) is 1 where Z > 0 and 0 elsewhere: copy dA, then zero the
        # entries of inactive units. The copy keeps the caller's dA untouched.
        dZ = dA.clone().detach()
        dZ[Z <= 0] = 0

    else :
        raise ValueError("unsupported activation: %r" % (activation,))

    dW = torch.mm(dZ,A_prev.t()) / m
    # Sum over the examples only (dim 1), keeping one entry per unit. Summing
    # over every element would collapse db to a single scalar and give all the
    # biases of the layer the same gradient.
    db = torch.sum(dZ, dim=1, keepdim=True) / m
    dA_prev = torch.mm(W.t(), dZ)
    return dA_prev, dW, db

def backward_propagation(AL, Y, caches) :
    """
    Back-propagate the cross-entropy cost through the whole network.

    The last layer is treated as a sigmoid layer, every earlier layer as ReLU.

    Arguments:
    AL -- output of the forward pass (predicted probabilities)
    Y -- labels; each row is repeated AL.size()[0] times along dim 0 before use
    caches -- list of per-layer caches produced by forward_propagation

    Returns:
    gradients -- dict with, for every layer l (1-based),
                 'dW<l>' / 'db<l>' : gradients of that layer's parameters
                 'dA<l>'           : the dA_prev returned by layer l, i.e. the
                                     gradient that is fed into layer l-1
    """
    gradients = dict()       # contains gradients (dA, dW, db) of every layer
    L = len(caches)          # number of layers
    Y = torch.repeat_interleave(Y,AL.size()[0], dim = 0)

    # A sigmoid output can saturate at exactly 0 or 1, which would make the
    # divisions below produce inf and, after multiplying by sigmoid'(Z) = 0,
    # NaN gradients. Keeping AL a hair inside (0, 1) keeps everything finite;
    # values that are not saturated are left unchanged.
    eps = 1e-7
    AL_safe = torch.clamp(AL, min=eps, max=1 - eps)
    dAL = - torch.div(Y,AL_safe) + torch.div(1-Y, 1-AL_safe)

    # Output layer (sigmoid).
    dA_prev, dW, db = backward_compute_parameters(dAL, caches[L-1], "sigmoid")
    gradients["dA" + str(L)] = dA_prev
    gradients["dW" + str(L)] = dW
    gradients["db" + str(L)] = db

    # Hidden layers (ReLU), from the last hidden layer back to the first.
    for l in reversed(range(L-1)) :
        dA_prev, dW, db = backward_compute_parameters(gradients["dA" + str(l + 2)], caches[l], "ReLU")
        gradients["dA" + str(l + 1)] = dA_prev
        gradients["dW" + str(l + 1)] = dW
        gradients["db" + str(l + 1)] = db

    return gradients

#### (6) Implementation - predict function for accuracy calculation

In [199]:
def predict(X, y, parameters):
    """
    Measure classification accuracy of the network on a labelled data set.

    Arguments:
    X -- input tensor passed to forward_propagation
    y -- labels to compare against (converted to uint8 for the comparison)
    parameters -- dict of network parameters passed to forward_propagation

    Returns:
    accuracy -- percentage (0-100) of outputs whose thresholded value equals the label
    """
    AL, _t = forward_propagation(X, parameters)
    m = AL.size()[1]            # number of examples

    # Threshold at 0.5 and bring both sides to the same integer dtype so the
    # equality test does not depend on implicit bool/uint8 promotion.
    prediction = (AL > 0.5).type(torch.uint8)
    correct = prediction == y.type(torch.uint8)

    accuracy = torch.sum(correct).item() / m * 100
    return accuracy

#### (7) Implementation - 3-layer Neural Network

In [200]:
def Neural_Network_w_3_layers(X,Y,layer_dims,learning_rate,max_epoch, *, return_costs=False) :
    """
    Train the network with full-batch gradient descent.

    Each epoch runs a forward pass, records the cost, runs a backward pass and
    updates the parameters.

    Arguments:
    X -- training inputs
    Y -- training labels
    layer_dims -- number of units in each layer, input layer first
    learning_rate -- gradient-descent step size
    max_epoch -- number of epochs to run
    return_costs -- keyword-only; when True the per-epoch costs are returned as well

    Returns:
    parameters -- dict of trained parameters (default)
    (parameters, costs) -- when return_costs is True; costs is a list with one
                           float per epoch, measured before that epoch's update
    """
    costs = []
    parameters = initialize_parameters(layer_dims)

    for epoch in range(0,max_epoch) :
        AL, caches = forward_propagation(X, parameters)
        cost = cost_computation(AL, Y)
        gradients = backward_propagation(AL, Y, caches)

        parameters = update_parameters(parameters, gradients, learning_rate)
        costs.append(cost)

    # The cost history used to be thrown away; expose it on request without
    # changing what existing callers receive.
    if return_costs :
        return parameters, costs
    return parameters

In [201]:
# Train with the hyper-parameters declared in the global-variables cell instead
# of repeating their values as literals here.
parameters = Neural_Network_w_3_layers(X, Y, layer_dims, learning_rate, max_epoch)

Initialize W :  torch.Size([50, 10000]) , b : torch.Size([50, 1]) in layer 1 
Initialize W :  torch.Size([5, 50]) , b : torch.Size([5, 1]) in layer 2 
Initialize W :  torch.Size([1, 5]) , b : torch.Size([1, 1]) in layer 3 
---------------------------------- Layer %d ----------------------------------
<<<<<<Forward Propagation>>>>>>
A_prev : torch.Size([10000, 1027]) , W : torch.Size([50, 10000]) , b : torch.Size([50, 1]) , Z : torch.Size([50, 1027])
A_prev : torch.Size([50, 1027]) , W : torch.Size([5, 50]) , b : torch.Size([5, 1]) , Z : torch.Size([5, 1027])
A_prev : torch.Size([5, 1027]) , W : torch.Size([1, 5]) , b : torch.Size([1, 1]) , Z : torch.Size([1, 1027])
backward_compute_parameters
A_prev : torch.Size([5, 1027]) , W : torch.Size([1, 5]) , b : torch.Size([1, 1]) , Z : torch.Size([1, 1027])

dA_prev :  torch.Size([5, 1027])
backward_compute_parameters
A_prev : torch.Size([50, 1027]) , W : torch.Size([5, 50]) , b : torch.Size([5, 1]) , Z : torch.Size([5, 1027])
dZ : 
 torch.Siz

---------------------------------- Layer %d ----------------------------------
<<<<<<Forward Propagation>>>>>>
A_prev : torch.Size([10000, 1027]) , W : torch.Size([50, 10000]) , b : torch.Size([50, 1]) , Z : torch.Size([50, 1027])
A_prev : torch.Size([50, 1027]) , W : torch.Size([5, 50]) , b : torch.Size([5, 1]) , Z : torch.Size([5, 1027])
A_prev : torch.Size([5, 1027]) , W : torch.Size([1, 5]) , b : torch.Size([1, 1]) , Z : torch.Size([1, 1027])
backward_compute_parameters
A_prev : torch.Size([5, 1027]) , W : torch.Size([1, 5]) , b : torch.Size([1, 1]) , Z : torch.Size([1, 1027])

dA_prev :  torch.Size([5, 1027])
backward_compute_parameters
A_prev : torch.Size([50, 1027]) , W : torch.Size([5, 50]) , b : torch.Size([5, 1]) , Z : torch.Size([5, 1027])
dZ : 
 torch.Size([5, 1027])
Z : 
 torch.Size([5, 1027])

dA_prev :  torch.Size([50, 1027])
backward_compute_parameters
A_prev : torch.Size([10000, 1027]) , W : torch.Size([50, 10000]) , b : torch.Size([50, 1]) , Z : torch.Size([50, 1027])


dA_prev :  torch.Size([10000, 1027])
---------------------------------- Layer %d ----------------------------------
<<<<<<Forward Propagation>>>>>>
A_prev : torch.Size([10000, 1027]) , W : torch.Size([50, 10000]) , b : torch.Size([50, 1]) , Z : torch.Size([50, 1027])
A_prev : torch.Size([50, 1027]) , W : torch.Size([5, 50]) , b : torch.Size([5, 1]) , Z : torch.Size([5, 1027])
A_prev : torch.Size([5, 1027]) , W : torch.Size([1, 5]) , b : torch.Size([1, 1]) , Z : torch.Size([1, 1027])
backward_compute_parameters
A_prev : torch.Size([5, 1027]) , W : torch.Size([1, 5]) , b : torch.Size([1, 1]) , Z : torch.Size([1, 1027])

dA_prev :  torch.Size([5, 1027])
backward_compute_parameters
A_prev : torch.Size([50, 1027]) , W : torch.Size([5, 50]) , b : torch.Size([5, 1]) , Z : torch.Size([5, 1027])
dZ : 
 torch.Size([5, 1027])
Z : 
 torch.Size([5, 1027])

dA_prev :  torch.Size([50, 1027])
backward_compute_parameters
A_prev : torch.Size([10000, 1027]) , W : torch.Size([50, 10000]) , b : torch.Size

dA_prev :  torch.Size([10000, 1027])
---------------------------------- Layer %d ----------------------------------
<<<<<<Forward Propagation>>>>>>
A_prev : torch.Size([10000, 1027]) , W : torch.Size([50, 10000]) , b : torch.Size([50, 1]) , Z : torch.Size([50, 1027])
A_prev : torch.Size([50, 1027]) , W : torch.Size([5, 50]) , b : torch.Size([5, 1]) , Z : torch.Size([5, 1027])
A_prev : torch.Size([5, 1027]) , W : torch.Size([1, 5]) , b : torch.Size([1, 1]) , Z : torch.Size([1, 1027])
backward_compute_parameters
A_prev : torch.Size([5, 1027]) , W : torch.Size([1, 5]) , b : torch.Size([1, 1]) , Z : torch.Size([1, 1027])

dA_prev :  torch.Size([5, 1027])
backward_compute_parameters
A_prev : torch.Size([50, 1027]) , W : torch.Size([5, 50]) , b : torch.Size([5, 1]) , Z : torch.Size([5, 1027])
dZ : 
 torch.Size([5, 1027])
Z : 
 torch.Size([5, 1027])

dA_prev :  torch.Size([50, 1027])
backward_compute_parameters
A_prev : torch.Size([10000, 1027]) , W : torch.Size([50, 10000]) , b : torch.Size(


dA_prev :  torch.Size([10000, 1027])
---------------------------------- Layer %d ----------------------------------
<<<<<<Forward Propagation>>>>>>
A_prev : torch.Size([10000, 1027]) , W : torch.Size([50, 10000]) , b : torch.Size([50, 1]) , Z : torch.Size([50, 1027])
A_prev : torch.Size([50, 1027]) , W : torch.Size([5, 50]) , b : torch.Size([5, 1]) , Z : torch.Size([5, 1027])
A_prev : torch.Size([5, 1027]) , W : torch.Size([1, 5]) , b : torch.Size([1, 1]) , Z : torch.Size([1, 1027])
backward_compute_parameters
A_prev : torch.Size([5, 1027]) , W : torch.Size([1, 5]) , b : torch.Size([1, 1]) , Z : torch.Size([1, 1027])

dA_prev :  torch.Size([5, 1027])
backward_compute_parameters
A_prev : torch.Size([50, 1027]) , W : torch.Size([5, 50]) , b : torch.Size([5, 1]) , Z : torch.Size([5, 1027])
dZ : 
 torch.Size([5, 1027])
Z : 
 torch.Size([5, 1027])

dA_prev :  torch.Size([50, 1027])
backward_compute_parameters
A_prev : torch.Size([10000, 1027]) , W : torch.Size([50, 10000]) , b : torch.Size


dA_prev :  torch.Size([10000, 1027])
---------------------------------- Layer %d ----------------------------------
<<<<<<Forward Propagation>>>>>>
A_prev : torch.Size([10000, 1027]) , W : torch.Size([50, 10000]) , b : torch.Size([50, 1]) , Z : torch.Size([50, 1027])
A_prev : torch.Size([50, 1027]) , W : torch.Size([5, 50]) , b : torch.Size([5, 1]) , Z : torch.Size([5, 1027])
A_prev : torch.Size([5, 1027]) , W : torch.Size([1, 5]) , b : torch.Size([1, 1]) , Z : torch.Size([1, 1027])
backward_compute_parameters
A_prev : torch.Size([5, 1027]) , W : torch.Size([1, 5]) , b : torch.Size([1, 1]) , Z : torch.Size([1, 1027])

dA_prev :  torch.Size([5, 1027])
backward_compute_parameters
A_prev : torch.Size([50, 1027]) , W : torch.Size([5, 50]) , b : torch.Size([5, 1]) , Z : torch.Size([5, 1027])
dZ : 
 torch.Size([5, 1027])
Z : 
 torch.Size([5, 1027])

dA_prev :  torch.Size([50, 1027])
backward_compute_parameters
A_prev : torch.Size([10000, 1027]) , W : torch.Size([50, 10000]) , b : torch.Size

A_prev : torch.Size([10000, 1027]) , W : torch.Size([50, 10000]) , b : torch.Size([50, 1]) , Z : torch.Size([50, 1027])
A_prev : torch.Size([50, 1027]) , W : torch.Size([5, 50]) , b : torch.Size([5, 1]) , Z : torch.Size([5, 1027])
A_prev : torch.Size([5, 1027]) , W : torch.Size([1, 5]) , b : torch.Size([1, 1]) , Z : torch.Size([1, 1027])
backward_compute_parameters
A_prev : torch.Size([5, 1027]) , W : torch.Size([1, 5]) , b : torch.Size([1, 1]) , Z : torch.Size([1, 1027])

dA_prev :  torch.Size([5, 1027])
backward_compute_parameters
A_prev : torch.Size([50, 1027]) , W : torch.Size([5, 50]) , b : torch.Size([5, 1]) , Z : torch.Size([5, 1027])
dZ : 
 torch.Size([5, 1027])
Z : 
 torch.Size([5, 1027])

dA_prev :  torch.Size([50, 1027])
backward_compute_parameters
A_prev : torch.Size([10000, 1027]) , W : torch.Size([50, 10000]) , b : torch.Size([50, 1]) , Z : torch.Size([50, 1027])
dZ : 
 torch.Size([50, 1027])
Z : 
 torch.Size([50, 1027])

dA_prev :  torch.Size([10000, 1027])
-------------

dA_prev :  torch.Size([10000, 1027])
---------------------------------- Layer %d ----------------------------------
<<<<<<Forward Propagation>>>>>>
A_prev : torch.Size([10000, 1027]) , W : torch.Size([50, 10000]) , b : torch.Size([50, 1]) , Z : torch.Size([50, 1027])
A_prev : torch.Size([50, 1027]) , W : torch.Size([5, 50]) , b : torch.Size([5, 1]) , Z : torch.Size([5, 1027])
A_prev : torch.Size([5, 1027]) , W : torch.Size([1, 5]) , b : torch.Size([1, 1]) , Z : torch.Size([1, 1027])
backward_compute_parameters
A_prev : torch.Size([5, 1027]) , W : torch.Size([1, 5]) , b : torch.Size([1, 1]) , Z : torch.Size([1, 1027])

dA_prev :  torch.Size([5, 1027])
backward_compute_parameters
A_prev : torch.Size([50, 1027]) , W : torch.Size([5, 50]) , b : torch.Size([5, 1]) , Z : torch.Size([5, 1027])
dZ : 
 torch.Size([5, 1027])
Z : 
 torch.Size([5, 1027])

dA_prev :  torch.Size([50, 1027])
backward_compute_parameters
A_prev : torch.Size([10000, 1027]) , W : torch.Size([50, 10000]) , b : torch.Size(

dA_prev :  torch.Size([10000, 1027])
---------------------------------- Layer %d ----------------------------------
<<<<<<Forward Propagation>>>>>>
A_prev : torch.Size([10000, 1027]) , W : torch.Size([50, 10000]) , b : torch.Size([50, 1]) , Z : torch.Size([50, 1027])
A_prev : torch.Size([50, 1027]) , W : torch.Size([5, 50]) , b : torch.Size([5, 1]) , Z : torch.Size([5, 1027])
A_prev : torch.Size([5, 1027]) , W : torch.Size([1, 5]) , b : torch.Size([1, 1]) , Z : torch.Size([1, 1027])
backward_compute_parameters
A_prev : torch.Size([5, 1027]) , W : torch.Size([1, 5]) , b : torch.Size([1, 1]) , Z : torch.Size([1, 1027])

dA_prev :  torch.Size([5, 1027])
backward_compute_parameters
A_prev : torch.Size([50, 1027]) , W : torch.Size([5, 50]) , b : torch.Size([5, 1]) , Z : torch.Size([5, 1027])
dZ : 
 torch.Size([5, 1027])
Z : 
 torch.Size([5, 1027])

dA_prev :  torch.Size([50, 1027])
backward_compute_parameters
A_prev : torch.Size([10000, 1027]) , W : torch.Size([50, 10000]) , b : torch.Size(

A_prev : torch.Size([10000, 1027]) , W : torch.Size([50, 10000]) , b : torch.Size([50, 1]) , Z : torch.Size([50, 1027])
A_prev : torch.Size([50, 1027]) , W : torch.Size([5, 50]) , b : torch.Size([5, 1]) , Z : torch.Size([5, 1027])
A_prev : torch.Size([5, 1027]) , W : torch.Size([1, 5]) , b : torch.Size([1, 1]) , Z : torch.Size([1, 1027])
backward_compute_parameters
A_prev : torch.Size([5, 1027]) , W : torch.Size([1, 5]) , b : torch.Size([1, 1]) , Z : torch.Size([1, 1027])

dA_prev :  torch.Size([5, 1027])
backward_compute_parameters
A_prev : torch.Size([50, 1027]) , W : torch.Size([5, 50]) , b : torch.Size([5, 1]) , Z : torch.Size([5, 1027])
dZ : 
 torch.Size([5, 1027])
Z : 
 torch.Size([5, 1027])

dA_prev :  torch.Size([50, 1027])
backward_compute_parameters
A_prev : torch.Size([10000, 1027]) , W : torch.Size([50, 10000]) , b : torch.Size([50, 1]) , Z : torch.Size([50, 1027])
dZ : 
 torch.Size([50, 1027])
Z : 
 torch.Size([50, 1027])

dA_prev :  torch.Size([10000, 1027])
-------------


dA_prev :  torch.Size([10000, 1027])
---------------------------------- Layer %d ----------------------------------
<<<<<<Forward Propagation>>>>>>
A_prev : torch.Size([10000, 1027]) , W : torch.Size([50, 10000]) , b : torch.Size([50, 1]) , Z : torch.Size([50, 1027])
A_prev : torch.Size([50, 1027]) , W : torch.Size([5, 50]) , b : torch.Size([5, 1]) , Z : torch.Size([5, 1027])
A_prev : torch.Size([5, 1027]) , W : torch.Size([1, 5]) , b : torch.Size([1, 1]) , Z : torch.Size([1, 1027])
backward_compute_parameters
A_prev : torch.Size([5, 1027]) , W : torch.Size([1, 5]) , b : torch.Size([1, 1]) , Z : torch.Size([1, 1027])

dA_prev :  torch.Size([5, 1027])
backward_compute_parameters
A_prev : torch.Size([50, 1027]) , W : torch.Size([5, 50]) , b : torch.Size([5, 1]) , Z : torch.Size([5, 1027])
dZ : 
 torch.Size([5, 1027])
Z : 
 torch.Size([5, 1027])

dA_prev :  torch.Size([50, 1027])
backward_compute_parameters
A_prev : torch.Size([10000, 1027]) , W : torch.Size([50, 10000]) , b : torch.Size


dA_prev :  torch.Size([10000, 1027])
---------------------------------- Layer %d ----------------------------------
<<<<<<Forward Propagation>>>>>>
A_prev : torch.Size([10000, 1027]) , W : torch.Size([50, 10000]) , b : torch.Size([50, 1]) , Z : torch.Size([50, 1027])
A_prev : torch.Size([50, 1027]) , W : torch.Size([5, 50]) , b : torch.Size([5, 1]) , Z : torch.Size([5, 1027])
A_prev : torch.Size([5, 1027]) , W : torch.Size([1, 5]) , b : torch.Size([1, 1]) , Z : torch.Size([1, 1027])
backward_compute_parameters
A_prev : torch.Size([5, 1027]) , W : torch.Size([1, 5]) , b : torch.Size([1, 1]) , Z : torch.Size([1, 1027])

dA_prev :  torch.Size([5, 1027])
backward_compute_parameters
A_prev : torch.Size([50, 1027]) , W : torch.Size([5, 50]) , b : torch.Size([5, 1]) , Z : torch.Size([5, 1027])
dZ : 
 torch.Size([5, 1027])
Z : 
 torch.Size([5, 1027])

dA_prev :  torch.Size([50, 1027])
backward_compute_parameters
A_prev : torch.Size([10000, 1027]) , W : torch.Size([50, 10000]) , b : torch.Size


dA_prev :  torch.Size([10000, 1027])
---------------------------------- Layer %d ----------------------------------
<<<<<<Forward Propagation>>>>>>
A_prev : torch.Size([10000, 1027]) , W : torch.Size([50, 10000]) , b : torch.Size([50, 1]) , Z : torch.Size([50, 1027])
A_prev : torch.Size([50, 1027]) , W : torch.Size([5, 50]) , b : torch.Size([5, 1]) , Z : torch.Size([5, 1027])
A_prev : torch.Size([5, 1027]) , W : torch.Size([1, 5]) , b : torch.Size([1, 1]) , Z : torch.Size([1, 1027])
backward_compute_parameters
A_prev : torch.Size([5, 1027]) , W : torch.Size([1, 5]) , b : torch.Size([1, 1]) , Z : torch.Size([1, 1027])

dA_prev :  torch.Size([5, 1027])
backward_compute_parameters
A_prev : torch.Size([50, 1027]) , W : torch.Size([5, 50]) , b : torch.Size([5, 1]) , Z : torch.Size([5, 1027])
dZ : 
 torch.Size([5, 1027])
Z : 
 torch.Size([5, 1027])

dA_prev :  torch.Size([50, 1027])
backward_compute_parameters
A_prev : torch.Size([10000, 1027]) , W : torch.Size([50, 10000]) , b : torch.Size

dA_prev :  torch.Size([10000, 1027])
---------------------------------- Layer %d ----------------------------------
<<<<<<Forward Propagation>>>>>>
A_prev : torch.Size([10000, 1027]) , W : torch.Size([50, 10000]) , b : torch.Size([50, 1]) , Z : torch.Size([50, 1027])
A_prev : torch.Size([50, 1027]) , W : torch.Size([5, 50]) , b : torch.Size([5, 1]) , Z : torch.Size([5, 1027])
A_prev : torch.Size([5, 1027]) , W : torch.Size([1, 5]) , b : torch.Size([1, 1]) , Z : torch.Size([1, 1027])
backward_compute_parameters
A_prev : torch.Size([5, 1027]) , W : torch.Size([1, 5]) , b : torch.Size([1, 1]) , Z : torch.Size([1, 1027])

dA_prev :  torch.Size([5, 1027])
backward_compute_parameters
A_prev : torch.Size([50, 1027]) , W : torch.Size([5, 50]) , b : torch.Size([5, 1]) , Z : torch.Size([5, 1027])
dZ : 
 torch.Size([5, 1027])
Z : 
 torch.Size([5, 1027])

dA_prev :  torch.Size([50, 1027])
backward_compute_parameters
A_prev : torch.Size([10000, 1027]) , W : torch.Size([50, 10000]) , b : torch.Size(

---------------------------------- Layer %d ----------------------------------
<<<<<<Forward Propagation>>>>>>
A_prev : torch.Size([10000, 1027]) , W : torch.Size([50, 10000]) , b : torch.Size([50, 1]) , Z : torch.Size([50, 1027])
A_prev : torch.Size([50, 1027]) , W : torch.Size([5, 50]) , b : torch.Size([5, 1]) , Z : torch.Size([5, 1027])
A_prev : torch.Size([5, 1027]) , W : torch.Size([1, 5]) , b : torch.Size([1, 1]) , Z : torch.Size([1, 1027])
backward_compute_parameters
A_prev : torch.Size([5, 1027]) , W : torch.Size([1, 5]) , b : torch.Size([1, 1]) , Z : torch.Size([1, 1027])

dA_prev :  torch.Size([5, 1027])
backward_compute_parameters
A_prev : torch.Size([50, 1027]) , W : torch.Size([5, 50]) , b : torch.Size([5, 1]) , Z : torch.Size([5, 1027])
dZ : 
 torch.Size([5, 1027])
Z : 
 torch.Size([5, 1027])

dA_prev :  torch.Size([50, 1027])
backward_compute_parameters
A_prev : torch.Size([10000, 1027]) , W : torch.Size([50, 10000]) , b : torch.Size([50, 1]) , Z : torch.Size([50, 1027])


dA_prev :  torch.Size([10000, 1027])
---------------------------------- Layer %d ----------------------------------
<<<<<<Forward Propagation>>>>>>
A_prev : torch.Size([10000, 1027]) , W : torch.Size([50, 10000]) , b : torch.Size([50, 1]) , Z : torch.Size([50, 1027])
A_prev : torch.Size([50, 1027]) , W : torch.Size([5, 50]) , b : torch.Size([5, 1]) , Z : torch.Size([5, 1027])
A_prev : torch.Size([5, 1027]) , W : torch.Size([1, 5]) , b : torch.Size([1, 1]) , Z : torch.Size([1, 1027])
backward_compute_parameters
A_prev : torch.Size([5, 1027]) , W : torch.Size([1, 5]) , b : torch.Size([1, 1]) , Z : torch.Size([1, 1027])

dA_prev :  torch.Size([5, 1027])
backward_compute_parameters
A_prev : torch.Size([50, 1027]) , W : torch.Size([5, 50]) , b : torch.Size([5, 1]) , Z : torch.Size([5, 1027])
dZ : 
 torch.Size([5, 1027])
Z : 
 torch.Size([5, 1027])

dA_prev :  torch.Size([50, 1027])
backward_compute_parameters
A_prev : torch.Size([10000, 1027]) , W : torch.Size([50, 10000]) , b : torch.Size

KeyboardInterrupt: 

In [None]:
# Scratch check: repeat_interleave along dim 0 copies the single row of `a`
# once per row of `b`, so `a` ends up with the same shape as `b`.
b = torch.zeros(4, 5)
a = torch.randn(1, 5)

row_count = b.size(0)
a = torch.repeat_interleave(a, row_count, dim=0)
print(a.size())
print(a)



In [None]:
def relu_backward(dA, cache):
    """
    Backward pass of a single ReLU unit.

    Arguments:
    dA -- gradient with respect to the ReLU output, of any shape
    cache -- the pre-activation 'Z' saved during the forward pass

    Returns:
    dZ -- gradient of the cost with respect to Z: a copy of dA with the
          entries where Z <= 0 set to 0
    """
    pre_activation = cache

    # Work on a copy so the caller's dA is left untouched.
    grad = np.array(dA, copy=True)

    # Units that were inactive in the forward pass (Z <= 0) pass no gradient.
    inactive = pre_activation <= 0
    grad[inactive] = 0

    assert (grad.shape == pre_activation.shape)

    return grad

"""
=================================================================================================
"""


def linear_backward(dZ, cache):
    """
    Backward pass of the linear part (Z = W.A_prev + b) of one layer.

    Arguments:
    dZ -- gradient of the cost with respect to the layer's linear output
    cache -- tuple (A_prev, W, b) saved during the forward pass of this layer

    Returns:
    dA_prev -- gradient with respect to the previous layer's activation, same shape as A_prev
    dW -- gradient with respect to W, same shape as W
    db -- gradient with respect to b, same shape as b
    """
    A_prev, W, b = cache
    # Averaging factor: one over the number of columns (examples) of A_prev.
    scale = 1. / A_prev.shape[1]

    dA_prev = np.dot(W.T, dZ)
    dW = scale * np.dot(dZ, A_prev.T)
    db = scale * np.sum(dZ, axis=1, keepdims=True)

    # Every gradient must have the shape of the quantity it belongs to.
    for grad, reference in ((dA_prev, A_prev), (dW, W), (db, b)):
        assert (grad.shape == reference.shape)

    return dA_prev, dW, db

"""
=================================================================================================
"""

def linear_activation_backward(dA, cache, activation):
    """
    Backward pass of one LINEAR->ACTIVATION layer.

    Arguments:
    dA -- post-activation gradient for the current layer l
    cache -- tuple (linear_cache, activation_cache) saved during the forward pass
    activation -- activation used in this layer, as a text string: "sigmoid" or "relu"

    Returns:
    dA_prev -- gradient of the cost with respect to the activation of layer l-1, same shape as A_prev
    dW -- gradient of the cost with respect to W of layer l, same shape as W
    db -- gradient of the cost with respect to b of layer l, same shape as b

    Raises:
    ValueError -- if activation is neither "relu" nor "sigmoid"
    """
    linear_cache, activation_cache = cache

    # Only the activation derivative differs between the two layer kinds; the
    # linear step afterwards is shared.
    if activation == "relu":
        dZ = relu_backward(dA, activation_cache)
    elif activation == "sigmoid":
        # NOTE(review): sigmoid_backward is not defined in the visible part of
        # this file -- confirm it is provided elsewhere.
        dZ = sigmoid_backward(dA, activation_cache)
    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError("unsupported activation: %r" % (activation,))

    return linear_backward(dZ, linear_cache)

"""
=================================================================================================
"""

def L_model_backward(AL, Y, caches):
    """
    Backward pass for the [LINEAR->RELU] * (L-1) -> LINEAR -> SIGMOID network.

    Arguments:
    AL -- output of the forward propagation (predicted probabilities)
    Y -- true labels; reshaped to AL's shape before use
    caches -- list of per-layer caches: indexes 0 .. L-2 belong to the "relu"
              layers, index L-1 to the final "sigmoid" layer

    Returns:
    grads -- dictionary with, for every layer l from 1 to L,
             grads["dA" + str(l)], grads["dW" + str(l)], grads["db" + str(l)]
             as returned by linear_activation_backward for that layer
    """
    grads = {}
    num_layers = len(caches)
    m = AL.shape[1]  # not used below; retained from the original implementation
    labels = Y.reshape(AL.shape)

    # Derivative of the cross-entropy cost with respect to AL.
    dAL = - (np.divide(labels, AL) - np.divide(1 - labels, 1 - AL))

    # Output layer (sigmoid).
    output_cache = caches[num_layers - 1]
    upstream, dW, db = linear_activation_backward(dAL, output_cache, activation = "sigmoid")
    grads["dA" + str(num_layers)] = upstream
    grads["dW" + str(num_layers)] = dW
    grads["db" + str(num_layers)] = db

    # Hidden layers (relu), walking from layer L-1 back to layer 1. Each step
    # consumes the dA produced by the layer after it.
    for layer in range(num_layers - 1, 0, -1):
        upstream, dW, db = linear_activation_backward(upstream, caches[layer - 1], activation = "relu")
        grads["dA" + str(layer)] = upstream
        grads["dW" + str(layer)] = dW
        grads["db" + str(layer)] = db

    return grads