# Machine Learning Project

## Binary classification based on 3 layers neural network

### [ Implementation ]

#### (1) Libraries and Global variables

In [194]:
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import torchvision
import torch
import math
import os

# Global Variables
train_data_path = './horse-or-human/train'
validation_data_path = './horse-or-human/validation'

layer_dims = [10000,50,10,1]     # number of units(Neurons) in each layer
learning_rate = 0.02            # step size per each epoch (iteration)
threshold = 0.1                   # minimum of cost
max_epoch = 2000                  # maximum number of epoch (iteration)

In [195]:
def initialize_inputs(image_path) :
    transform = transforms.Compose([transforms.Grayscale(),transforms.ToTensor(),])
    # the code transforms.Grayscale() is for changing the size [3,100,100] to [1, 100, 100]
    # (notice : [channel, height, width] )
    image_set = torchvision.datasets.ImageFolder(root=image_path, transform=transform)
    loader = torch.utils.data.DataLoader(image_set, batch_size=1, shuffle=False, num_workers=1)  

    for i,data in enumerate(loader) :
        image, label = data
        image = image.view(10000,1)
        label = label.view(1,1).type(torch.FloatTensor)
        
        if i == 0 :
            images = image
            labels = label
        else :
            images = torch.cat((images,image),dim = 1)
            labels = torch.cat((labels,label),dim = 1)
    
    return images, labels

#### (2) Generate Input matrix X and Output vector Y from training and Validation Dataset

In [196]:
X,Y = initialize_inputs(train_data_path)
t_X, t_Y = initialize_inputs(validation_data_path)

#### (3) Implementation - Activation / Parameters / Costs Functions

In [197]:
def initialize_parameters(layer_dims) :
    parameters = dict()
    L = len(layer_dims)    # number of layers in the NN
    
    for l in range(1,L) :
        parameters['W' + str(l)] = torch.randn(layer_dims[l],layer_dims[l-1]) * 0.01
        parameters['b' + str(l)] = torch.zeros(layer_dims[l],1)
    return parameters

def sigmoid(Z) :
    A = 1 / (1 + torch.exp(-Z))
    cache = Z
    return A, cache

def ReLU(Z) :
    zeros = torch.zeros(Z.size())
    A = torch.max(Z,zeros)
    cache = Z
    return A, cache

def cost_computation(A, Y) :
    cost = ((-torch.mm(Y,torch.log(A).t()) - torch.mm(1-Y,torch.log(1-A).t())) / A.size()[1]).item()
    return cost

def update_parameters(parameters, gradients, learning_rate) :
    L = len(parameters) // 2 
    for l in range(L):
        parameters["W" + str(l+1)] -= learning_rate * gradients["dW" + str(l+1)]
        parameters["b" + str(l+1)] -= learning_rate * gradients["db" + str(l+1)]      
    return parameters

#### (4) Implementation - Forward Propagation

In [198]:
def forward_compute_Z(A,W,b) :
    Z = torch.mm(W,A) + b
    cache = (A,W,b)
    
    return Z, cache

def forward_compute_activation(A_prev,W,b,activation) :
    
    Z, parameters_cache = forward_compute_Z(A_prev,W,b)
    
    if activation == "sigmoid" :
        A, activation_cache = sigmoid(Z)
    elif activation == "ReLU" :
        A, activation_cache = ReLU(Z)

    cache = (parameters_cache, activation_cache)    # cache = ((A_prev,W,b), Z)
    return A, cache

def forward_propagation(X, parameters) :
    
    caches = []                # contains Z and (A,W,b) for each layers for backward propagation
                               # [0 ~ L-2] : ReLU(Matrices), [L-1] : Sigmoid(Vectors)
    A = X                      # initialize A as input X
    L = len(parameters) // 2   # number of layers in NN : the reason divide length by 2 is 
                               # because it contains W and b for each layer seperately 
                               # as parameters['W_l'] and parameters['b_l']
    # Forward propagation through hidden layers (1 <= l < L) - Use ReLU as activation function
    for l in range(1,L) :      
        A_prev = A
        W = parameters['W' + str(l)]
        b = parameters['b' + str(l)]
        A, cache = forward_compute_activation(A_prev, W,b, "ReLU")
        (A_prev,W,b), Z = cache
#        print("<<Layer %d>>" % l)
#        print(A[1])
#        print("A :", A[1], "\nW :", W[1], "\nb :", b[1], "\nZ :",Z[1])
#        print("A_prev :",A_prev.size(), ", W :", W.size(), ", b :", b.size(), ", Z :", Z.size())
        caches.append(cache)

    # Forward propagation at the output layer - Use sigmoid as activation function
    AL, cache = forward_compute_activation(A, parameters['W' + str(L)], parameters['b' + str(L)], "sigmoid")
#    (A_prev,W,b), Z = cache
#    print("A_prev :",A_prev.size(), ", W :", W.size(), ", b :", b.size(), ", Z :", Z.size())
#    print("<<Layer %d>>" % L)
#    print(AL)
#    print(AL)
#    print(parameters['W' + str(L)],"\n", parameters['b' + str(L)])

    caches.append(cache)
            
    return AL, caches

#### (5) Implementation - Backward Propagation

In [199]:
def backward_compute_parameters(dA, cache, activation) :
    (A_prev,W,b), Z = cache         # cache contains (parameter_cache, activation_cache)
                                    # parameter_cache is (A_prev, W, b) and activation_cache is Z
    m = A_prev.size()[1]            # A_prev.size() = (layer_dims[l-2], # of examples)

#    print("backward_compute_parameters")
#    print("A_prev :",A_prev.size(), ", W :", W.size(), ", b :", b.size(), ", Z :", Z.size())
    
    if activation == "sigmoid" :
        A, _t = sigmoid(Z)          # A = sigmoid(Z), sigmoid`(Z) = A(1-A)
        dZ = dA * A * (1-A)         # dZ = dA * g`(Z), g(Z) = sigmoid(Z)
        #print(dZ)

    elif activation == "ReLU" :
        dZ = dA.clone().detach()    # g(Z) = ReLU(Z), g`(Z) = 0 for Z < 0 g`(Z) = Z for Z >= 0
#        print("dZ : \n",dZ.size())
#        print("Z : \n", Z.size())

        dZ[Z <= 0] = 0              # dZ = dA * g`(Z) ==> dZ = 0 for Z <= 0 and Z for Z >= 0
        
    dW = torch.mm(dZ,A_prev.t()) / m
    db = torch.sum(dZ) / m
    dA_prev = torch.mm(W.t(), dZ)
#    print("\ndA_prev : ",dA_prev.size())
    #print(db)
    return dA_prev, dW, db

def backward_propagation(AL, Y, caches) :
    gradients = dict()       # contains gradients(dA, dW, db) for each epoch (iteration)
    L = len(caches)          # number of layers
    m = AL.size()[1]         # number of examples (AL.size() = (layer_dims[L-1], # of examples)))
    Y = torch.repeat_interleave(Y,AL.size()[0], dim = 0)

    dAL = - torch.div(Y,AL) + torch.div(1-Y, 1-AL)
    crnt_cache = caches[L-1]
    t1,t2,t3 = backward_compute_parameters(dAL, crnt_cache, "sigmoid")
    gradients["dA" + str(L)] = t1
    gradients["dW" + str(L)] = t2
    gradients["db" + str(L)] = t3
    
    for l in reversed(range(L-1)) :
        crnt_cache = caches[l]
        t1,t2,t3 = backward_compute_parameters(gradients["dA" + str(l + 2)], crnt_cache, "ReLU")
        gradients["dA" + str(l + 1)] = t1
        gradients["dW" + str(l + 1)] = t2
        gradients["db" + str(l + 1)] = t3
        
    return gradients

#### (6) Implementation - predict function for accuracy calculation

In [200]:
def predict(X, y, parameters):
    
    AL, _t = forward_propagation(X, parameters)
    correct = torch.zeros(y.size())
    m = AL.size()[1]

    prediction = AL > 0.5
    correct = prediction == y.type(torch.uint8)
    
    accuracy = torch.sum(correct).item() / m * 100
    return accuracy

#### (7) Implementation - 3_layer Neural Network

In [201]:
def Neural_Network_w_3_layers(X,Y,t_X,t_Y,layer_dims,learning_rate,threshold,max_epoch) :
    
    costs = []
    t_costs = []
    parameters = initialize_parameters(layer_dims)
    
    for epoch in range(0,max_epoch) :
#        print("========================== [ Epoch : %d ] ==========================" % epoch)
        AL, caches = forward_propagation(X, parameters)
        t_AL, _t = forward_propagation(t_X, parameters)
        
        cost = cost_computation(AL, Y)
        t_cost = cost_computation(t_AL, t_Y)
        
        gradients = backward_propagation(AL, Y, caches)
        if epoch % 100 == 0 :
            print("Cost in %d-th iteration : %.6f" % (epoch, cost))
#            print(gradients.items())

        
        parameters = update_parameters(parameters, gradients, learning_rate)
        costs.append(cost)
        t_costs.append(t_cost)
    
        if cost <= threshold :    break
        else :                    pass

    return costs,t_costs, parameters

In [None]:
costs, t_costs, parameters = Neural_Network_w_3_layers(X,Y,t_X,t_Y,layer_dims,learning_rate,threshold,max_epoch)

Cost in 0-th iteration : 0.693144
Cost in 100-th iteration : 0.692910
Cost in 200-th iteration : 0.692817
Cost in 300-th iteration : 0.692768
Cost in 400-th iteration : 0.692726
Cost in 500-th iteration : 0.692675
Cost in 600-th iteration : 0.692594
Cost in 700-th iteration : 0.692454
Cost in 800-th iteration : 0.692186
Cost in 900-th iteration : 0.691616
Cost in 1000-th iteration : 0.690202
Cost in 1100-th iteration : 0.685719
Cost in 1200-th iteration : 0.666428
Cost in 1300-th iteration : 0.594936


In [None]:
pred_train = predict(X, Y, parameters)
print(pred_train)

pred_test = predict(t_X, t_Y, parameters)
print(pred_test)


In [None]:
plt.plot([x for x in range(len(costs))],costs)
plt.show()
plt.plot([x for x in range(len(t_costs))],t_costs)
plt.show()