# The XOR problem with neural networks
**inspired by this article from towardsdatascience: https://towardsdatascience.com/how-neural-networks-solve-the-xor-problem-59763136bdd7**

*After reading this article, I became quite interested in how neural networks work and how this kind of problem can occur.*

Of course, there is no dataset here, since the data comes directly from the XOR gate.

In [None]:
import numpy
import plotly
import pandas

In [129]:
class Sigmoid:
    """Logistic sigmoid activation, exposed as stateless static methods.

    The methods are static so they work both on the class (``Sigmoid.function``)
    and on an instance (``Sigmoid().function``).
    """

    @staticmethod
    def function(data):
        """Return ``1 / (1 + exp(-data))``, computed element-wise by numpy."""
        return 1 / (1 + numpy.exp(-data))

    @staticmethod
    def derivative(data):
        """Return the sigmoid derivative ``s - s**2`` with ``s = function(data)``."""
        # Evaluate the sigmoid once and reuse it for both terms.
        activated = Sigmoid.function(data)
        return activated - activated**2

class ReLU:
    """Rectified linear unit activation, exposed as stateless static methods.

    The methods are static so they work both on the class (``ReLU.function``)
    and on an instance (``ReLU().function``).
    """

    @staticmethod
    def function(data):
        """Return ``max(data, 0)`` element-wise."""
        return numpy.maximum(data, 0)

    @staticmethod
    def derivative(data):
        """Return 1 where ``data > 0`` and 0 elsewhere (including at 0)."""
        # Multiplying the boolean mask by 1 converts it to integers.
        return (data > 0) * 1

class Mse:
    """Mean squared error loss, exposed as stateless static methods.

    Note the argument order differs between the two methods:
    ``function(data, expected)`` takes the prediction first, while
    ``derivative(y, predicted)`` takes the expected value first.
    """

    # maybe to change
    @staticmethod
    def function(data, expected):
        """Return the sum of squared differences divided by ``data.shape[0]``.

        ``data`` is the prediction and must expose ``shape`` (e.g. a numpy
        array); ``expected`` is the target it is compared against.
        """
        return (1 / data.shape[0]) * numpy.sum((expected - data)**2)

    @staticmethod
    def derivative(y, predicted):
        """Return ``predicted - y``.

        This is the gradient of the squared error up to a constant factor;
        the factor is left out here.
        """
        return predicted - y

class NeuralNet:
    """Fully connected feed-forward network trained by mini-batch gradient descent.

    Parameters:
        layers: sequence of layer sizes, input size first, output size last.
        learning_rate: step size applied to the averaged batch gradient.
        a_fct: activation object exposing ``function(data)`` and
            ``derivative(data)``; it is applied after every layer, including
            the output layer.
        epochs: number of passes over the training data done by ``train``.
        minibatch: number of samples per batch; when ``None`` the batch size
            is derived from the portion given to ``mini_batch``.
        loss: loss object exposing ``function(output, expected)`` and
            ``derivative(expected, output)``; defaults to ``Mse``.

    Attributes:
        nets: list of weight matrices, one per layer, shaped (n_out, n_in).
        biases: list of bias vectors, one per layer, shaped (n_out,).
    """

    def __init__(self, layers, learning_rate, a_fct, epochs, minibatch=None, loss=None):
        self.nets = []
        self.biases = []
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.a_fct = a_fct
        self.minibatch = minibatch
        # The default loss is looked up here, at construction time.
        self.loss = Mse if loss is None else loss
        # Consecutive layer sizes give the (n_in, n_out) of each weight matrix.
        for n_in, n_out in zip(layers, layers[1:]):
            self.nets.append(numpy.random.rand(n_out, n_in))
            self.biases.append(numpy.random.rand(n_out))

    def feed_forward(self, data, training=False):
        """Propagate ``data`` through every layer.

        Returns the activated output of the last layer. With ``training=True``
        returns ``(a, output)`` where ``a[0]`` is the input and ``a[i + 1]`` is
        the pre-activation value of layer ``i``.
        """
        a = [data]
        for weights, bias in zip(self.nets, self.biases):
            data = numpy.dot(data, weights.T) + bias
            if training:
                a.append(data)
            data = self.a_fct.function(data)
        if training:
            return a, data
        return data

    def evaluate(self, test_data):
        """Return the loss averaged over the ``(input, expected)`` pairs of ``test_data``."""
        error = 0
        for ipt, expected in test_data:
            output = self.feed_forward(ipt)
            error += self.loss.function(output, expected)
        return error / len(test_data)

    def _layer_input(self, a, layer):
        """Return what was fed into ``layer``: the raw input for layer 0, otherwise the activated previous layer."""
        if layer == 0:
            return a[0]
        return self.a_fct.function(a[layer])

    def _backprop(self, feed, expected):
        """Return ``(grad_b, grad_w, output)`` for one sample via backpropagation."""
        a, output = self.feed_forward(feed, training=True)
        last = len(self.nets) - 1
        grad_b = [None] * len(self.nets)
        grad_w = [None] * len(self.nets)

        # Error signal of the output layer.
        delta = self.loss.derivative(expected, output) * self.a_fct.derivative(a[-1])
        for layer in range(last, -1, -1):
            if layer != last:
                # Push the error signal back through the following layer's weights.
                delta = numpy.dot(self.nets[layer + 1].T, delta) * self.a_fct.derivative(a[layer + 1])
            grad_b[layer] = delta
            # The raw input must not be activated, even when the first layer
            # is also the output layer (network without hidden layer).
            grad_w[layer] = numpy.outer(delta, self._layer_input(a, layer))
        return grad_b, grad_w, output

    def _apply_gradients(self, grad_b, grad_w, batch_size):
        """Take one gradient-descent step using gradients summed over ``batch_size`` samples."""
        for layer, (g_b, g_w) in enumerate(zip(grad_b, grad_w)):
            self.biases[layer] = self.biases[layer] - self.learning_rate * g_b / batch_size
            self.nets[layer] = self.nets[layer] - self.learning_rate * g_w / batch_size

    def train_sample(self, data):
        """Return ``(delta_b, delta_w)``: gradients summed over the ``(input, expected)`` pairs in ``data``."""
        delta_b = [numpy.zeros(bias.shape) for bias in self.biases]
        delta_w = [numpy.zeros(weights.shape) for weights in self.nets]
        for feed, expected in data:
            grad_b, grad_w, _ = self._backprop(feed, expected)
            for layer in range(len(self.nets)):
                delta_b[layer] += grad_b[layer]
                delta_w[layer] += grad_w[layer]
        return delta_b, delta_w

    def train(self, train_data, test_data):
        """Run ``self.epochs`` epochs and return the list of per-epoch errors on ``test_data``."""
        error_arr = []
        for _ in range(self.epochs):
            self.mini_batch(train_data, 1/5)

            # return error rate
            error_arr.append(self.evaluate(test_data))
        return error_arr

    # Mini-batch gradient descent is a compromise between SGD (update after
    # every training sample) and batch GD (update once per pass over the whole
    # training set).
    def mini_batch(self, train_data, minib_portion): # portion between 0 and 1
        """Run one epoch: visit every sample once, in random order, batch by batch.

        The batch size is ``self.minibatch`` when set, otherwise
        ``int(len(train_data) * minib_portion)`` clamped to at least 1.
        ``train_data`` itself is not reordered.

        Raises:
            ValueError: if ``self.minibatch`` is set to a value below 1.
        """
        nb_of_input = len(train_data)
        k = self.minibatch
        if k is None:
            # A small portion of a tiny dataset would otherwise give a batch size of 0.
            k = max(1, int(nb_of_input * minib_portion))
        elif k < 1:
            raise ValueError("minibatch must be a positive integer")

        # Shuffle indices rather than the caller's sequence.
        order = numpy.random.permutation(nb_of_input)
        shuffled = [train_data[idx] for idx in order]

        # Start at 0 so the final (possibly shorter) batch is trained as well.
        for start in range(0, nb_of_input, k):
            batch = shuffled[start:start + k]
            delta_b, delta_w = self.train_sample(batch)
            self._apply_gradients(delta_b, delta_w, len(batch))
        # epoch finished

    # just to test the gd
    def with_one_sample(self, sample):
        """Do one gradient step on a single ``(input, expected)`` pair and return the loss afterwards.

        Prints the input, the network output before the step, and the expected value.
        """
        (feed, expected) = sample
        print("feed :", feed)
        grad_b, grad_w, data = self._backprop(feed, expected)

        print(data)
        print(expected)

        self._apply_gradients(grad_b, grad_w, 1)

        output = self.feed_forward(feed)
        error = numpy.sum(self.loss.function(output, expected))

        return error

    def get_results(self, test_data):
        """Return the 0/1 predictions (output > 0.5) for every input, concatenated.

        Returns ``None`` when ``test_data`` is empty.
        """
        labels = [(self.feed_forward(ipt) > 0.5).astype(int) for ipt in test_data]
        if not labels:
            return None
        if len(labels) == 1:
            return labels[0]
        # One concatenation at the end instead of one per input.
        return numpy.concatenate(labels)
        
    
# Data scaling method: needed so that each variable is "considered" equally.
# The downside of normalization is that outliers in the data will bias the result; on the other hand,
# if I used standardization I would be assuming that the given data is normally distributed.

def normalization(data):
    """Min-max scale ``data`` to the range [0, 1].

    The minimum and maximum are taken over the whole of ``data``. When every
    value is identical the range is zero, so an all-zero array of the same
    shape is returned instead of dividing by zero.
    """
    data_min = numpy.min(data)
    data_max = numpy.max(data)
    span = data_max - data_min
    # Only the scalar case is special-cased; anything else keeps the plain formula.
    if numpy.ndim(span) == 0 and span == 0:
        return numpy.zeros(numpy.shape(data), dtype=float)
    return (data - data_min) / span


In [142]:
# first dataset

import plotly.express as px
import random

nn1 = NeuralNet([2, 4, 1], 0.1, ReLU, 200, 1) # XOR is easier with an additional (hidden) layer
nn2 = NeuralNet([2, 1], 0.005, ReLU, 1000, 1) # without a hidden layer the error does not go below 0.25
# NOTE(review): nn1 is not trained in this cell although later cells query it — confirm where it is trained.


# Interesting comparison between two activation functions.
# I have changed the sigmoid's learning rate because otherwise we won't see the learning slowdown of that function.
# More explanation can be found at the top of this chapter: http://neuralnetworksanddeeplearning.com/chap3.html

# The four XOR inputs and their expected outputs.
inp = [numpy.array([0, 1]), numpy.array([1, 0]), numpy.array([0, 0]), numpy.array([1, 1])]
out = [numpy.array([1]), numpy.array([1]), numpy.array([0]), numpy.array([0])]

a = list(zip(inp, out))

random.shuffle(a)

# The same four samples are used for both training and evaluation.
train_data = a
test_data = a

# Train once and keep the per-epoch loss: a second train() call would run
# another full set of epochs and throw away its loss history.
loss_history = nn2.train(train_data, test_data)
fig = px.line(loss_history, title='Loss with ReLU')
fig.show()

In [55]:
# Show the first sample, then nn1's raw output next to the expected value
# for the first four samples of test_data.
print(test_data[0][0], test_data[0][1])

for sample_idx in range(4):
    sample = test_data[sample_idx]
    print(f"input = {sample[0]} res = {nn1.feed_forward(sample[0])}, expected ={sample[1]}")

[1 0] [1]
input = [1 0] res = [0.99474452], expected =[1]
input = [0 1] res = [0.99792608], expected =[1]
input = [0 0] res = [0.00968289], expected =[0]
input = [1 1] res = [0.00293091], expected =[0]


In [53]:
# Print the list of (input, expected output) pairs.
print(test_data)

[(array([0, 1]), array([1])), (array([1, 0]), array([1])), (array([0, 0]), array([0])), (array([1, 1]), array([0]))]


In [6]:
# TODO: dig into the XOR gate site, MSE assuming the data is Gaussian, linear separation?, and the "nand", "or", "and" gates

In [58]:
# Notebook display of the (input, expected output) pairs.
test_data

[(array([1, 0]), array([1])),
 (array([0, 1]), array([1])),
 (array([0, 0]), array([0])),
 (array([1, 1]), array([0]))]

In [106]:
# These are the four points the algorithm has to classify.
arr = numpy.array([[x1, x2] for x1 in (0, 1) for x2 in (0, 1)])
fig = px.scatter(x=arr.T[0], y=arr.T[1])
fig.show()

# The two classes are not linearly separable.

# decision boundary

In [135]:
# xx / yy are not defined anywhere else in the notebook, so build them here:
# a 100 x 100 grid over the unit square (0.00 .. 0.99 in steps of 0.01),
# matching the x / y values shown in the displayed table.
grid_axis = numpy.linspace(0, 0.99, 100)
xx, yy = numpy.meshgrid(grid_axis, grid_axis)

r1, r2 = xx.flatten(), yy.flatten()
r1, r2 = r1.reshape((len(r1), 1)), r2.reshape((len(r2), 1))
# horizontal stack vectors to create x1,x2 input for the model
grid = numpy.hstack((r1,r2))
# One row per grid point, columns "x" and "y".
data2 = pandas.DataFrame(grid, columns=["x", "y"])

In [132]:
# Notebook display of the grid points (columns "x" and "y").
data2

Unnamed: 0,x,y
0,0.00,0.00
1,0.01,0.00
2,0.02,0.00
3,0.03,0.00
4,0.04,0.00
...,...,...
9995,0.95,0.99
9996,0.96,0.99
9997,0.97,0.99
9998,0.98,0.99


In [136]:
# Label every grid point with nn2's thresholded prediction. Plain column
# assignment (instead of DataFrame.insert) keeps this cell re-runnable:
# insert raises ValueError once column "c" already exists.
data2["c"] = nn2.get_results(data2[["x", "y"]].to_numpy())

In [137]:
# Notebook display of the grid points together with the predicted class column "c".
data2

Unnamed: 0,x,y,c
0,0.00,0.00,0
1,0.01,0.00,0
2,0.02,0.00,0
3,0.03,0.00,0
4,0.04,0.00,0
...,...,...,...
9995,0.95,0.99,1
9996,0.96,0.99,1
9997,0.97,0.99,1
9998,0.98,0.99,1


In [138]:
# Colour every grid point by the class stored in column "c" of data2.
scatter_options = dict(x="x", y="y", color="c", title="decision boundary")
fig = px.scatter(data2, **scatter_options)
fig.show()

In [140]:
# Same grid as before, this time labelled with nn1's thresholded predictions.
data3 = pandas.DataFrame({"x": grid[:, 0], "y": grid[:, 1]})
nn1_labels = nn1.get_results(data3[["x", "y"]].to_numpy())
data3.insert(2, "c", nn1_labels)

In [141]:
# Colour every grid point by the class stored in column "c" of data3.
fig = px.scatter(
    data3,
    x="x",
    y="y",
    color="c",
    title="decision boundary with better neural network",
)
fig.show()