#### 2 layers neural network without pytorch

In [22]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
# np.set_printoptions(threshold=np.inf)

data = pd.read_csv('./digit-recognizer/train.csv')
# print(data.head())
data = np.array(data)
m, n = data.shape
print(m,n)
# print(data)
np.random.shuffle(data) # shuffle before splitting into dev and training sets (dev set is the validation set)
'''
The shape of "data" is row: 42000, column: 785. It means there are 42000 images, each image has 784 pixels.
since the "data_dev" = data[0:1000], the validation dataset contain 1000 images;
data[0:1000] intend to enumerate the first 1000 rows of data.
"Y_dev" can obtain all label of image(only validation dataset). "X_dev" is only validation dataset.
'''
data_dev = data[0:1000].T
Y_dev = data_dev[0]
# print(Y_dev,Y_dev.shape)
X_dev = data_dev[1:n]
# print(X_dev,X_dev.shape)
X_dev = X_dev / 255.
'''
"data_train" is the training dataset, it contains 41000 images, but the shape is (785, 41000)
"Y_train" is the label of training image. Totally has 41000 images.
"x_train" is the training data and the shape is the (784, 41000)
'''
data_train = data[1000:m].T
print(f"The shape of data_train is: {data_train.shape}")
Y_train = data_train[0]
X_train = data_train[1:n]
print(f"the shape of X_train is:{X_train}")
X_train = X_train / 255.
_,m_train = X_train.shape

print(X_train.shape)
print("---------------------------------------")
'''
"W1" is populated from the uniform distribution over [0.5,0.5), and row: 10, column:784
"b1" is populated from the uniform distribution over [0.5,0.5), and row: 10, column:1
"W2" is populated from the uniform distribution over [0.5,0.5), and row: 10, column:10
"b2" is populated from the uniform distribution over [0.5,0.5), and row: 10, column:1
'''
def init_params():
    W1 = np.random.rand(10, 784) - 0.5
    b1 = np.random.rand(10, 1) - 0.5
    W2 = np.random.rand(10, 10) - 0.5
    b2 = np.random.rand(10, 1) - 0.5
    return W1, b1, W2, b2

def ReLU(Z):
    return np.maximum(Z, 0)

def softmax(Z):
    A = np.exp(Z) / sum(np.exp(Z))
    return A
    
def forward_prop(W1, b1, W2, b2, X):
    Z1 = W1.dot(X) + b1
    A1 = ReLU(Z1)
    Z2 = W2.dot(A1) + b2
    A2 = softmax(Z2)
    return Z1, A1, Z2, A2

def ReLU_deriv(Z):
    return Z > 0

'''
Y.size() is 41000, Y.max() is 9.
one_hot_Y is [41000, 10]
np.arange(Y.size) is [0 1 2 3 4 5 6 ......40998 40999]
'''
def one_hot(Y):
    one_hot_Y = np.zeros((Y.size, Y.max() + 1))
    one_hot_Y[np.arange(Y.size), Y] = 1
    one_hot_Y = one_hot_Y.T
    return one_hot_Y

'''
m is 42000,
'''

def backward_prop(Z1, A1, Z2, A2, W1, W2, X, Y):
    one_hot_Y = one_hot(Y)
    dZ2 = A2 - one_hot_Y
    dW2 = 1 / m * dZ2.dot(A1.T)
    db2 = 1 / m * np.sum(dZ2)
    dZ1 = W2.T.dot(dZ2) * ReLU_deriv(Z1)
    dW1 = 1 / m * dZ1.dot(X.T)
    db1 = 1 / m * np.sum(dZ1)
    return dW1, db1, dW2, db2

def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha):
    W1 = W1 - alpha * dW1
    b1 = b1 - alpha * db1    
    W2 = W2 - alpha * dW2  
    b2 = b2 - alpha * db2    
    return W1, b1, W2, b2

def get_predictions(A2):
    return np.argmax(A2, 0)

def get_accuracy(predictions, Y):
    # print(predictions, Y)
    return np.sum(predictions == Y) / Y.size

def gradient_descent(X, Y, alpha, iterations):
    W1, b1, W2, b2 = init_params()
    for i in range(iterations):
        Z1, A1, Z2, A2 = forward_prop(W1, b1, W2, b2, X)
        dW1, db1, dW2, db2 = backward_prop(Z1, A1, Z2, A2, W1, W2, X, Y)
        W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha)
        if i % 50 == 0:
            print("Iteration: ", i)
            predictions = get_predictions(A2)
            print(get_accuracy(predictions, Y))
    return W1, b1, W2, b2

W1, b1, W2, b2 = gradient_descent(X_train, Y_train, 0.10, 500)

print("---------------------------------------------------------------------")

def make_predictions(X, W1, b1, W2, b2):
    _, _, _, A2 = forward_prop(W1, b1, W2, b2, X)
    predictions = get_predictions(A2)
    return predictions

def test_prediction(index, W1, b1, W2, b2):
    current_image = X_train[:, index, None]
    prediction = make_predictions(X_train[:, index, None], W1, b1, W2, b2)
    label = Y_train[index]
    print("Prediction: ", prediction)
    print("Label: ", label)

dev_predictions = make_predictions(X_dev, W1, b1, W2, b2)
get_accuracy(dev_predictions, Y_dev)



42000 785
(784, 41000)
---------------------------------------
Iteration:  0
0.08426829268292683
Iteration:  50
0.4757073170731707
Iteration:  100
0.6414878048780488
Iteration:  150
0.7106585365853658
Iteration:  200
0.7506585365853659
Iteration:  250
0.7777073170731708
Iteration:  300
0.7985365853658537
Iteration:  350
0.8144390243902438
Iteration:  400
0.8261463414634146
Iteration:  450
0.8355609756097561


0.848

In [5]:
def ReLU_deriv(Z):
    return Z > 0

print(ReLU_deriv(6))

import numpy as np

print(np.random.rand(10, 784))
print(np.random.rand(10, 784).shape)

True
[[7.63524191e-01 6.02502729e-01 6.57998167e-01 8.84426132e-01
  6.94462866e-01 7.37526902e-01 4.24933852e-01 4.30719710e-01
  2.38772278e-01 4.45453939e-01 1.18614804e-01 5.99180830e-01
  2.80341813e-01 2.04993623e-01 2.40017974e-01 6.87233222e-01
  1.87796780e-01 2.05115702e-01 5.12714843e-01 2.49407300e-01
  5.87354480e-01 2.62009130e-02 3.65319041e-01 1.47531172e-01
  5.58188590e-02 8.81816319e-01 9.49363568e-01 4.97647804e-01
  1.81318240e-01 7.82906060e-01 8.82986068e-01 9.75056728e-01
  6.59930785e-01 3.77687587e-01 7.99625202e-01 3.00578081e-01
  3.04088527e-01 5.04596841e-01 7.55685764e-01 7.87545360e-01
  2.91384394e-01 8.99071378e-01 1.91150175e-01 5.46558715e-01
  9.47834035e-01 1.39668223e-01 4.06657056e-01 4.06012110e-01
  1.80348900e-01 4.28884858e-01 4.67730885e-01 6.56355891e-01
  6.75660245e-01 4.06672697e-01 3.67245275e-01 7.53014045e-01
  3.15732991e-01 3.82606235e-01 3.55091408e-01 6.68291771e-01
  2.95413424e-01 9.14536306e-01 6.98889292e-01 7.94088496e-01
  3

In [7]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
# np.set_printoptions(threshold=np.inf)

data = pd.read_csv('./digit-recognizer/train.csv')
# print(data.head())
data = np.array(data)
m, n = data.shape
print(m,n)
# print(data)
np.random.shuffle(data) # shuffle before splitting into dev and training sets (dev set is the validation set)
'''
The shape of "data" is row: 42000, column: 785. It means there are 42000 images, each image has 784 pixels.
since the "data_dev" = data[0:1000], the validation dataset contain 1000 images;
data[0:1000] intend to enumerate the first 1000 rows of data.
"Y_dev" can obtain all label of image(only validation dataset). "X_dev" is only validation dataset.
'''
data_dev = data[0:1000].T
Y_dev = data_dev[0]
# print(Y_dev,Y_dev.shape)
X_dev = data_dev[1:n]
# print(X_dev,X_dev.shape)
X_dev = X_dev / 255.
'''
"data_train" is the training dataset, it contains 41000 images, but the shape is (785, 41000)
"Y_train" is the label of training image. Totally has 41000 images.
"x_train" is the training data and the shape is the (784, 41000)
'''
data_train = data[1000:m].T
print(f"The shape of data_train is: {data_train.shape}")
Y_train = data_train[0]
X_train = data_train[1:n]
print(f"the shape of X_train is:{X_train}")
X_train = X_train / 255.
_,m_train = X_train.shape

print(X_train.shape)
print("---------------------------------------")

42000 785
The shape of data_train is: (785, 41000)
(784, 41000)
---------------------------------------


In [4]:
import numpy as np

def softmax(Z):
    A = np.exp(Z) / sum(np.exp(Z))
    return A

one_hot_Y= [[0,0,0,1,1,0],[0,0,0,0,0,1],[0,1,0,0,0,0],[1,0,1,0,0,0],[0,0,0,0,0,0]]
A2 = softmax([[1,2,3,4,5,6],[2,3,4,5,5,6],[3,4,5,6,5,1],[6,3,6,9,8,4],[1,3,4,2,2,6]])
dZ2 = A2 - one_hot_Y
print(dZ2)
print("--------------------------------------")
print(np.sum(dZ2))


[[ 6.22973401e-03  6.04452338e-02  2.94888201e-02 -9.93736527e-01
  -9.56776054e-01  3.18261201e-01]
 [ 1.69341728e-02  1.64307181e-01  8.01589239e-02  1.70258849e-02
   4.32239462e-02 -6.81738799e-01]
 [ 4.60318541e-02 -5.53366776e-01  2.17894546e-01  4.62811536e-02
   4.32239462e-02  2.14442710e-03]
 [-7.54254949e-02  1.64307181e-01 -4.07701214e-01  9.29581820e-01
   8.68176168e-01  4.30719698e-02]
 [ 6.22973401e-03  1.64307181e-01  8.01589239e-02  8.47668897e-04
   2.15199357e-03  3.18261201e-01]]
--------------------------------------
-3.885780586188048e-16


In [18]:
import numpy as np

a=np.array([[2,3,4],[6,5,7]])
b=np.array([[3,4,6,9],[2,5,6,5],[7,7,8,4]])
answer=a.dot(b)

print(answer)

[[ 40  51  62  49]
 [ 77  98 122 107]]


In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
# np.set_printoptions(threshold=np.inf)

data = pd.read_csv('./digit-recognizer/train.csv')
# print(data.head())
data = np.array(data)
m, n = data.shape
print(m,n)
print("----------------------------------------------------------")
# print(data)
np.random.shuffle(data) # shuffle before splitting into dev and training sets (dev set is the validation set)
'''
The shape of "data" is row: 42000, column: 785. It means there are 42000 images, each image has 784 pixels.
since the "data_dev" = data[0:1000], the validation dataset contain 1000 images;
data[0:1000] intend to enumerate the first 1000 rows of data.
"Y_dev" can obtain all label of image(only validation dataset). "X_dev" is only validation dataset.
'''
data_dev = data[0:1000].T
Y_dev = data_dev[0]
# print(Y_dev,Y_dev.shape)
X_dev = data_dev[1:n]
# print(X_dev,X_dev.shape)
X_dev = X_dev / 255.
'''
"data_train" is the training dataset, it contains 41000 images, but the shape is (785, 41000)
"Y_train" is the label of training image. Totally has 41000 images.
"x_train" is the training data and the shape is the (784, 41000)
'''
data_train = data[1000:m].T
print(f"The shape of data_train is: {data_train.shape}")
Y_train = data_train[0]
np.set_printoptions(threshold=np.inf)
print(f"This is the Y_train: {Y_train}")
print(f"The shape of Y_train is: {Y_train.shape}")
print("-------------------------------------------------------")
print(Y_train.size)
print(Y_train.max() + 1)
print(np.arange(Y_train.size))


42000 785
----------------------------------------------------------
The shape of data_train is: (785, 41000)
This is the Y_train: [8 5 0 2 3 5 0 7 9 2 7 9 1 8 6 0 8 9 8 6 3 0 0 3 7 7 8 3 7 3 0 7 0 5 8 6 1
 4 2 2 1 3 1 9 6 1 8 6 3 7 0 8 9 0 7 6 8 9 7 4 2 1 2 1 9 7 4 9 6 2 0 7 8 2
 7 7 4 7 2 0 8 9 1 6 3 7 8 4 5 7 9 3 0 4 5 7 7 3 4 4 2 4 4 5 2 4 6 5 7 8 2
 8 2 9 4 1 1 1 8 7 5 8 1 4 5 6 4 3 4 2 7 7 6 3 3 0 1 2 9 6 2 0 8 5 3 7 4 7
 3 9 9 5 6 6 9 1 6 2 3 9 4 6 2 6 1 1 2 6 7 7 6 8 1 2 4 3 5 6 7 1 6 8 7 7 9
 6 9 0 5 2 7 7 2 9 6 2 2 9 0 9 9 1 2 9 7 8 2 0 5 9 8 7 9 4 3 0 9 7 4 0 2 9
 1 1 6 3 2 4 2 0 3 1 8 8 0 1 3 3 3 4 2 5 7 4 7 1 0 6 8 1 8 4 3 5 6 8 5 2 1
 0 0 4 0 8 2 4 2 1 0 1 2 8 7 0 2 3 9 7 0 2 0 9 9 1 5 6 6 9 8 1 2 4 4 9 4 7
 3 8 8 1 4 1 8 2 7 3 4 1 3 2 8 6 8 6 7 9 1 7 5 0 9 7 2 8 9 8 4 5 2 8 3 0 2
 9 1 6 6 9 0 1 3 1 1 4 8 4 0 7 2 7 6 7 8 6 5 8 8 2 2 6 1 4 8 3 2 6 3 2 9 8
 3 2 1 6 9 1 8 5 8 7 3 7 4 7 4 5 5 4 2 6 6 0 1 3 4 6 7 0 7 1 2 6 3 0 7 2 7
 4 1 4 4 5 1 1 0 4 3 9 6 9 6 3 4 7 3 9 5 6 7

In [6]:
import numpy as np

def get_predictions(A2):
    return np.argmax(A2, 0)
input = [[1,2,3,4,5,6],[2,3,4,5,5,2],[3,4,5,6,5,1],[6,3,6,9,8,4],[1,3,4,2,2,3]]
print(input)
A2 = softmax(input)
print(A2)
print("----------------------------------")
result = get_predictions(A2)
print(result)

[[1, 2, 3, 4, 5, 6], [2, 3, 4, 5, 5, 2], [3, 4, 5, 6, 5, 1], [6, 3, 6, 9, 8, 4], [1, 3, 4, 2, 2, 3]]
[[6.22973401e-03 6.04452338e-02 2.94888201e-02 6.26347303e-03
  4.32239462e-02 8.26326131e-01]
 [1.69341728e-02 1.64307181e-01 8.01589239e-02 1.70258849e-02
  4.32239462e-02 1.51346910e-02]
 [4.60318541e-02 4.46633224e-01 2.17894546e-01 4.62811536e-02
  4.32239462e-02 5.56774167e-03]
 [9.24574505e-01 1.64307181e-01 5.92298786e-01 9.29581820e-01
  8.68176168e-01 1.11831081e-01]
 [6.22973401e-03 1.64307181e-01 8.01589239e-02 8.47668897e-04
  2.15199357e-03 4.11403556e-02]]
----------------------------------
[3 2 3 3 3 0]
