### sigmoid (logistic) activation function 
$$G(\Theta x)=\frac{1}{1+e^{-\Theta^Tx}}$$
vector of activation nodes j
$$a^{(j)}=
g(z^j)=
g(\Theta^{(j-1)}a_{j-1})=
\frac{1}{1+e^{-\Theta^Tx}}$$

### cost function $J(\Theta)$ + regularization
$$J(\theta)= \frac{1}{m}\sum\limits_{i=1}^{m} \sum\limits_{k=1}^K [-y_k^{(i)}\log{((h_\theta(x^{(i)}))_k)}-(1-y_k^{(i)})\log{(1-(h_\Theta(x^{(i)}))_k)}] 
+ \frac{\lambda}{2m} \sum\limits_{l=1}^{L-1} \sum\limits_{i=1}^{s_l} \sum\limits_{j=1}^{s_l-1} (\Theta_{j,i}^{(l)})^2$$

L= total number of layers in the network

sl= number of units (not counting bias unit) in layer l

K= number of output units/classes

m= number of examples

### Regularization
- If you have lots of features and little data - overfitting can be a problem, if  the regularization parameter λ is too big, theta is close to zero this results underfitting
- Keep all features, but reduce magnitude of parameters θ
- Works well when we have a lot of features, each of which contributes a bit to predicting y
- Help to avoid overfitting
##### suggestions
- Do not regularize terms that correspond to the bias

### Hipothesis
$$h_\Theta=a^{(L)}$$
### Backpropagation
$$D^{(l)}_{i,j}=\frac{\partial}{\partial\Theta_{i,j}^{(l)}}J(\Theta)=\frac{1}{m}\sum\limits^m_{t=1}a_j^{(t)(l)}\partial_i^{(t)(l+1)}=\partial^{(l+1)(a^{(l)})^T}$$

$D^{(l)}_{i,j}=\frac{1}{m}(\Delta^{(l)}_{i,j}+\lambda\Theta^{(l)}_{i,j})$ if $j\neq 0$

$D^{(l)}_{i,j}=\frac{1}{m}\Delta^{(l)}_{i,j}$ if $j=0$

L - number of last layer ( how many layers)

$\delta^{(l)}_j$ => error of node j in layer l

$\delta^{(L)}=a^{(L)}-y$ => error of last node

$\delta^{(l)}=((\Theta^{(l)})^T\delta^{(l+1)}.*g'(z^{(l)})$ 

$g'(z^{(l)}=a^{(l)}.*(1-a^{(l)})$


In [1]:
####################################################################################################################
# Machine Learning by Stanford University, using python(numpy,scipy) to implement neural network instead of octave #
####################################################################################################################

import numpy as np
import scipy
from scipy.special import expit as sigmoid #sigmoid function, assigns a number from (0,1) interval, to a real number
from scipy.optimize import fmin_cg #Minimize a function using a nonlinear conjugate gradient algorithm

In [2]:
#def sigmoid(x): #sigmoid function, assigns a number from (0,1) interval, to a real number
#    return 1/(1+np.exp(-x))
#----------------------- replaced by scipy.special.expit (much faster) -----------------------#

In [3]:
#TODO numpy matrixes shouldnt be stored in python list(a and theta)
#cost function, vectorized regularized logistic regression for theta
    #nn_params=neural network parameters => matrix with all $/Theta$ needs to be reshaped
    #layer=> array with number of elements in every layer without bias
def cost_function(nn_params,layer, X, y, lambda1): #theta, X, y => np.matrix()
    m=X.shape[0] #number of training values
    n=X.shape[1] #number of features
    J,grad=0,np.matrix(np.zeros(n).reshape(n,1))
    L=len(layer)-1#index of last layer
    #checking nn_params
    theta_size=[ [layer[i+1],layer[i]+1] for i in xrange(len(layer)-1)]
    #reshaping nn_params to thetas
    theta=[]#table of thetas
    tx=0
    for i in xrange(len(theta_size)):
        lsize=theta_size[i][0]*theta_size[i][1]
        if i==0:
            lsize-=1
        thetax=np.mat(nn_params[0,tx:tx+lsize+1],dtype=np.float64)
        thetax.resize(theta_size[i][0],theta_size[i][1])
        tx+=lsize+1
        theta.append(thetax)
    #feedforward
    a=[X]#adding bias column, a1 and a2
    for i in xrange(len(theta)):
        a[i]=np.hstack((np.ones((a[i].shape[0],1)),a[i]))
        a.append(sigmoid(a[i]*theta[i].T))
    #calculating cost
    J=-np.sum((np.multiply(y,np.log(a[L]))+np.multiply(1-y,np.log(1-a[L]))).flatten())/m
    J+=lambda1*(sum([np.sum(np.multiply(th,th).flatten()) for th in theta]))/(2*m)#adding regularization
    #backpropagation
    
    delta=y-a[L]
    
    
    grad= X.T*(h_theta-y)/m #gradient function + regularization for j=0 (feature 0)
    grad[1:]+=lambda1*theta[1:]/m #gradient function + regularization for j>0 (feature > 0)
    #backpropagation 
    
    return [J,grad]

gradient checking
$$\frac{\partial}{\partial\Theta} J(\Theta) \approx \frac{J(\theta+\epsilon)-J(\Theta-\epsilon)}{2\epsilon}
\\
\frac{\partial}{\partial\Theta_j} J(\Theta) \approx \frac{J(\Theta_1,\ldots,\Theta_j+\epsilon,\ldots,\Theta_n)-J(\Theta_1,\ldots,\Theta_j-\epsilon,\ldots,\Theta_n)}{2\epsilon}$$

In [4]:
#gradient checking, return the approximated gradient
#theta is a vector np.array.shape=(n,1)
def grad_check(theta,epsilon=10**-4):
    n=theta.size
    for i in xrange(n):
        thetaPlus=theta
        thetaPlus(i)=theta(i)+epsilon
        thetaMinus=theta
        thetaMinus(i)=thetaMinus(i)-epsilon
        gradApprox(i)=(cost_function(thetaPlus)[0]-cost_function(thetaMinus)[0])/(2*epsilon)
    return gradApprox

SyntaxError: can't assign to function call (<ipython-input-4-f35593ff7ac2>, line 7)

In [None]:
#Minimize a function using a nonlinear conjugate gradient algorithm
#------------------------------------------------------------------------
#scipy.optimize.fmin_cg(f, x0, fprime=None, args=(), gtol=1e-05, norm=inf, epsilon=1.4901161193847656e-08, 
#                                                    maxiter=None, full_output=0, disp=1, retall=0, callback=None)
#------------------------------------------------------------------------
#f : callable, f(x, *args)
# Objective function to be minimized. Here x must be a 1-D array of the variables that are to be changed in 
# the search for a minimum, and args are the other (fixed) parameters of f.
#------------------------------------------------------------------------
#x0 : ndarray
# A user-supplied initial estimate of xopt, the optimal value of x. It must be a 1-D array of values.
#------------------------------------------------------------------------
#fprime : callable, fprime(x, *args), optional
# A function that returns the gradient of f at x. Here x and args are as described above for f. 
# The returned value must be a 1-D array. Defaults to None, in which case the gradient is approximated numerically 
# (see epsilon, below).
#------------------------------------------------------------------------
#args : tuple, optional
# Parameter values passed to f and fprime. Must be supplied whenever additional fixed parameters 
# are needed to completely specify the functions f and fprime.
#------------------------------------------------------------------------
#reference => http://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.fmin_cg.html#scipy.optimize.fmin_cg

def one_vs_all(X, y, num_labels, lambda1):
    m=X.shape[0] #number of training values
    n=X.shape[1] #number of features
    nk=len(num_labels)#number of classes
    all_theta=np.matrix(np.zeros(nk*(n+1))).reshape(nk,n+1) #nk number of classes, n features + X0 (bias)
    X=np.insert(X,0,1,axis=1)#add column 0 with values 1 (bias)
    for k in num_labels: #for all classes 0-9
        args1=(X, y==k, lambda1)
        initial_theta=np.matrix(np.zeros(n+1)).reshape(n+1,1)
        f1=lambda x,*args: cost_function(x,args[0],args[1],args[2])[0] #minimalize cost function, search theta (x=theta)
        fprime1=lambda x,*args:cost_function(x,args[0],args[1],args[2])[1] #return gradient for given theta (x=theta)
        theta=fmin_cg( f1,x0=initial_theta,fprime=fprime1,args=args1,maxiter=20 )
        #all_theta[k]=theta

    return all_theta

In [None]:
# Predict the label for a trained one-vs-all classifier
def predict_one_vs_all(all_theta, X):
    probability,p=0,0
    return [probability,p]

NEURAL NETWORK STEPS:

1. Pick network architecture:

    -number of hidden features should be the same in each layer, the more the better, more than number of features less than 2*features, default use 1 hidden layer

2. Training a neural network:

    -randomly initialize weigths
    
    -forward propagation
    
    -compute cost function
    
    -backpropagation to compute $$\frac{\partial}{\partial\Theta^{(l)}_{jk}}J(\Theta)$$
    -compute $min_{\Theta} J(\Theta)$

In [1]:
#checking with rand data
lambda1=0.1
num_labels=range(10)
X=np.matrix(np.loadtxt("x"))
y=np.matrix(np.loadtxt("y")).T
m=X.shape[0] #number of training values
n=X.shape[1] #number of features
theta=np.matrix(np.zeros(n+1)).reshape(n+1,1)
#X=np.insert(X,0,1,axis=1)#add column 0 with values 1 (bias) #its made in one-vs-all

#one_vs_all(X, y, num_labels, lambda1)



nk=len(num_labels)#number of classes
all_theta=np.matrix(np.zeros(nk*(n+1))).reshape(nk,n+1) #nk number of classes, n features + X0 (bias)
X=np.insert(X,0,1,axis=1)#add column 0 with values 1 (bias)
for k in num_labels: #for all classes 0-9
    args=(X, y==k, lambda1)
    initial_theta=np.matrix(np.zeros(n+1)).reshape(n+1,1)
    f=lambda x,args: cost_function(x,args[0],args[1],args[2])[0] #minimalize cost function, search theta (x=theta)
    fprime=lambda x,args:cost_function(x,args[0],args[1],args[2])[1] #return gradient for given theta (x=theta)
    #theta=fmin_cg( f1,x0=initial_theta,fprime=fprime1,args=args1,maxiter=20 )
    #all_theta[k]=theta
    if k==2:
        print fprime(initial_theta, args)[10]
        print cost_function(initial_theta,args[0],args[1],args[2])[1][10]
print X.shape,y.shape,theta.shape



NameError: name 'np' is not defined

In [None]:
#test with data from coursera, recognize digits using neural network

#load x,y
a=scipy.io.loadmat("ex4data1.mat")
x=np.asmatrix(a["X"],dtype=np.float64)
y=np.asmatrix(a["y"],dtype=int)
#load theta for 3 layer neural network
a=scipy.io.loadmat("ex4weights.mat")
Theta1=np.asmatrix(a["Theta1"],dtype=np.float64)
Theta2=np.asmatrix(a["Theta2"],dtype=np.float64)
del a
lambda1=0.1

nn_params=np.concatenate((Theta1.flatten(),Theta2.flatten()),axis=1) #convert params to vector 1x [Theta1.size+Theta2.size]
layer=[x.shape[1], 25,  10]
#change y to vector for example for y=1 [0 1 0 0 0 0 0 0 0 0]
y2=np.zeros((5000,10),dtype=np.float64)#5000 examples 10 numbers
for i in xrange(5000):# 0 as 10, indices -1 (data prepared for octave)
    numb=y[i]-1
    y2[i,numb]=1
        
cost_function(nn_params,layer, X, y, lambda1): #theta, X, y => np.matrix()
