# Introduction to Neural Networks

**Goal** : This notebook explains the building blocks of a neural network model. 

**Data** : We will be using Kaggle's Otto Product classification data: https://www.kaggle.com/c/otto-group-product-classification-challenge

In [None]:
import numpy as np
import scipy
import pandas as pd

### Load the training and test datasets

In [None]:
# Read the Otto training and test sets from CSV; the paths are relative to
# the directory the notebook is run from.
train_pd = pd.read_csv("../data/otto/train.csv")
test_pd = pd.read_csv("../data/otto/test.csv")

In [None]:
# (number of rows, number of columns) of the training frame
train_pd.shape

In [None]:
# (number of rows, number of columns) of the test frame
test_pd.shape

In [None]:
# Preview the first few rows of the training data
train_pd.head()

In [None]:
# Summary statistics for the columns of the training data
train_pd.describe()

In [None]:
# Distinct values found in the target column
train_pd.target.unique()

In [None]:
# Number of training rows for each target value
train_pd.target.value_counts()

### One-hot encode the target variable. We will use the built-in `preprocessing` module from `sklearn`.

In [None]:
from sklearn import preprocessing

In [None]:
# Raw target labels, one per training row
labels = train_pd.target.values

# Fit a label binarizer on the labels and transform them into indicator
# columns (one column per class when there are more than two classes).
enc = preprocessing.LabelBinarizer()
binarized_labels = enc.fit_transform(labels)

In [None]:
# Inspect the first ten encoded rows
binarized_labels[0:10]

In [None]:
# Keep a single indicator column (index 1) as a binary target, then strip the
# two non-feature columns from the training frame in one call.
# NOTE(review): which class column 1 corresponds to depends on the encoder's
# class ordering -- confirm via enc.classes_.
target = binarized_labels[:, 1]
train_pd = train_pd.drop(["id", "target"], axis=1)

In [None]:
# Use the first 10,000 rows and the first 30 columns as the working sample.
# Rows are selected by position with .iloc so that X[i] and y[i] come from
# the same training row: with the default integer index, the label-based
# .ix[1:10000] started at row 1 and included row 10000, pairing every feature
# row with the previous row's target. (.ix is also deprecated in pandas.)
X = np.array(train_pd.iloc[:10000, :30])
y = np.array(target[:10000])

In [None]:
# Sanity check: features and targets must have the same number of rows.
# The parenthesized single-argument form prints identically on Python 2 and
# is also valid on Python 3.
print(X.shape)
print(y.shape)

In [None]:
import math
import random
import string

In [None]:
# Seed Python's random module so that random draws are reproducible
random.seed(0)

### Function to generate a random number between two given numbers

In [None]:
def rand(a, b):
    """Return a random float scaled from [0, 1) onto the interval from a to b.

    The value is ``(b - a) * u + a`` where ``u`` comes from
    ``random.random()``, so it depends on the state of Python's ``random``
    module.
    """
    span = b - a
    return span * random.random() + a

In [None]:
def makeMatrix(I, J, fill=0.0):
    """Return an I-by-J float matrix with every entry set to `fill`.

    Parameters:
        I: number of rows.
        J: number of columns.
        fill: initial value for every entry (defaults to 0.0).

    Returns:
        A numpy array of shape (I, J) and dtype float.
    """
    # Previously `fill` was accepted but ignored and the matrix was always
    # zeros; the default still yields an all-zero float matrix.
    return np.full((I, J), fill, dtype=float)

### Define our activation function. We will be using the sigmoid function.

In [None]:
def sigmoid(x):
    """Logistic sigmoid activation: 1 / (1 + e^(-x)).

    Accepts a scalar or a numpy array; arrays are transformed element-wise.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

### Derivative of our activation function. We need this when we run the backpropagation algorithm.

In [None]:
def dsigmoid(y):
    """Derivative of the logistic sigmoid, expressed in terms of its output.

    Parameters:
        y: the sigmoid's output value(s), i.e. y = sigmoid(x).

    Returns:
        d(sigmoid)/dx evaluated at the x that produced y, which is y * (1 - y).
    """
    # The previous formula, 1 - y**2, is the derivative of tanh, not of the
    # logistic sigmoid that sigmoid() actually computes.
    return y * (1.0 - y)

### Our neural network class

When we first create a neural network architecture, we need to know the number of input nodes, the number of hidden nodes (this network has a single hidden layer), and the number of output nodes.

The weights have to be randomly initialized.

In [None]:
class NN:
    """Constructor-only view of the network: layer sizes, activations, weights.

    The network has an input layer, one hidden layer and an output layer.
    """

    def __init__(self, ni, nh, no):
        """Set up a network with `ni` inputs, `nh` hidden and `no` output nodes."""
        # Layer sizes; the input layer gets one extra node for the bias.
        self.ni, self.nh, self.no = ni + 1, nh, no

        # Every node activation starts out at 1.0.
        self.ai = [1.0 for _ in range(self.ni)]
        self.ah = [1.0 for _ in range(self.nh)]
        self.ao = [1.0 for _ in range(self.no)]

        # Weight matrices: input->hidden (wi) and hidden->output (wo).
        self.wi = makeMatrix(self.ni, self.nh)
        self.wo = makeMatrix(self.nh, self.no)

        # Randomise the weights: a narrow range for the input layer and a
        # wider one for the output layer.
        for row in range(self.ni):
            for col in range(self.nh):
                self.wi[row][col] = rand(-0.2, 0.2)
        for row in range(self.nh):
            for col in range(self.no):
                self.wo[row][col] = rand(-2.0, 2.0)

        # Most recent weight changes, kept for the momentum term.
        self.ci = makeMatrix(self.ni, self.nh)
        self.co = makeMatrix(self.nh, self.no)

### Function for the Backpropagation Algorithm

After the forward-pass, we need to compute the error for the output. The error is backpropagated to the layers before it. The weights are adjusted, based on how much they contributed to the prediction error. 

In [None]:
def backPropagate(self, targets, N, M):
    """Run one backpropagation step and return the squared-error loss.

    Uses the activations (self.ai, self.ah, self.ao) left behind by the most
    recent forward pass and updates self.wo, self.wi, self.co and self.ci in
    place.

    Parameters:
        targets: sequence of desired outputs, one per output node.
        N: learning rate.
        M: momentum factor applied to the previous weight change.

    Returns:
        0.5 * sum((target - output)**2), computed from the outputs of the
        forward pass (i.e. before this step's weight update).

    Raises:
        ValueError: if len(targets) differs from the number of output nodes.
    """
    if len(targets) != self.no:
        # Single-argument form prints the same on Python 2 and Python 3.
        print(targets)
        raise ValueError('wrong number of target values')

    # calculate error terms for output
    output_deltas = np.zeros(self.no)
    for k in range(self.no):
        error = targets[k] - self.ao[k]
        output_deltas[k] = dsigmoid(self.ao[k]) * error

    # calculate error terms for hidden; this must use the output weights
    # as they were during the forward pass, so it runs before the update
    hidden_deltas = np.zeros(self.nh)
    for j in range(self.nh):
        error = 0.0
        for k in range(self.no):
            error = error + output_deltas[k] * self.wo[j][k]
        hidden_deltas[j] = dsigmoid(self.ah[j]) * error

    # update output weights: gradient step plus momentum from the last change
    for j in range(self.nh):
        for k in range(self.no):
            change = output_deltas[k] * self.ah[j]
            self.wo[j][k] = self.wo[j][k] + N * change + M * self.co[j][k]
            self.co[j][k] = change

    # update input weights the same way
    for i in range(self.ni):
        for j in range(self.nh):
            change = hidden_deltas[j] * self.ai[i]
            self.wi[i][j] = self.wi[i][j] + N * change + M * self.ci[i][j]
            self.ci[i][j] = change

    # calculate error
    error = 0.0
    for k in range(len(targets)):
        error = error + 0.5 * (targets[k] - self.ao[k]) ** 2
    return error

## Now, the full neural network class

In [None]:
class NN:
    """Feed-forward neural network with one hidden layer.

    Hidden and output nodes use the `sigmoid` activation. The network is
    trained one pattern at a time by backpropagation with a momentum term.

    Attributes:
        ni, nh, no: number of input (including one bias node), hidden and
            output nodes.
        ai, ah, ao: most recent activations of each layer.
        wi, wo: input->hidden and hidden->output weight matrices.
        ci, co: most recent weight changes, used for momentum.
        predict: array of outputs stored by `test` (set only after `test`
            has been called).
    """

    def __init__(self, ni, nh, no):
        """Create a network with `ni` inputs, `nh` hidden and `no` output nodes."""
        # number of input, hidden, and output nodes
        self.ni = ni + 1  # +1 for bias node
        self.nh = nh
        self.no = no

        # activations for nodes; activate() never overwrites the last input
        # activation, so it stays at 1.0 and serves as the bias input
        self.ai = [1.0] * self.ni
        self.ah = [1.0] * self.nh
        self.ao = [1.0] * self.no

        # create weights
        self.wi = makeMatrix(self.ni, self.nh)
        self.wo = makeMatrix(self.nh, self.no)

        # set them to random values
        for i in range(self.ni):
            for j in range(self.nh):
                self.wi[i][j] = rand(-0.2, 0.2)
        for j in range(self.nh):
            for k in range(self.no):
                self.wo[j][k] = rand(-2.0, 2.0)

        # last change in weights for momentum
        self.ci = makeMatrix(self.ni, self.nh)
        self.co = makeMatrix(self.nh, self.no)

    def backPropagate(self, targets, N, M):
        """Run one backpropagation step and return the squared-error loss.

        Uses the activations left behind by the most recent `activate` call
        and updates the weights and stored weight changes in place.

        Parameters:
            targets: sequence of desired outputs, one per output node.
            N: learning rate.
            M: momentum factor applied to the previous weight change.

        Returns:
            0.5 * sum((target - output)**2), computed from the outputs of
            the forward pass (before this step's weight update).

        Raises:
            ValueError: if len(targets) differs from the number of outputs.
        """
        if len(targets) != self.no:
            # Single-argument form prints the same on Python 2 and Python 3.
            print(targets)
            raise ValueError('wrong number of target values')

        # calculate error terms for output
        output_deltas = np.zeros(self.no)
        for k in range(self.no):
            error = targets[k] - self.ao[k]
            output_deltas[k] = dsigmoid(self.ao[k]) * error

        # calculate error terms for hidden; this must use the output weights
        # as they were during the forward pass, so it runs before the update
        hidden_deltas = np.zeros(self.nh)
        for j in range(self.nh):
            error = 0.0
            for k in range(self.no):
                error = error + output_deltas[k] * self.wo[j][k]
            hidden_deltas[j] = dsigmoid(self.ah[j]) * error

        # update output weights: gradient step plus momentum
        for j in range(self.nh):
            for k in range(self.no):
                change = output_deltas[k] * self.ah[j]
                self.wo[j][k] = self.wo[j][k] + N * change + M * self.co[j][k]
                self.co[j][k] = change

        # update input weights the same way
        for i in range(self.ni):
            for j in range(self.nh):
                change = hidden_deltas[j] * self.ai[i]
                self.wi[i][j] = self.wi[i][j] + N * change + M * self.ci[i][j]
                self.ci[i][j] = change

        # calculate error
        error = 0.0
        for k in range(len(targets)):
            error = error + 0.5 * (targets[k] - self.ao[k]) ** 2
        return error

    def test(self, patterns):
        """Run a forward pass on every pattern and store the outputs.

        Parameters:
            patterns: iterable of items whose first element (p[0]) is the
                input vector; any further elements are ignored.

        The outputs are stored row by row in `self.predict`, an array of
        shape (number of patterns, number of output nodes).
        """
        # Materialise first so one-shot iterables (e.g. Python 3 zip objects)
        # work with len() below.
        patterns = list(patterns)
        self.predict = np.empty([len(patterns), self.no])
        for i, p in enumerate(patterns):
            self.predict[i] = self.activate(p[0])

    def weights(self):
        """Print the input->hidden and hidden->output weights row by row."""
        print('Input weights:')
        for i in range(self.ni):
            print(self.wi[i])

        print('Output weights:')
        for j in range(self.nh):
            print(self.wo[j])

    def activate(self, inputs):
        """Forward pass: compute and return the output activations.

        Parameters:
            inputs: sequence of input values, one per non-bias input node.

        Returns:
            A copy of the list of output activations.

        Raises:
            ValueError: if len(inputs) does not match the number of
                non-bias input nodes.
        """
        if len(inputs) != self.ni - 1:
            print(inputs)
            raise ValueError('wrong number of inputs')

        # input activations; the final (bias) entry is left untouched
        for i in range(self.ni - 1):
            self.ai[i] = inputs[i]

        # hidden activations
        for j in range(self.nh):
            total = 0.0  # named `total` so the builtin sum() is not shadowed
            for i in range(self.ni):
                total = total + self.ai[i] * self.wi[i][j]
            self.ah[j] = sigmoid(total)

        # output activations
        for k in range(self.no):
            total = 0.0
            for j in range(self.nh):
                total = total + self.ah[j] * self.wo[j][k]
            self.ao[k] = sigmoid(total)

        return self.ao[:]

    def train(self, patterns, iterations=1000, N=0.5, M=0.1):
        """Train the network on `patterns` for a number of passes.

        Parameters:
            patterns: iterable of (inputs, targets) items; `targets` may be
                a scalar (single output node) or a sequence with one value
                per output node.
            iterations: number of passes over all patterns.
            N: learning rate.
            M: momentum factor.

        Prints the summed error of every 100th pass (starting with the
        first).
        """
        # Materialise once so a one-shot iterable can be replayed on every
        # pass.
        patterns = list(patterns)
        for i in range(iterations):
            error = 0.0
            for p in patterns:
                inputs = p[0]
                # Scalars become a length-1 array (what wrapping in a list
                # used to do); vectors pass through, so networks with
                # several output nodes can be trained too.
                targets = np.atleast_1d(p[1])
                self.activate(inputs)
                error = error + self.backPropagate(targets, N, M)
            if i % 100 == 0:
                print('error %-.5f' % error)

## Running the model on the Otto data

In [None]:
# create a network with 30 input, 10 hidden, and one output nodes

# 50 iterations take ~17 mins to train
#n = NN(30, 50, 1)

n = NN(30, 10, 1)
# train it on (features, target) pairs for 10 passes; %timeit is an IPython
# magic, run here exactly once (-n 1 -r 1) to report the wall time
%timeit -n 1 -r 1 n.train(zip(X,y), iterations=10)
# run a forward pass on the same rows; the outputs end up in n.predict
%timeit -n 1 -r 1  n.test(zip(X,y))

## Predict on the training dataset

In [None]:
# Flatten the stored predictions to 1-D and show them side by side with the
# actual targets.
predict = np.ravel(n.predict)
pd.DataFrame(data=np.array([y,predict]).T, columns=["actual", "prediction"])

In [None]:
# Same actual-vs-prediction table as the previous cell (the code is repeated
# verbatim).
predict = np.ravel(n.predict)
pd.DataFrame(data=np.array([y,predict]).T, columns=["actual", "prediction"])