# Redes Neurais
## Preâmbulo

In [1]:
%matplotlib inline
import matplotlib.pyplot as plot
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

import time
import numpy as np
import numpy.random as nr
import pandas as pd
import tensorflow as tf

from keras.models import Sequential
from keras.layers.core import Dense, Flatten
from keras.optimizers import SGD
from keras.callbacks import EarlyStopping, TensorBoard
from keras.utils import np_utils

np.set_printoptions(precision=3, linewidth=100, suppress=True)

Using TensorFlow backend.


## Dataset

In [2]:
# -----------------------------------------------------------------------------------------------------------
# Wine Quality Data Set
# ---------------------
# [https://archive.ics.uci.edu/ml/datasets/Wine+Quality]
#
# 1. Title: Wine Quality 

# 2. Sources
#    Created by: Paulo Cortez (Univ. Minho), Antonio Cerdeira, Fernando Almeida, Telmo Matos and Jose Reis (CVRVV) @ 2009
   
# 3. Past Usage:

#   P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. 
#   Modeling wine preferences by data mining from physicochemical properties.
#   In Decision Support Systems, Elsevier, 47(4):547-553. ISSN: 0167-9236.

#   In the above reference, two datasets were created, using red and white wine samples.
#   The inputs include objective tests (e.g. PH values) and the output is based on sensory data
#   (median of at least 3 evaluations made by wine experts). Each expert graded the wine quality 
#   between 0 (very bad) and 10 (very excellent). Several data mining methods were applied to model
#   these datasets under a regression approach. The support vector machine model achieved the
#   best results. Several metrics were computed: MAD, confusion matrix for a fixed error tolerance (T),
#   etc. Also, we plot the relative importances of the input variables (as measured by a sensitivity
#   analysis procedure).
 
# 4. Relevant Information:

#    The two datasets are related to red and white variants of the Portuguese "Vinho Verde" wine.
#    For more details, consult: http://www.vinhoverde.pt/en/ or the reference [Cortez et al., 2009].
#    Due to privacy and logistic issues, only physicochemical (inputs) and sensory (the output) variables 
#    are available (e.g. there is no data about grape types, wine brand, wine selling price, etc.).

#    These datasets can be viewed as classification or regression tasks.
#    The classes are ordered and not balanced (e.g. there are munch more normal wines than
#    excellent or poor ones). Outlier detection algorithms could be used to detect the few excellent
#    or poor wines. Also, we are not sure if all input variables are relevant. So
#    it could be interesting to test feature selection methods. 

# 5. Number of Instances: red wine - 1599; white wine - 4898. 

# 6. Number of Attributes: 11 + output attribute
  
#    Note: several of the attributes may be correlated, thus it makes sense to apply some sort of
#    feature selection.

# 7. Attribute information:

#    For more information, read [Cortez et al., 2009].

#    Input variables (based on physicochemical tests):
#    1 - fixed acidity
#    2 - volatile acidity
#    3 - citric acid
#    4 - residual sugar
#    5 - chlorides
#    6 - free sulfur dioxide
#    7 - total sulfur dioxide
#    8 - density
#    9 - pH
#    10 - sulphates
#    11 - alcohol
#    Output variable (based on sensory data): 
#    12 - quality (score between 0 and 10)

# 8. Missing Attribute Values: None
# -----------------------------------------------------------------------------------------------------------
import os
import requests
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

wine_data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-{}.csv"
wine_data_file = '../data/winequality-{}.csv'

for red_or_white in ['red', 'white']:
    if not os.path.isfile(wine_data_file.format(red_or_white)):
        # Read wine quality data from UCI website
        req = requests.get(wine_data_url.format(red_or_white))
        open(wine_data_file.format(red_or_white), 'w').write(req.text)

df_white = pd.read_csv("../data/winequality-white.csv", sep=';')
dfw = df_white.as_matrix()

scaler = StandardScaler()
X = scaler.fit_transform(dfw[:,:-1])
y = dfw[:,-1]

m, n = X.shape

# X = np.hstack((np.ones((m, 1)), X))

Xtra, Xval, ytra, yval = train_test_split(X, y.reshape(-1, 1), train_size=0.8, random_state=20170421)

print('Shapes:', Xtra.shape, Xval.shape, ytra.shape, yval.shape)
print('Target: min={:.3f}, mean={:.3f}, max={:.3f}'.format(ytra.min(), ytra.mean(), ytra.max()))

Shapes: (3918, 11) (980, 11) (3918, 1) (980, 1)
Target: min=3.000, mean=5.878, max=9.000


\begin{align*} 
\mathbf{X}_{train} & = \begin{bmatrix}
1 & \longleftarrow & (\mathbf{x}^{(0)})^T & \longrightarrow \\ 
1 & \longleftarrow & (\mathbf{x}^{(1)})^T & \longrightarrow \\ 
 &  & \vdots  & \\ 
1 & \longleftarrow & (\mathbf{x}^{(m-1)})^T & \longrightarrow 
\end{bmatrix} = \begin{bmatrix}
1 & x_0^{(0)} & x_1^{(0)} & x_2^{(0)} & \ldots & x_{11}^{(0)} \\
1 & x_0^{(1)} & x_1^{(1)} & x_2^{(1)} & \ldots & x_{11}^{(1)} \\
 &  & \vdots  & \\ 
1 & x_0^{(3917)} & x_1^{(3917)} & x_2^{(3917)} & \ldots & x_{11}^{(3917)}
\end{bmatrix} \\
\\
\mathbf{y}_{train} & = \begin{bmatrix}
y^{(0)} \\
y^{(1)} \\
\vdots  \\ 
y^{(3917)} \\
\end{bmatrix}
\end{align*}


## Implementação matricial

Nesta implementação da rede neural, o *bias* está incluido na matriz de pesos. A matriz de amostras é aumentada de forma a incluir uma primeira coluna contendo 1s. 

### Equações para regressão linear
Considerando uma amostra apenas (SGD).

#### Para a frente:

\begin{align*} 
\sigma(\mathbf{z}) & = \frac{1}{1+e^{-\mathbf{z}}} \\
\\
\mathbf{a}_{(i)} & = \begin{cases}
\mathbf{x}  & \text{ if } i = 0 \\
\\
\sigma(\mathbf{W}_{(i)} \cdot \mathbf{a}_{(i-1)}) & \text{ if } 1 \leq i \leq L-1 \\
\\
\mathbf{W}_{(i)} \cdot \mathbf{a}_{(i-1)} & \text{ if } i = L 
\end{cases}
\\
\\
J & = \frac{1}{2} (\mathbf{a}_{(L)} - \mathbf{y})^T \cdot (\mathbf{a}_{(L)} - \mathbf{y})
\end{align*}

#### Para trás:

\begin{align*} 
\boldsymbol{\delta}_{(i)} & = \begin{cases}
\mathbf{a}_{(i)} - \mathbf{y} & \text{ if } i = L \\
\\
\mathbf{W}_{(i+1)} \cdot \boldsymbol{\delta}_{(i+1)} \circ \mathbf{a}_{(i)} (1 - \mathbf{a}_{(i)}) & \text{ if } 1 \leq i \leq L-1 
\end{cases}
\\
\\
\mathbf{W}_{(i)} & = \mathbf{W}_{(i)} - \eta \boldsymbol{\delta}_{(i)} \cdot \mathbf{a}_{(i-1)}^{T}
\end{align*}


### Equações para regressão logística
Considerando uma amostra apenas (SGD).

#### Para a frente:

\begin{align*} 
\sigma(\mathbf{z}) & = \frac{1}{1+e^{-\mathbf{z}}} \\
\\
\mathbf{a}_{(i)} & = \begin{cases}
\mathbf{x}  & \text{ if } i = 0 \\
\\
\sigma(\mathbf{W}_{(i)} \cdot \mathbf{a}_{(i-1)}) & \text{ if } 1 \leq i \leq L \\
\end{cases}
\\
\\
J & = - \sum \left (\mathbf{y} \log{(\mathbf{a}_{(L)})} + (1 - \mathbf{y}) \log{(1 - \mathbf{a}_{(L)})} \right )
\end{align*}

#### Para trás:

\begin{align*} 
\boldsymbol{\delta}_{(i)} & = \begin{cases}
\mathbf{a}_{(i)} - \mathbf{y} & \text{ if } i = L \\
\\
\mathbf{W}_{(i+1)} \cdot \boldsymbol{\delta}_{(i+1)} \circ \mathbf{a}_{(i)} (1 - \mathbf{a}_{(i)}) & \text{ if } 1 \leq i \leq L-1 
\end{cases}
\\
\\
\mathbf{W}_{(i)} & = \mathbf{W}_{(i)} - \eta \boldsymbol{\delta}_{(i)} \cdot \mathbf{a}_{(i-1)}^{T}
\end{align*}


### O código

In [3]:
class BackPropNeuralNetwork:
    
    def __init__(self, layer_sizes=[], is_classifier=True):
        self.L = len(layer_sizes)
        self.s = layer_sizes
        self.W = None
        self.classifier = is_classifier

    def init_weights(self, epsilon=None):
        from numpy.random import rand
        self.W = []
        for i in range(self.L-1):
            if epsilon is None:
                eps = np.sqrt(6.0 / (self.s[i] + self.s[i+1]))
            else:
                eps = epsilon
            self.W.append(2*eps*rand(self.s[i+1], self.s[i]+1) - eps)

    def sgd(self, X, y, lr, batch, nepochs):
        r = nepochs / 10
        m, n = X.shape
        n_batches = int(np.ceil(m / batch))
        
        for epoch in range(nepochs):
            for ii in range(n_batches):
                kk = batch * ii
                X_batch, y_batch = X[kk:kk+batch], y[kk:kk+batch]
                
                cost, G = self.compute_cost_and_gradient(self.W, X, y)
                for i in range(len(self.W)):
                    self.W[i] -= lr * G[i]
                    
            if not epoch % r:
                print(('{:4d} Cost: {:.5f}'.format(epoch, cost)))
        print(('{:4d} Cost: {:.5f}'.format(epoch, cost)))
    
    def predict(self, X):
        a = self.compute_activations(self.W, X)
        return a[-1]
    
    def compute_cost_and_gradient(self, W, X, y):
        M, N  = X.shape
        # Gradients
        G = [None for n in self.s[:-1]]
        # Forward propagation
        a = self.compute_activations(W, X)
        # Back propagation
        d = self.compute_errors(W, a, y)
        # Cost computation
        if self.classifier:
            # classifier: binary cross-entropy
            J = - (y * np.log(a[-1]) + (1 - y) * np.log(1 - a[-1])).sum() / M
        else:
            # regressor: mean squared error
            J = 0.5 * np.square(y - a[-1]).sum() / M
        for j in range(self.L-1):
            # gradient computation
            G[j] = np.dot(d[j+1].T, a[j]) / M
        return J, G

    def compute_activations(self, W, X):
        a = [None for n in self.s]
        a[0] = X
        for j in range(1, self.L):
            a[j-1] = np.insert(a[j-1], 0, 1, 1)
            z = np.dot(a[j-1], W[j-1].T)
            if j == self.L-1 and not self.classifier:
                a[j] = z
            else:
                a[j] = self.logistic(z)
        return a            

    def compute_errors(self, W, a, y):
        d = [None for n in self.s]
        d[-1] = a[-1] - y
        for j in range(self.L-2, 0, -1):
            d[j] = np.dot(d[j+1], W[j]) * a[j] * (1 - a[j])
            d[j] = d[j][:,1:]
        return d

    @staticmethod
    def logistic(z):
        z = np.asarray(z)
        z = np.minimum(z,  15)
        z = np.maximum(z, -15)
        return np.ones(z.shape)/(1.0 + np.exp(-z))


### Teste com gradiente descendente

In [4]:
layer_sizes = [n, 40, 20, 1]
nepochs = 1000

nnet = BackPropNeuralNetwork(layer_sizes, is_classifier=False)
nnet.init_weights();

try:
    t0 = time.time()
    nnet.sgd(Xtra, ytra, 0.2, 4128, nepochs)
    t1 = time.time()
except KeyboardInterrupt:
    pass

print('\nTrained in {:2f}s'.format(t1-t0))
yhat = nnet.predict(Xval)
mse = np.square(yhat - yval).mean()
print('\nMSE:', mse)
print()
print('y_hat', yhat[:10,0])
print('y_val', yval[:10,0])
print()
acc = 100.0 * np.where(np.abs(yhat - yval) < 0.5, 1, 0).sum() / yval.shape[0]
print('Accuracy: {:.2f}%'.format(acc))

   0 Cost: 19.81936
 100 Cost: 0.29999
 200 Cost: 0.28705
 300 Cost: 0.28495
 400 Cost: 0.28396
 500 Cost: 0.28314
 600 Cost: 0.28233
 700 Cost: 0.28149
 800 Cost: 0.28061
 900 Cost: 0.27968
 999 Cost: 0.27869

Trained in 16.796289s

MSE: 0.536059832829

y_hat [ 5.641  6.569  6.229  5.35   6.035  5.511  6.47   5.11   6.244  6.014]
y_val [ 6.  6.  7.  5.  6.  8.  7.  5.  8.  6.]

Accuracy: 56.12%


## Keras

In [5]:
estop = EarlyStopping(patience=200, verbose=1)

def build():
    model = Sequential()
    model.add(Dense(layer_sizes[1], activation='sigmoid', input_shape=(layer_sizes[0],)))
    model.add(Dense(layer_sizes[2], activation='sigmoid'))
    model.add(Dense(layer_sizes[3], activation=None))
    return model

nnet = build()
nnet.summary()

opt = SGD(lr=0.1, momentum=0.9)
nnet.compile(loss="mean_squared_error", optimizer=opt)    

try:
    t0 = time.time()
    histo2 = nnet.fit(Xtra, ytra, batch_size=516, epochs=1000, verbose=0, 
                      validation_data=(Xval, yval), callbacks=[estop])
    t1 = time.time()
except KeyboardInterrupt:
    pass

print('\nTrained in {:2f}s'.format(t1-t0))
print()
yhat = nnet.predict(Xval, verbose=0)
mse = np.square(yhat - yval).mean()
print('MSE:', mse)
print()
print('y_hat', yhat[:10,0])
print('y_val', yval[:10,0])
print()
acc = 100.0 * np.where(np.abs(yhat - yval) < 0.5, 1, 0).sum() / yval.shape[0]
print('Accuracy: {:.2f}%'.format(acc))


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 40)                480       
_________________________________________________________________
dense_2 (Dense)              (None, 20)                820       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 21        
Total params: 1,321
Trainable params: 1,321
Non-trainable params: 0
_________________________________________________________________
Epoch 00312: early stopping

Trained in 3.534593s

MSE: 0.520362105834

y_hat [ 5.811  6.795  6.884  5.403  5.63   6.778  6.96   5.481  6.318  6.481]
y_val [ 6.  6.  7.  5.  6.  8.  7.  5.  8.  6.]

Accuracy: 54.69%


In [6]:
yy = np.vstack((ytra, yval)).astype(np.int32)
ymin, ymax = yy.min(), yy.max()
ncat = ymax - ymin + 1

ytra_cat = np_utils.to_categorical(ytra.astype(np.int32) - ymin, ncat)
yval_cat = np_utils.to_categorical(yval.astype(np.int32) - ymin, ncat)

estop = EarlyStopping(patience=200, verbose=1)
tb = TensorBoard(log_dir='../logs')

def build():
    model = Sequential()
    model.add(Dense(40, activation='relu', input_shape=(layer_sizes[0] - 1,)))
    model.add(Dense(20, activation='relu'))
    model.add(Dense(ncat, activation='softmax'))
    return model

nnet = build()
nnet.summary()

opt = SGD(lr=0.1, momentum=0.9)
nnet.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])    

Xt = Xtra[:, 1:]
Xv = Xval[:, 1:]

try:
    t0 = time.time()
    histo2 = nnet.fit(Xtra[:,1:], ytra_cat, batch_size=516, epochs=1000, verbose=0, 
                      validation_data=(Xval[:,1:], yval_cat), callbacks=[estop, tb])
    t1 = time.time()
except KeyboardInterrupt:
    pass

print('\nTrained in {:2f}s'.format(t1-t0))
loss, acc = nnet.evaluate(Xv, yval_cat)
print('\n\nLoss     = {:.5f}\nAccuracy = {:.3f}'.format(loss, 100*acc))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 40)                440       
_________________________________________________________________
dense_5 (Dense)              (None, 20)                820       
_________________________________________________________________
dense_6 (Dense)              (None, 7)                 147       
Total params: 1,407
Trainable params: 1,407
Non-trainable params: 0
_________________________________________________________________
Epoch 00233: early stopping

Trained in 5.114188s
 32/980 [..............................] - ETA: 0s

Loss     = 1.23137
Accuracy = 56.633
