# CS 155 Miniproject 1

In [92]:
import matplotlib.pyplot as plt
import numpy as np

import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import Conv2D, MaxPooling2D, Flatten, BatchNormalization
from keras import regularizers

def load_data(filename, skiprows = 1):
    """
    Function loads data stored in the file filename and returns it as a numpy ndarray.
    
    Inputs:
        filename: given as a string.
        
    Outputs:
        Data contained in the file, returned as a numpy ndarray
    """
    return np.loadtxt(filename, skiprows=skiprows, delimiter=',')

Load the data and divide it into training and validation sets:

In [93]:
X = load_data("train_2008.csv")
N = len(X)

data = X[:, 3:-1]
label = X[:, -1]

train_percent = 70.
train_size = int(N * train_percent / 100)

x_train = data[0:train_size]
y_train = label[0:train_size]
x_test = data[train_size:]
y_test = label[train_size:]

In [94]:
# print(N)
# print(train_size)
print(np.shape(X))
# print(X)

(64667, 383)


Normalize the Data

In [95]:
print(x_train)

[[  1. 201.   0. ...   0.   0.   0.]
 [  1. 201.   0. ...   0.   0.   0.]
 [  1.   1.   0. ...   0.   0.   0.]
 ...
 [  2. 201.   0. ...   0.   0.   0.]
 [  1. 201.   0. ...   0.   0.   0.]
 [  1.   1.   0. ...   0.   0.   0.]]


In [96]:
# One-hot encode the labels.
y_train = keras.utils.np_utils.to_categorical(y_train)
y_test = keras.utils.np_utils.to_categorical(y_test)

In [97]:
# don't forget to NORMALIZE
train_mean_array = np.zeros(len(x_train[0]))
train_std_array = np.zeros(len(x_train[0]))
for j in range(len(x_train[0])):
    train_mean_array[j] = np.mean(x_train[:,j])
    train_std_array[j] = np.std(x_train[:,j])
    x_train[:,j] = np.divide(x_train[:,j] - train_mean_array[j], train_std_array[j])
    x_test[:,j] = np.divide(x_test[:,j] - np.mean(x_test[:,j]), np.std(x_test[:,j])) # use train metrics here?

  import sys
  


In [98]:
print(x_train)

[[-0.61795939  0.38451965 -0.07889334 ... -0.10360887 -0.10328175
  -0.10426029]
 [-0.61795939  0.38451965 -0.07889334 ... -0.10360887 -0.10328175
  -0.10426029]
 [-0.61795939 -2.60061808 -0.07889334 ... -0.10360887 -0.10328175
  -0.10426029]
 ...
 [ 0.57602003  0.38451965 -0.07889334 ... -0.10360887 -0.10328175
  -0.10426029]
 [-0.61795939  0.38451965 -0.07889334 ... -0.10360887 -0.10328175
  -0.10426029]
 [-0.61795939 -2.60061808 -0.07889334 ... -0.10360887 -0.10328175
  -0.10426029]]


In [99]:
print(y_train)

[[1. 0.]
 [1. 0.]
 [1. 0.]
 ...
 [1. 0.]
 [1. 0.]
 [1. 0.]]


In [100]:
## Create the model here given the constraints in the problem.
model = Sequential()
model.add(Dense(400))
model.add(Activation('relu'))
model.add(Dropout(0.1))
model.add(Dense(200))
model.add(Activation('relu'))
model.add(Dropout(0.1))
model.add(Dense(400))
model.add(Activation('relu'))
model.add(Dense(2))
model.add(Activation('softmax'))

In [101]:
# For a multi-class classification problem
model.compile(loss='mse',
               optimizer='rmsprop', metrics=['accuracy'])

In [102]:
fit = model.fit(x_train, y_train, batch_size=64, epochs=20,
    verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [103]:
# why don't we take a look at the layers and outputs
# note: `None` in the first dimension means it can take any batch_size!
for i in range(len(model.layers)):
    layer = model.layers[i]
    print(layer)
    print(layer.output_shape)

<keras.layers.core.Dense object at 0x0000021793D0DE48>
(None, 400)
<keras.layers.core.Activation object at 0x0000021793D0DC88>
(None, 400)
<keras.layers.core.Dropout object at 0x0000021793D0D898>
(None, 400)
<keras.layers.core.Dense object at 0x0000021793D0D978>
(None, 200)
<keras.layers.core.Activation object at 0x0000021793D0D9B0>
(None, 200)
<keras.layers.core.Dropout object at 0x0000021793D0DA20>
(None, 200)
<keras.layers.core.Dense object at 0x0000021793D0DB00>
(None, 400)
<keras.layers.core.Activation object at 0x0000021793D0D780>
(None, 400)
<keras.layers.core.Dense object at 0x0000021793D0D710>
(None, 2)
<keras.layers.core.Activation object at 0x0000021793D0D828>
(None, 2)


In [104]:
# our model has some # of parameters:
model.count_params()

313402

In [105]:
## Printing a summary of the layers and weights in the model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_17 (Dense)             (None, 400)               152000    
_________________________________________________________________
activation_17 (Activation)   (None, 400)               0         
_________________________________________________________________
dropout_9 (Dropout)          (None, 400)               0         
_________________________________________________________________
dense_18 (Dense)             (None, 200)               80200     
_________________________________________________________________
activation_18 (Activation)   (None, 200)               0         
_________________________________________________________________
dropout_10 (Dropout)         (None, 200)               0         
_________________________________________________________________
dense_19 (Dense)             (None, 400)               80400     
__________

In [106]:
# note that our model outputs two eval params:
# 1. loss (categorical cross-entropy)
# 2. accuracy
model.metrics_names

['loss', 'acc']

In [107]:
## Printing the accuracy of the model, according to the loss function specified in model.compile above.
score = model.evaluate(x=x_train, y=y_train, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 0.1892175907183119
Test accuracy: 0.7465426589519553


In [108]:
## Printing the accuracy of the model, according to the loss function specified in model.compile above.
score = model.evaluate(x=x_test, y=y_test, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 0.19237427000016086
Test accuracy: 0.7401164888469259
