# CS 155 Miniproject 1

In [32]:
import matplotlib.pyplot as plt
import numpy as np

import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import Conv2D, MaxPooling2D, Flatten, BatchNormalization
from keras import regularizers

# For the AUC metric
import tensorflow as tf
from sklearn.metrics import roc_auc_score
from sklearn.datasets import make_classification
from keras.utils import np_utils
from keras.callbacks import Callback, EarlyStopping

def load_data(filename, skiprows = 1):
    """
    Function loads data stored in the file filename and returns it as a numpy ndarray.
    
    Inputs:
        filename: given as a string.
        
    Outputs:
        Data contained in the file, returned as a numpy ndarray
    """
    return np.loadtxt(filename, skiprows=skiprows, delimiter=',')

In [33]:
# define roc_callback, inspired by https://github.com/keras-team/keras/issues/6050#issuecomment-329996505
def auc_roc(y_true, y_pred):
    # any tensorflow metric
    value, update_op = tf.contrib.metrics.streaming_auc(y_pred, y_true)

    # find all variables created for this metric
    metric_vars = [i for i in tf.local_variables() if 'auc_roc' in i.name.split('/')[1]]

    # Add metric variables to GLOBAL_VARIABLES collection.
    # They will be initialized for new session.
    for v in metric_vars:
        tf.add_to_collection(tf.GraphKeys.GLOBAL_VARIABLES, v)

    # force to update metric values
    with tf.control_dependencies([update_op]):
        value = tf.identity(value)
        return value

Load the data and divide it into training and validation sets:

In [None]:
X = load_data("train_2008.csv")
N = len(X)

data = X[:, 3:-1]
label = X[:, -1]

train_percent = 70.
train_size = int(N * train_percent / 100)

x_train = data[0:train_size]
y_train = label[0:train_size]
x_validation = data[train_size:]
y_validation = label[train_size:]

In [None]:
# print(N)
# print(train_size)
print(np.shape(X))
# print(X)

Normalize the Data

In [None]:
print(x_train)

In [None]:
# One-hot encode the labels.
y_train = keras.utils.np_utils.to_categorical(y_train)
y_validation = keras.utils.np_utils.to_categorical(y_validation)

class_weight = {0: np.sum(y_train[:,0]),
                1: np.sum(y_train[:,1])}

print(class_weight)

In [None]:
y_train

In [8]:
# don't forget to NORMALIZE
train_mean_array = np.zeros(len(x_train[0]))
train_std_array = np.zeros(len(x_train[0]))
std_nonzero_indices = []
for j in range(len(x_train[0])):
    train_mean_array[j] = np.mean(x_train[:,j])
    train_std_array[j] = np.std(x_train[:,j])
    if train_std_array[j] != 0:
        std_nonzero_indices.append(j)
        x_train[:,j] = \
            np.divide(x_train[:,j] - train_mean_array[j],
                      train_std_array[j])
    if np.std(x_validation[:,j]) != 0:
        x_validation[:,j] = \
            np.divide(x_validation[:,j] - np.mean(x_validation[:,j]),
                      np.std(x_validation[:,j]))

  del sys.path[0]


In [9]:
x_train = x_train[:, std_nonzero_indices]
x_validation = x_validation[:, std_nonzero_indices]

In [10]:
print(x_train)

[[-0.61795939  0.38451965 -0.07889334 ... -0.10360887 -0.10328175
  -0.10426029]
 [-0.61795939  0.38451965 -0.07889334 ... -0.10360887 -0.10328175
  -0.10426029]
 [-0.61795939 -2.60061808 -0.07889334 ... -0.10360887 -0.10328175
  -0.10426029]
 ...
 [ 0.57602003  0.38451965 -0.07889334 ... -0.10360887 -0.10328175
  -0.10426029]
 [-0.61795939  0.38451965 -0.07889334 ... -0.10360887 -0.10328175
  -0.10426029]
 [-0.61795939 -2.60061808 -0.07889334 ... -0.10360887 -0.10328175
  -0.10426029]]


In [11]:
print(y_train)

[[1. 0.]
 [1. 0.]
 [1. 0.]
 ...
 [1. 0.]
 [1. 0.]
 [1. 0.]]


In [12]:
## Create the model here given the constraints in the problem.
model = Sequential()
model.add(Dense(1000))
model.add(Activation('relu'))
model.add(Dropout(0.1))
model.add(Dense(500))
model.add(Activation('relu'))
model.add(Dropout(0.1))
model.add(Dense(1000))
model.add(Activation('relu'))
model.add(Dense(2))
model.add(Activation('softmax'))

In [13]:
# my_callbacks = [EarlyStopping(monitor='auc_roc', patience=300, verbose=1, mode='max')]

In [14]:
# For a multi-class classification problem
model.compile(loss='mse',
               optimizer='rmsprop', metrics=['accuracy', auc_roc])

In [15]:
# fit = model.fit(x_train, y_train, batch_size=64, epochs=20,
#     verbose=1, callbacks=my_callbacks)
fit = model.fit(x_train, y_train, batch_size=64, epochs=5,
    verbose=1, class_weight=class_weight)

Instructions for updating:
Please switch to tf.metrics.auc. Note that the order of the labels and predictions arguments has been switched.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [16]:
# why don't we take a look at the layers and outputs
# note: `None` in the first dimension means it can take any batch_size!
for i in range(len(model.layers)):
    layer = model.layers[i]
    print(layer)
    print(layer.output_shape)

<keras.layers.core.Dense object at 0x000002208E205860>
(None, 1000)
<keras.layers.core.Activation object at 0x000002208E205518>
(None, 1000)
<keras.layers.core.Dropout object at 0x000002208E205E80>
(None, 1000)
<keras.layers.core.Dense object at 0x000002208E205D30>
(None, 500)
<keras.layers.core.Activation object at 0x000002208E2059E8>
(None, 500)
<keras.layers.core.Dropout object at 0x000002208E205F28>
(None, 500)
<keras.layers.core.Dense object at 0x000002208E205F60>
(None, 1000)
<keras.layers.core.Activation object at 0x000002208E205F98>
(None, 1000)
<keras.layers.core.Dense object at 0x000002208E205C88>
(None, 2)
<keras.layers.core.Activation object at 0x000002208E205470>
(None, 2)


In [17]:
# our model has some # of parameters:
model.count_params()

1370502

In [18]:
## Printing a summary of the layers and weights in the model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1000)              367000    
_________________________________________________________________
activation_1 (Activation)    (None, 1000)              0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 1000)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 500)               500500    
_________________________________________________________________
activation_2 (Activation)    (None, 500)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 500)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1000)              501000    
__________

In [19]:
# note that our model outputs two eval params:
# 1. loss (categorical cross-entropy)
# 2. accuracy
model.metrics_names

['loss', 'acc', 'auc_roc']

In [20]:
## Printing the accuracy of the model, according to the loss function specified in model.compile above.
train_score = model.evaluate(x=x_train, y=y_train, verbose=0)
print('Train score:', train_score[0])
print('Train accuracy:', train_score[1])
print('Train AUC:', train_score[2])

Train score: 0.17230758151070838
Train accuracy: 0.7649670834543006
Train AUC: 0.8125230741291226


In [21]:
y_output_train = model.predict(x_train, batch_size=None, verbose=0, steps=None)

In [22]:
for i in range(len(y_output_train)):
    y_output_train[i] = [i, y_output_train[i][1]]
np.savetxt("2008_train_output.csv", y_output_train)

In [23]:
## Printing the accuracy of the model, according to the loss function specified in model.compile above.
validation_score = model.evaluate(x=x_validation, y=y_validation, verbose=0)
print('Validation score:', validation_score[0])
print('Validation accuracy:', validation_score[1])
print('Validation AUC:', validation_score[2])

Validation score: 0.2283090417851837
Validation accuracy: 0.7401164888469259
Validation AUC: 0.8120945927550034


# Test Output

In [24]:
X_test = load_data("test_2008.csv")
ids = X_test[:,0]

x_test = X[:, 3:-1]
y_test = X[:, -1]

In [25]:
y_test = keras.utils.np_utils.to_categorical(y_test)

In [26]:
# don't forget to NORMALIZE
# std_nonzero_indices = []
for j in range(len(x_test[0])):
    test_std = np.std(x_test[:,j])
    if test_std != 0:
        # std_nonzero_indices.append(j)
        x_test[:,j] = \
            np.divide(x_test[:,j] - np.mean(x_test[:,j]),
                      np.std(x_test[:,j]))

In [27]:
x_test = x_test[:, std_nonzero_indices]

In [28]:
## Printing the accuracy of the model, according to the loss function specified in model.compile above.
test_score = model.evaluate(x=x_test, y=y_test, verbose=0)
print('Test score:', test_score[0])
print('Test accuracy:', test_score[1])
print('Test AUC:', test_score[2])

Test score: 0.22785497065347515
Test accuracy: 0.7446147184799109
Test AUC: 0.7986823468794377


In [29]:
help(model.predict)

Help on method predict in module keras.engine.training:

predict(x, batch_size=None, verbose=0, steps=None) method of keras.engine.sequential.Sequential instance
    Generates output predictions for the input samples.
    
    Computation is done in batches.
    
    # Arguments
        x: The input data, as a Numpy array
            (or list of Numpy arrays if the model has multiple inputs).
        batch_size: Integer. If unspecified, it will default to 32.
        verbose: Verbosity mode, 0 or 1.
        steps: Total number of steps (batches of samples)
            before declaring the prediction round finished.
            Ignored with the default value of `None`.
    
    # Returns
        Numpy array(s) of predictions.
    
    # Raises
        ValueError: In case of mismatch between the provided
            input data and the model's expectations,
            or in case a stateful model receives a number of samples
            that is not a multiple of the batch size.



In [30]:
y_output = model.predict(x_test, batch_size=None, verbose=0, steps=None)

In [31]:
for i in range(len(y_output)):
    y_output[i] = [i, y_output[i][1]]
np.savetxt("2008_submission.csv", y_output)