# CS 155 Miniproject 1

In [408]:
import matplotlib.pyplot as plt
import numpy as np

import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import Conv2D, MaxPooling2D, Flatten, BatchNormalization
from keras import regularizers

# For the AUC metric
import tensorflow as tf
from sklearn.metrics import roc_auc_score
from sklearn.datasets import make_classification
from keras.utils import np_utils
from keras.callbacks import Callback, EarlyStopping

def load_data(filename, skiprows = 1):
    """
    Read a comma-delimited numeric text file into a numpy ndarray.

    Inputs:
        filename: path of the file to read, given as a string.
        skiprows: number of leading lines to skip before parsing
            (defaults to 1).

    Outputs:
        The parsed contents of the file, returned as a numpy ndarray.
    """
    contents = np.loadtxt(filename, delimiter=',', skiprows=skiprows)
    return contents

In [409]:
# AUC-ROC metric usable in model.compile(metrics=[...]), inspired by
# https://github.com/keras-team/keras/issues/6050#issuecomment-329996505
def auc_roc(y_true, y_pred):
    """
    Streaming AUC metric built on tf.contrib.metrics.streaming_auc.

    Inputs:
        y_true: tensor of ground-truth labels.
        y_pred: tensor of model predictions.

    Outputs:
        A tensor holding the AUC value; evaluating it also runs the
        metric's update op first.
    """
    # streaming_auc takes predictions first, labels second.
    value, update_op = tf.contrib.metrics.streaming_auc(y_pred, y_true)

    # Find all local variables created for this metric. The second name-scope
    # component is inspected; variables whose name has no '/' separator are
    # skipped instead of raising IndexError.
    metric_vars = []
    for var in tf.local_variables():
        scopes = var.name.split('/')
        if len(scopes) > 1 and 'auc_roc' in scopes[1]:
            metric_vars.append(var)

    # Add metric variables to GLOBAL_VARIABLES collection.
    # They will be initialized for new session.
    for var in metric_vars:
        tf.add_to_collection(tf.GraphKeys.GLOBAL_VARIABLES, var)

    # Force the update op to run before the value is read.
    with tf.control_dependencies([update_op]):
        value = tf.identity(value)
        return value

Load the data and divide it into training and validation sets:

In [410]:
X = load_data("train_2008.csv")
N = len(X)

# Columns 3 .. second-to-last are used as features; the last column is the label.
data = X[:, 3:-1]
label = X[:, -1]

# Fraction of the rows (in file order) used for training; the rest is validation.
train_percent = 70.
train_size = int(N * train_percent / 100)

# Basic slices are views into X. The splits are copied so that the in-place
# standardization done in a later cell cannot silently overwrite X.
x_train = data[:train_size].copy()
y_train = label[:train_size].copy()
x_validation = data[train_size:].copy()
y_validation = label[train_size:].copy()

In [411]:
# Show the (rows, columns) shape of the raw training matrix.
print(X.shape)

(64667, 383)


Normalize the Data

In [412]:
# Preview the training features before any normalization is applied.
print(x_train)

[[  1. 201.   0. ...   0.   0.   0.]
 [  1. 201.   0. ...   0.   0.   0.]
 [  1.   1.   0. ...   0.   0.   0.]
 ...
 [  2. 201.   0. ...   0.   0.   0.]
 [  1. 201.   0. ...   0.   0.   0.]
 [  1.   1.   0. ...   0.   0.   0.]]


In [413]:
# One-hot encode the labels.
# Encoding is done from the raw `label` vector (not from y_train itself) so
# that re-running this cell cannot one-hot an already encoded array, and
# num_classes is pinned so both splits always get two columns.
y_train = keras.utils.np_utils.to_categorical(label[:train_size], num_classes=2)
y_validation = keras.utils.np_utils.to_categorical(label[train_size:], num_classes=2)

# Balanced class weights: n_samples / (n_classes * class_count).
# Using the raw counts as weights would up-weight the majority class, which is
# the opposite of what class_weight is meant to achieve.
class_counts = y_train.sum(axis=0)
class_weight = {
    cls: float(len(y_train) / (len(class_counts) * count)) if count > 0 else 1.0
    for cls, count in enumerate(class_counts)
}

print(class_weight)

In [414]:
# Inspect the one-hot encoded training labels.
y_train

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [415]:
# Standardize every feature column to zero mean / unit variance.
# The statistics come from the TRAINING split only and are reused for the
# validation split, so both are on the same scale and no validation
# information leaks into the preprocessing.
x_train = x_train.astype(float)            # astype copies: the source arrays stay untouched
x_validation = x_validation.astype(float)

train_mean_array = np.mean(x_train, axis=0)
train_std_array = np.std(x_train, axis=0)

# Columns that are constant in the training split (std == 0) cannot be
# standardized; they are left as-is here and dropped by the next cell.
std_nonzero_indices = np.flatnonzero(train_std_array != 0).tolist()

scaled_cols = np.asarray(std_nonzero_indices, dtype=int)
x_train[:, scaled_cols] = \
    (x_train[:, scaled_cols] - train_mean_array[scaled_cols]) / train_std_array[scaled_cols]
x_validation[:, scaled_cols] = \
    (x_validation[:, scaled_cols] - train_mean_array[scaled_cols]) / train_std_array[scaled_cols]

  del sys.path[0]


In [416]:
# Keep only the feature columns whose training std was non-zero.
kept_columns = std_nonzero_indices
x_train, x_validation = x_train[:, kept_columns], x_validation[:, kept_columns]

In [417]:
# Preview the training features after standardization and column selection.
print(x_train)

[[-0.61795939  0.38451965 -0.07889334 ... -0.10360887 -0.10328175
  -0.10426029]
 [-0.61795939  0.38451965 -0.07889334 ... -0.10360887 -0.10328175
  -0.10426029]
 [-0.61795939 -2.60061808 -0.07889334 ... -0.10360887 -0.10328175
  -0.10426029]
 ...
 [ 0.57602003  0.38451965 -0.07889334 ... -0.10360887 -0.10328175
  -0.10426029]
 [-0.61795939  0.38451965 -0.07889334 ... -0.10360887 -0.10328175
  -0.10426029]
 [-0.61795939 -2.60061808 -0.07889334 ... -0.10360887 -0.10328175
  -0.10426029]]


In [418]:
# Preview the one-hot encoded training labels.
print(y_train)

[[1. 0.]
 [1. 0.]
 [1. 0.]
 ...
 [1. 0.]
 [1. 0.]
 [1. 0.]]


In [419]:
## Fully connected network: 1000 -> 500 -> 1000 ReLU units with light dropout
## after the first two hidden layers, ending in a 2-way softmax.
model = Sequential([
    Dense(1000),
    Activation('relu'),
    Dropout(0.1),
    Dense(500),
    Activation('relu'),
    Dropout(0.1),
    Dense(1000),
    Activation('relu'),
    Dense(2),
    Activation('softmax'),
])

In [420]:
# my_callbacks = [EarlyStopping(monitor='auc_roc', patience=300, verbose=1, mode='max')]

In [421]:
# Compile with mean-squared-error loss and RMSprop; accuracy and the custom
# auc_roc metric are reported alongside the loss.
# NOTE(review): categorical cross-entropy is the usual loss for a softmax
# output -- confirm that 'mse' is intentional.
model.compile(
    optimizer='rmsprop',
    loss='mse',
    metrics=['accuracy', auc_roc],
)

In [422]:
# Train for 5 epochs in mini-batches of 64, weighting the loss per class.
# (No callbacks are passed; the early-stopping list above is commented out.)
fit = model.fit(
    x=x_train,
    y=y_train,
    batch_size=64,
    epochs=5,
    verbose=1,
    class_weight=class_weight,
)

NameError: name 'class_weight' is not defined

In [None]:
# Walk through the layers and print each one with its output shape.
# note: `None` in the first dimension means it can take any batch_size!
for layer in model.layers:
    print(layer)
    print(layer.output_shape)

In [None]:
# Display the model's parameter count as reported by count_params().
model.count_params()

In [None]:
## Print Keras' summary of the model.
model.summary()

In [None]:
# The model was compiled with loss='mse' and metrics=['accuracy', auc_roc],
# so evaluate() reports three values, in this order:
# 1. loss (mean squared error)
# 2. accuracy
# 3. auc_roc
model.metrics_names

In [None]:
## Evaluate on the training split and report loss, accuracy and AUC
## (the order of the values follows model.compile above).
train_score = model.evaluate(x_train, y_train, verbose=0)
train_loss, train_accuracy, train_auc = train_score[0], train_score[1], train_score[2]
print('Train score:', train_loss)
print('Train accuracy:', train_accuracy)
print('Train AUC:', train_auc)

In [None]:
# Class probabilities for every training row (batch_size/steps left at their None defaults).
y_output_train = model.predict(x_train, verbose=0)

In [None]:
# Write one "row_index,probability" line per training row, where the
# probability is the model's output for class 1 (second softmax column).
# The file is named .csv, so it is written comma-delimited, with the index as
# an integer rather than a float32 in scientific notation.
train_row_index = np.arange(len(y_output_train))
train_table = np.column_stack((train_row_index, y_output_train[:, 1]))
np.savetxt("2008_train_output.csv", train_table,
           delimiter=',', fmt=['%d', '%.18e'])

In [None]:
## Evaluate on the validation split and report loss, accuracy and AUC
## (the order of the values follows model.compile above).
validation_score = model.evaluate(x_validation, y_validation, verbose=0)
validation_loss, validation_accuracy, validation_auc = \
    validation_score[0], validation_score[1], validation_score[2]
print('Validation score:', validation_loss)
print('Validation accuracy:', validation_accuracy)
print('Validation AUC:', validation_auc)

# Test Output

In [None]:
X_test = load_data("test_2008.csv")
ids = X_test[:,0]

# The slices below mirror the training cell (features in columns 3..-2, label
# in the last column), so the test file must have the same column layout.
# NOTE(review): confirm test_2008.csv really carries a label column.
if X_test.shape[1] != X.shape[1]:
    raise ValueError(
        "test_2008.csv has %d columns but train_2008.csv has %d; "
        "the feature/label slices would not line up"
        % (X_test.shape[1], X.shape[1]))

# Slice the TEST matrix (the original sliced the training matrix X here).
x_test = X_test[:, 3:-1]
y_test = X_test[:, -1]

In [None]:
# One-hot encode the test labels.
y_test = np_utils.to_categorical(y_test)

In [None]:
# Standardize the test features with the TRAINING mean/std, so the test data
# goes through exactly the same transformation the model was trained on
# (re-estimating the statistics on the test set would shift the inputs).
x_test = x_test.astype(float)   # astype copies: the source matrix stays untouched

# Only the columns kept for the model (non-zero training std) are scaled.
test_scaled_cols = np.asarray(std_nonzero_indices, dtype=int)
x_test[:, test_scaled_cols] = \
    (x_test[:, test_scaled_cols] - train_mean_array[test_scaled_cols]) \
    / train_std_array[test_scaled_cols]

In [None]:
# Keep the same feature columns that were kept for the training data.
x_test = x_test[:, std_nonzero_indices]

In [None]:
## Evaluate on the test data and report loss, accuracy and AUC
## (the order of the values follows model.compile above).
test_score = model.evaluate(x_test, y_test, verbose=0)
test_loss, test_accuracy, test_auc = test_score[0], test_score[1], test_score[2]
print('Test score:', test_loss)
print('Test accuracy:', test_accuracy)
print('Test AUC:', test_auc)

In [None]:
# Interactive lookup of the predict() documentation; not needed for the results.
help(model.predict)

In [None]:
# Class probabilities for every test row (batch_size/steps left at their None defaults).
y_output = model.predict(x_test, verbose=0)

In [None]:
# Write one "id,probability" line per prediction, where the probability is the
# model's output for class 1 (second softmax column).
# Use the ids read from the test file when they line up with the predictions;
# otherwise fall back to the positional row index.
if len(ids) == len(y_output):
    submission_ids = ids
else:
    submission_ids = np.arange(len(y_output))

# The file is named .csv, so it is written comma-delimited with integer ids.
submission_table = np.column_stack((submission_ids, y_output[:, 1]))
np.savetxt("2008_submission.csv", submission_table,
           delimiter=',', fmt=['%d', '%.18e'])