# CS 155 Miniproject 1 (Random Forest)

In [1]:
import matplotlib.pyplot as plt
import numpy as np

import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import Conv2D, MaxPooling2D, Flatten, BatchNormalization
from keras import regularizers

# For the AUC metric
import tensorflow as tf
from sklearn.metrics import roc_auc_score
from sklearn.datasets import make_classification
from keras.utils import np_utils
from keras.callbacks import Callback, EarlyStopping

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

def load_data(filename, skiprows = 1):
    """
    Function loads data stored in the file filename and returns it as a numpy ndarray.
    
    Inputs:
        filename: given as a string.
        
    Outputs:
        Data contained in the file, returned as a numpy ndarray
    """
    return np.loadtxt(filename, skiprows=skiprows, delimiter=',')

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# define roc_callback, inspired by https://github.com/keras-team/keras/issues/6050#issuecomment-329996505
def auc_roc(y_true, y_pred):
    # any tensorflow metric
    value, update_op = tf.contrib.metrics.streaming_auc(y_pred, y_true)

    # find all variables created for this metric
    metric_vars = [i for i in tf.local_variables() if 'auc_roc' in i.name.split('/')[1]]

    # Add metric variables to GLOBAL_VARIABLES collection.
    # They will be initialized for new session.
    for v in metric_vars:
        tf.add_to_collection(tf.GraphKeys.GLOBAL_VARIABLES, v)

    # force to update metric values
    with tf.control_dependencies([update_op]):
        value = tf.identity(value)
        return value

Load the data and divide it into training and validation sets:

In [3]:
X = load_data("train_2008.csv")
N = len(X)

data = X[:, 3:-1]
label = X[:, -1]

train_percent = 70.
train_size = int(N * train_percent / 100)

random_order = np.random.permutation(np.arange(N))

x_train = data[random_order[0:train_size]]
y_train = label[random_order[0:train_size]]
x_validation = data[random_order[train_size:]]
y_validation = label[random_order[train_size:]]

In [4]:
# print(N)
# print(train_size)
print(np.shape(X))
# print(X)

(64667, 383)


Normalize the Data

In [5]:
print(x_train)

[[  1. 201.   0. ...   0.   0.   0.]
 [  2. 201.   0. ...   0.   0.   0.]
 [  1. 201.   0. ...   0.   0.   0.]
 ...
 [  1. 201.   0. ...   0.   0.   0.]
 [  4. 201.   0. ...   0.   0.   0.]
 [  4. 201.   0. ...   0.   0.   0.]]


In [6]:
# # One-hot encode the labels.
# y_train = keras.utils.np_utils.to_categorical(y_train)
# y_validation = keras.utils.np_utils.to_categorical(y_validation)

# class_weight = {0: np.sum(y_train[:,0]),
#                 1: np.sum(y_train[:,1])}

# print(class_weight)

In [7]:
y_train

array([0., 0., 1., ..., 0., 0., 1.])

In [8]:
# don't forget to NORMALIZE
train_mean_array = np.zeros(len(x_train[0]))
train_std_array = np.zeros(len(x_train[0]))
std_nonzero_indices = []
for j in range(len(x_train[0])):
    train_mean_array[j] = np.mean(x_train[:,j])
    train_std_array[j] = np.std(x_train[:,j])
    if train_std_array[j] != 0:
        std_nonzero_indices.append(j)
        x_train[:,j] = \
            np.divide(x_train[:,j] - train_mean_array[j],
                      train_std_array[j])
    if np.std(x_validation[:,j]) != 0:
        x_validation[:,j] = \
            np.divide(x_validation[:,j] - np.mean(x_validation[:,j]),
                      np.std(x_validation[:,j]))

In [9]:
x_train = x_train[:, std_nonzero_indices]
x_validation = x_validation[:, std_nonzero_indices]

In [10]:
print(x_train)

[[-0.61653788  0.38236159 -0.07875145 ... -0.10382716 -0.10360948
  -0.10458502]
 [ 0.5818563   0.38236159 -0.07875145 ... -0.10382716 -0.10360948
  -0.10458502]
 [-0.61653788  0.38236159 -0.07875145 ... -0.10382716 -0.10360948
  -0.10458502]
 ...
 [-0.61653788  0.38236159 -0.07875145 ... -0.10382716 -0.10360948
  -0.10458502]
 [ 2.97864467  0.38236159 -0.07875145 ... -0.10382716 -0.10360948
  -0.10458502]
 [ 2.97864467  0.38236159 -0.07875145 ... -0.10382716 -0.10360948
  -0.10458502]]


In [11]:
print(y_train)

[0. 0. 1. ... 0. 0. 1.]


# Model Creation/Fitting

In [12]:
# rf = RandomForestRegressor(n_estimators=100, n_jobs=-1)
rf = RandomForestRegressor(n_estimators=1000, n_jobs=-1, max_features=20,
                           max_depth=15)

In [13]:
fit = rf.fit(x_train, y_train)

In [14]:
# our model has some # of parameters:
# model.count_params()

In [15]:
# note that our model outputs two eval params:
# 1. loss (categorical cross-entropy)
# 2. accuracy
# model.metrics_names

In [16]:
y_output_train = rf.predict(x_train)

In [17]:
y_output_train

array([0.06861897, 0.07899453, 0.31805216, ..., 0.17205633, 0.19828698,
       0.47366449])

In [18]:
## Printing the accuracy of the model, according to the loss function specified in model.compile above.
train_score = rf.score(x_train, y_train)
print('Train score:', train_score)
train_auc = roc_auc_score(y_train, y_output_train)
print('Train auc:', train_auc)

Train score: 0.45609601058298277
Train auc: 0.9329123351609676


In [19]:
y_train

array([0., 0., 1., ..., 0., 0., 1.])

In [20]:
y_output_lines = []
for i in range(len(y_output_train)):
    y_output_lines.append([i, y_output_train[i], y_train[i]])
np.savetxt("2008_train_output.csv", y_output_lines, fmt='%d,%f,%f')

In [21]:
## Printing the accuracy of the model, according to the loss function specified in model.compile above.
validation_score = rf.score(x_validation, y_validation)
print('Validation score:', validation_score)
y_output_validation = rf.predict(x_validation)
validation_auc = roc_auc_score(y_validation, y_output_validation)
print('Validation auc:', validation_auc)

Validation score: 0.19017980239312549
Validation auc: 0.7821887294678014


In [22]:
y_output_lines = []
for i in range(len(y_output_validation)):
    y_output_lines.append([i, y_output_validation[i], y_validation[i]])
np.savetxt("2008_validation_output.csv", y_output_lines, fmt='%d,%f,%f')

In [23]:
# np.mean(cross_val_score(rf, data, label, cv=2, scoring="roc_auc"))

In [24]:
# np.mean(cross_val_score(rf, data, label, cv=3, scoring="roc_auc"))

In [25]:
# np.mean(cross_val_score(rf, data, label, cv=4, scoring="roc_auc"))

In [26]:
# np.mean(cross_val_score(rf, data, label, cv=5, scoring="roc_auc"))

In [27]:
# np.mean(cross_val_score(rf, data, label, cv=6, scoring="roc_auc"))

In [28]:
# np.mean(cross_val_score(rf, data, label, cv=7, scoring="roc_auc"))

In [29]:
# np.mean(cross_val_score(rf, data, label, cv=8, scoring="roc_auc"))

In [30]:
# np.mean(cross_val_score(rf, data, label, cv=9, scoring="roc_auc"))

In [31]:
# np.mean(cross_val_score(rf, data, label, cv=10, scoring="roc_auc"))

# Test Output 2008

In [32]:
X_test = load_data("test_2008.csv")
ids = X_test[:,0]

x_test = X_test[:, 3:]
# x_test = X[:, 3:-1]
# y_test = X[:, -1]

In [33]:
# y_test = keras.utils.np_utils.to_categorical(y_test)

In [34]:
# don't forget to NORMALIZE
# std_nonzero_indices = []
for j in range(len(x_test[0])):
    test_std = np.std(x_test[:,j])
    if test_std != 0:
        # std_nonzero_indices.append(j)
        x_test[:,j] = \
            np.divide(x_test[:,j] - np.mean(x_test[:,j]),
                      np.std(x_test[:,j]))

In [35]:
x_test = x_test[:, std_nonzero_indices]

In [36]:
# ## Printing the accuracy of the model, according to the loss function specified in model.compile above.
# test_score = model.evaluate(x=x_test, y=y_test, verbose=0)
# print('Test score:', test_score[0])
# print('Test accuracy:', test_score[1])
# print('Test AUC:', test_score[2])

In [37]:
# help(model.predict)

In [38]:
y_output = rf.predict(x_test)

In [40]:
y_output_lines2 = []
for i in range(len(y_output)):
    y_output_lines2.append([i, y_output[i]])
np.savetxt("2008_submission.csv", y_output_lines2, fmt='%d,%f')

# Test Output 2012

In [41]:
X_test2 = load_data("test_2012.csv")
ids2 = X_test2[:,0]

x_test2 = X_test2[:, 3:]
# x_test = X[:, 3:-1]
# y_test = X[:, -1]

In [42]:
# y_test = keras.utils.np_utils.to_categorical(y_test)

In [43]:
# don't forget to NORMALIZE
# std_nonzero_indices = []
for j in range(len(x_test2[0])):
    test_std = np.std(x_test2[:,j])
    if test_std != 0:
        # std_nonzero_indices.append(j)
        x_test2[:,j] = \
            np.divide(x_test2[:,j] - np.mean(x_test2[:,j]),
                      np.std(x_test2[:,j]))

In [44]:
x_test2 = x_test2[:, std_nonzero_indices]

In [45]:
# ## Printing the accuracy of the model, according to the loss function specified in model.compile above.
# test_score = model.evaluate(x=x_test, y=y_test, verbose=0)
# print('Test score:', test_score[0])
# print('Test accuracy:', test_score[1])
# print('Test AUC:', test_score[2])

In [46]:
# help(model.predict)

In [47]:
y_output2 = rf.predict(x_test2)

In [48]:
y_output_lines3 = []
for i in range(len(y_output2)):
    y_output_lines3.append([i, y_output2[i]])
np.savetxt("2012_submission.csv", y_output_lines3, fmt='%d,%f')